1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "bcachefs.h" 4 #include "bkey_buf.h" 5 #include "btree_cache.h" 6 #include "btree_update.h" 7 #include "buckets.h" 8 #include "darray.h" 9 #include "dirent.h" 10 #include "error.h" 11 #include "fs.h" 12 #include "fs-common.h" 13 #include "fsck.h" 14 #include "inode.h" 15 #include "keylist.h" 16 #include "recovery_passes.h" 17 #include "snapshot.h" 18 #include "super.h" 19 #include "xattr.h" 20 21 #include <linux/bsearch.h> 22 #include <linux/dcache.h> /* struct qstr */ 23 24 static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, 25 struct bkey_s_c_dirent d) 26 { 27 return inode->bi_dir == d.k->p.inode && 28 inode->bi_dir_offset == d.k->p.offset; 29 } 30 31 static bool dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d, 32 struct bch_inode_unpacked *inode) 33 { 34 if (d.v->d_type == DT_SUBVOL 35 ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol 36 : le64_to_cpu(d.v->d_inum) == inode->bi_inum) 37 return 0; 38 return -BCH_ERR_ENOENT_dirent_doesnt_match_inode; 39 } 40 41 static void dirent_inode_mismatch_msg(struct printbuf *out, 42 struct bch_fs *c, 43 struct bkey_s_c_dirent dirent, 44 struct bch_inode_unpacked *inode) 45 { 46 prt_str(out, "inode points to dirent that does not point back:"); 47 prt_newline(out); 48 bch2_bkey_val_to_text(out, c, dirent.s_c); 49 prt_newline(out); 50 bch2_inode_unpacked_to_text(out, inode); 51 } 52 53 static int dirent_points_to_inode(struct bch_fs *c, 54 struct bkey_s_c_dirent dirent, 55 struct bch_inode_unpacked *inode) 56 { 57 int ret = dirent_points_to_inode_nowarn(dirent, inode); 58 if (ret) { 59 struct printbuf buf = PRINTBUF; 60 dirent_inode_mismatch_msg(&buf, c, dirent, inode); 61 bch_warn(c, "%s", buf.buf); 62 printbuf_exit(&buf); 63 } 64 return ret; 65 } 66 67 /* 68 * XXX: this is handling transaction restarts without returning 69 * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore: 70 */ 71 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum, 72 u32 snapshot) 73 { 74 u64 sectors = 0; 75 76 int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_extents, 77 SPOS(inum, 0, snapshot), 78 POS(inum, U64_MAX), 79 0, k, ({ 80 if (bkey_extent_is_allocation(k.k)) 81 sectors += k.k->size; 82 0; 83 })); 84 85 return ret ?: sectors; 86 } 87 88 static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum, 89 u32 snapshot) 90 { 91 u64 subdirs = 0; 92 93 int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_dirents, 94 SPOS(inum, 0, snapshot), 95 POS(inum, U64_MAX), 96 0, k, ({ 97 if (k.k->type == KEY_TYPE_dirent && 98 bkey_s_c_to_dirent(k).v->d_type == DT_DIR) 99 subdirs++; 100 0; 101 })); 102 103 return ret ?: subdirs; 104 } 105 106 static int subvol_lookup(struct btree_trans *trans, u32 subvol, 107 u32 *snapshot, u64 *inum) 108 { 109 struct bch_subvolume s; 110 int ret = bch2_subvolume_get(trans, subvol, false, 0, &s); 111 112 *snapshot = le32_to_cpu(s.snapshot); 113 *inum = le64_to_cpu(s.inode); 114 return ret; 115 } 116 117 static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, 118 struct bch_inode_unpacked *inode) 119 { 120 struct btree_iter iter; 121 struct bkey_s_c k; 122 int ret; 123 124 for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr), 125 BTREE_ITER_all_snapshots, k, ret) { 126 if (k.k->p.offset != inode_nr) 127 break; 128 if (!bkey_is_inode(k.k)) 129 continue; 130 ret = bch2_inode_unpack(k, inode); 131 goto found; 132 } 133 ret = -BCH_ERR_ENOENT_inode; 134 found: 135 bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); 136 bch2_trans_iter_exit(trans, &iter); 137 return ret; 138 } 139 140 static int lookup_inode(struct btree_trans *trans, u64 inode_nr, 141 struct bch_inode_unpacked *inode, 142 u32 *snapshot) 143 { 144 struct btree_iter iter; 145 struct bkey_s_c k; 146 int ret; 147 148 k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, 149 SPOS(0, inode_nr, *snapshot), 0); 150 ret = bkey_err(k); 151 if (ret) 152 goto err; 153 154 ret = bkey_is_inode(k.k) 155 ? bch2_inode_unpack(k, inode) 156 : -BCH_ERR_ENOENT_inode; 157 if (!ret) 158 *snapshot = iter.pos.snapshot; 159 err: 160 bch2_trans_iter_exit(trans, &iter); 161 return ret; 162 } 163 164 static int lookup_dirent_in_snapshot(struct btree_trans *trans, 165 struct bch_hash_info hash_info, 166 subvol_inum dir, struct qstr *name, 167 u64 *target, unsigned *type, u32 snapshot) 168 { 169 struct btree_iter iter; 170 struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc, 171 &hash_info, dir, name, 0, snapshot); 172 int ret = bkey_err(k); 173 if (ret) 174 return ret; 175 176 struct bkey_s_c_dirent d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter)); 177 *target = le64_to_cpu(d.v->d_inum); 178 *type = d.v->d_type; 179 bch2_trans_iter_exit(trans, &iter); 180 return 0; 181 } 182 183 static int __remove_dirent(struct btree_trans *trans, struct bpos pos) 184 { 185 struct bch_fs *c = trans->c; 186 struct btree_iter iter; 187 struct bch_inode_unpacked dir_inode; 188 struct bch_hash_info dir_hash_info; 189 int ret; 190 191 ret = lookup_first_inode(trans, pos.inode, &dir_inode); 192 if (ret) 193 goto err; 194 195 dir_hash_info = bch2_hash_info_init(c, &dir_inode); 196 197 bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent); 198 199 ret = bch2_btree_iter_traverse(&iter) ?: 200 bch2_hash_delete_at(trans, bch2_dirent_hash_desc, 201 &dir_hash_info, &iter, 202 BTREE_UPDATE_internal_snapshot_node); 203 bch2_trans_iter_exit(trans, &iter); 204 err: 205 bch_err_fn(c, ret); 206 return ret; 207 } 208 209 /* Get lost+found, create if it doesn't exist: */ 210 static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, 211 struct bch_inode_unpacked *lostfound, 212 u64 reattaching_inum) 213 { 214 struct bch_fs *c = trans->c; 215 struct qstr lostfound_str = QSTR("lost+found"); 216 u64 inum = 0; 217 unsigned d_type = 0; 218 int ret; 219 220 struct bch_snapshot_tree st; 221 ret = bch2_snapshot_tree_lookup(trans, 222 bch2_snapshot_tree(c, snapshot), &st); 223 if (ret) 224 return ret; 225 226 subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) }; 227 228 struct bch_subvolume subvol; 229 ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol), 230 false, 0, &subvol); 231 bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u", 232 le32_to_cpu(st.master_subvol), snapshot); 233 if (ret) 234 return ret; 235 236 if (!subvol.inode) { 237 struct btree_iter iter; 238 struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter, 239 BTREE_ID_subvolumes, POS(0, le32_to_cpu(st.master_subvol)), 240 0, subvolume); 241 ret = PTR_ERR_OR_ZERO(subvol); 242 if (ret) 243 return ret; 244 245 subvol->v.inode = cpu_to_le64(reattaching_inum); 246 bch2_trans_iter_exit(trans, &iter); 247 } 248 249 root_inum.inum = le64_to_cpu(subvol.inode); 250 251 struct bch_inode_unpacked root_inode; 252 struct bch_hash_info root_hash_info; 253 u32 root_inode_snapshot = snapshot; 254 ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot); 255 bch_err_msg(c, ret, "looking up root inode %llu for subvol %u", 256 root_inum.inum, le32_to_cpu(st.master_subvol)); 257 if (ret) 258 return ret; 259 260 root_hash_info = bch2_hash_info_init(c, &root_inode); 261 262 ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum, 263 &lostfound_str, &inum, &d_type, snapshot); 264 if (bch2_err_matches(ret, ENOENT)) 265 goto create_lostfound; 266 267 bch_err_fn(c, ret); 268 if (ret) 269 return ret; 270 271 if (d_type != DT_DIR) { 272 bch_err(c, "error looking up lost+found: not a directory"); 273 return -BCH_ERR_ENOENT_not_directory; 274 } 275 276 /* 277 * The bch2_check_dirents pass has already run, dangling dirents 278 * shouldn't exist here: 279 */ 280 ret = lookup_inode(trans, inum, lostfound, &snapshot); 281 bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)", 282 inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot)); 283 return ret; 284 285 create_lostfound: 286 /* 287 * XXX: we could have a nicer log message here if we had a nice way to 288 * walk backpointers to print a path 289 */ 290 bch_notice(c, "creating lost+found in snapshot %u", le32_to_cpu(st.root_snapshot)); 291 292 u64 now = bch2_current_time(c); 293 struct btree_iter lostfound_iter = { NULL }; 294 u64 cpu = raw_smp_processor_id(); 295 296 bch2_inode_init_early(c, lostfound); 297 bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode); 298 lostfound->bi_dir = root_inode.bi_inum; 299 300 root_inode.bi_nlink++; 301 302 ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu); 303 if (ret) 304 goto err; 305 306 bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot); 307 ret = bch2_btree_iter_traverse(&lostfound_iter); 308 if (ret) 309 goto err; 310 311 ret = bch2_dirent_create_snapshot(trans, 312 0, root_inode.bi_inum, snapshot, &root_hash_info, 313 mode_to_type(lostfound->bi_mode), 314 &lostfound_str, 315 lostfound->bi_inum, 316 &lostfound->bi_dir_offset, 317 STR_HASH_must_create) ?: 318 bch2_inode_write_flags(trans, &lostfound_iter, lostfound, 319 BTREE_UPDATE_internal_snapshot_node); 320 err: 321 bch_err_msg(c, ret, "creating lost+found"); 322 bch2_trans_iter_exit(trans, &lostfound_iter); 323 return ret; 324 } 325 326 static int reattach_inode(struct btree_trans *trans, 327 struct bch_inode_unpacked *inode, 328 u32 inode_snapshot) 329 { 330 struct bch_fs *c = trans->c; 331 struct bch_hash_info dir_hash; 332 struct bch_inode_unpacked lostfound; 333 char name_buf[20]; 334 struct qstr name; 335 u64 dir_offset = 0; 336 u32 dirent_snapshot = inode_snapshot; 337 int ret; 338 339 if (inode->bi_subvol) { 340 inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL; 341 342 u64 root_inum; 343 ret = subvol_lookup(trans, inode->bi_parent_subvol, 344 &dirent_snapshot, &root_inum); 345 if (ret) 346 return ret; 347 348 snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol); 349 } else { 350 snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); 351 } 352 353 ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum); 354 if (ret) 355 return ret; 356 357 if (S_ISDIR(inode->bi_mode)) { 358 lostfound.bi_nlink++; 359 360 ret = __bch2_fsck_write_inode(trans, &lostfound, U32_MAX); 361 if (ret) 362 return ret; 363 } 364 365 dir_hash = bch2_hash_info_init(c, &lostfound); 366 367 name = (struct qstr) QSTR(name_buf); 368 369 ret = bch2_dirent_create_snapshot(trans, 370 inode->bi_parent_subvol, lostfound.bi_inum, 371 dirent_snapshot, 372 &dir_hash, 373 inode_d_type(inode), 374 &name, 375 inode->bi_subvol ?: inode->bi_inum, 376 &dir_offset, 377 STR_HASH_must_create); 378 if (ret) { 379 bch_err_msg(c, ret, "error creating dirent"); 380 return ret; 381 } 382 383 inode->bi_dir = lostfound.bi_inum; 384 inode->bi_dir_offset = dir_offset; 385 386 return __bch2_fsck_write_inode(trans, inode, inode_snapshot); 387 } 388 389 static int remove_backpointer(struct btree_trans *trans, 390 struct bch_inode_unpacked *inode) 391 { 392 if (!inode->bi_dir) 393 return 0; 394 395 struct bch_fs *c = trans->c; 396 struct btree_iter iter; 397 struct bkey_s_c_dirent d = 398 bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents, 399 SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot), 0, 400 dirent); 401 int ret = bkey_err(d) ?: 402 dirent_points_to_inode(c, d, inode) ?: 403 __remove_dirent(trans, d.k->p); 404 bch2_trans_iter_exit(trans, &iter); 405 return ret; 406 } 407 408 static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s) 409 { 410 struct bch_fs *c = trans->c; 411 412 struct bch_inode_unpacked inode; 413 int ret = bch2_inode_find_by_inum_trans(trans, 414 (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) }, 415 &inode); 416 if (ret) 417 return ret; 418 419 ret = remove_backpointer(trans, &inode); 420 if (!bch2_err_matches(ret, ENOENT)) 421 bch_err_msg(c, ret, "removing dirent"); 422 if (ret) 423 return ret; 424 425 ret = reattach_inode(trans, &inode, le32_to_cpu(s.v->snapshot)); 426 bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); 427 return ret; 428 } 429 430 static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum) 431 { 432 struct bch_fs *c = trans->c; 433 434 if (!bch2_snapshot_is_leaf(c, snapshotid)) { 435 bch_err(c, "need to reconstruct subvol, but have interior node snapshot"); 436 return -BCH_ERR_fsck_repair_unimplemented; 437 } 438 439 /* 440 * If inum isn't set, that means we're being called from check_dirents, 441 * not check_inodes - the root of this subvolume doesn't exist or we 442 * would have found it there: 443 */ 444 if (!inum) { 445 struct btree_iter inode_iter = {}; 446 struct bch_inode_unpacked new_inode; 447 u64 cpu = raw_smp_processor_id(); 448 449 bch2_inode_init_early(c, &new_inode); 450 bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL); 451 452 new_inode.bi_subvol = subvolid; 453 454 int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?: 455 bch2_btree_iter_traverse(&inode_iter) ?: 456 bch2_inode_write(trans, &inode_iter, &new_inode); 457 bch2_trans_iter_exit(trans, &inode_iter); 458 if (ret) 459 return ret; 460 461 inum = new_inode.bi_inum; 462 } 463 464 bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum); 465 466 struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol)); 467 int ret = PTR_ERR_OR_ZERO(new_subvol); 468 if (ret) 469 return ret; 470 471 bkey_subvolume_init(&new_subvol->k_i); 472 new_subvol->k.p.offset = subvolid; 473 new_subvol->v.snapshot = cpu_to_le32(snapshotid); 474 new_subvol->v.inode = cpu_to_le64(inum); 475 ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0); 476 if (ret) 477 return ret; 478 479 struct btree_iter iter; 480 struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter, 481 BTREE_ID_snapshots, POS(0, snapshotid), 482 0, snapshot); 483 ret = PTR_ERR_OR_ZERO(s); 484 bch_err_msg(c, ret, "getting snapshot %u", snapshotid); 485 if (ret) 486 return ret; 487 488 u32 snapshot_tree = le32_to_cpu(s->v.tree); 489 490 s->v.subvol = cpu_to_le32(subvolid); 491 SET_BCH_SNAPSHOT_SUBVOL(&s->v, true); 492 bch2_trans_iter_exit(trans, &iter); 493 494 struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter, 495 BTREE_ID_snapshot_trees, POS(0, snapshot_tree), 496 0, snapshot_tree); 497 ret = PTR_ERR_OR_ZERO(st); 498 bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree); 499 if (ret) 500 return ret; 501 502 if (!st->v.master_subvol) 503 st->v.master_subvol = cpu_to_le32(subvolid); 504 505 bch2_trans_iter_exit(trans, &iter); 506 return 0; 507 } 508 509 static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum) 510 { 511 struct bch_fs *c = trans->c; 512 unsigned i_mode = S_IFREG; 513 u64 i_size = 0; 514 515 switch (btree) { 516 case BTREE_ID_extents: { 517 struct btree_iter iter = {}; 518 519 bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0); 520 struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter); 521 bch2_trans_iter_exit(trans, &iter); 522 int ret = bkey_err(k); 523 if (ret) 524 return ret; 525 526 i_size = k.k->p.offset << 9; 527 break; 528 } 529 case BTREE_ID_dirents: 530 i_mode = S_IFDIR; 531 break; 532 case BTREE_ID_xattrs: 533 break; 534 default: 535 BUG(); 536 } 537 538 struct bch_inode_unpacked new_inode; 539 bch2_inode_init_early(c, &new_inode); 540 bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL); 541 new_inode.bi_size = i_size; 542 new_inode.bi_inum = inum; 543 544 return __bch2_fsck_write_inode(trans, &new_inode, snapshot); 545 } 546 547 struct snapshots_seen { 548 struct bpos pos; 549 snapshot_id_list ids; 550 }; 551 552 static inline void snapshots_seen_exit(struct snapshots_seen *s) 553 { 554 darray_exit(&s->ids); 555 } 556 557 static inline void snapshots_seen_init(struct snapshots_seen *s) 558 { 559 memset(s, 0, sizeof(*s)); 560 } 561 562 static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id) 563 { 564 u32 *i; 565 __darray_for_each(s->ids, i) { 566 if (*i == id) 567 return 0; 568 if (*i > id) 569 break; 570 } 571 572 int ret = darray_insert_item(&s->ids, i - s->ids.data, id); 573 if (ret) 574 bch_err(c, "error reallocating snapshots_seen table (size %zu)", 575 s->ids.size); 576 return ret; 577 } 578 579 static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, 580 enum btree_id btree_id, struct bpos pos) 581 { 582 if (!bkey_eq(s->pos, pos)) 583 s->ids.nr = 0; 584 s->pos = pos; 585 586 return snapshot_list_add_nodup(c, &s->ids, pos.snapshot); 587 } 588 589 /** 590 * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor, 591 * and @ancestor hasn't been overwritten in @seen 592 * 593 * @c: filesystem handle 594 * @seen: list of snapshot ids already seen at current position 595 * @id: descendent snapshot id 596 * @ancestor: ancestor snapshot id 597 * 598 * Returns: whether key in @ancestor snapshot is visible in @id snapshot 599 */ 600 static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen, 601 u32 id, u32 ancestor) 602 { 603 ssize_t i; 604 605 EBUG_ON(id > ancestor); 606 607 /* @ancestor should be the snapshot most recently added to @seen */ 608 EBUG_ON(ancestor != seen->pos.snapshot); 609 EBUG_ON(ancestor != darray_last(seen->ids)); 610 611 if (id == ancestor) 612 return true; 613 614 if (!bch2_snapshot_is_ancestor(c, id, ancestor)) 615 return false; 616 617 /* 618 * We know that @id is a descendant of @ancestor, we're checking if 619 * we've seen a key that overwrote @ancestor - i.e. also a descendent of 620 * @ascestor and with @id as a descendent. 621 * 622 * But we already know that we're scanning IDs between @id and @ancestor 623 * numerically, since snapshot ID lists are kept sorted, so if we find 624 * an id that's an ancestor of @id we're done: 625 */ 626 627 for (i = seen->ids.nr - 2; 628 i >= 0 && seen->ids.data[i] >= id; 629 --i) 630 if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i])) 631 return false; 632 633 return true; 634 } 635 636 /** 637 * ref_visible - given a key with snapshot id @src that points to a key with 638 * snapshot id @dst, test whether there is some snapshot in which @dst is 639 * visible. 640 * 641 * @c: filesystem handle 642 * @s: list of snapshot IDs already seen at @src 643 * @src: snapshot ID of src key 644 * @dst: snapshot ID of dst key 645 * Returns: true if there is some snapshot in which @dst is visible 646 * 647 * Assumes we're visiting @src keys in natural key order 648 */ 649 static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s, 650 u32 src, u32 dst) 651 { 652 return dst <= src 653 ? key_visible_in_snapshot(c, s, dst, src) 654 : bch2_snapshot_is_ancestor(c, src, dst); 655 } 656 657 static int ref_visible2(struct bch_fs *c, 658 u32 src, struct snapshots_seen *src_seen, 659 u32 dst, struct snapshots_seen *dst_seen) 660 { 661 if (dst > src) { 662 swap(dst, src); 663 swap(dst_seen, src_seen); 664 } 665 return key_visible_in_snapshot(c, src_seen, dst, src); 666 } 667 668 #define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ 669 for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \ 670 (_i)->snapshot <= (_snapshot); _i++) \ 671 if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot)) 672 673 struct inode_walker_entry { 674 struct bch_inode_unpacked inode; 675 u32 snapshot; 676 u64 count; 677 }; 678 679 struct inode_walker { 680 bool first_this_inode; 681 bool have_inodes; 682 bool recalculate_sums; 683 struct bpos last_pos; 684 685 DARRAY(struct inode_walker_entry) inodes; 686 }; 687 688 static void inode_walker_exit(struct inode_walker *w) 689 { 690 darray_exit(&w->inodes); 691 } 692 693 static struct inode_walker inode_walker_init(void) 694 { 695 return (struct inode_walker) { 0, }; 696 } 697 698 static int add_inode(struct bch_fs *c, struct inode_walker *w, 699 struct bkey_s_c inode) 700 { 701 struct bch_inode_unpacked u; 702 703 BUG_ON(bch2_inode_unpack(inode, &u)); 704 705 return darray_push(&w->inodes, ((struct inode_walker_entry) { 706 .inode = u, 707 .snapshot = inode.k->p.snapshot, 708 })); 709 } 710 711 static int get_inodes_all_snapshots(struct btree_trans *trans, 712 struct inode_walker *w, u64 inum) 713 { 714 struct bch_fs *c = trans->c; 715 struct btree_iter iter; 716 struct bkey_s_c k; 717 int ret; 718 719 /* 720 * We no longer have inodes for w->last_pos; clear this to avoid 721 * screwing up check_i_sectors/check_subdir_count if we take a 722 * transaction restart here: 723 */ 724 w->have_inodes = false; 725 w->recalculate_sums = false; 726 w->inodes.nr = 0; 727 728 for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), 729 BTREE_ITER_all_snapshots, k, ret) { 730 if (k.k->p.offset != inum) 731 break; 732 733 if (bkey_is_inode(k.k)) 734 add_inode(c, w, k); 735 } 736 bch2_trans_iter_exit(trans, &iter); 737 738 if (ret) 739 return ret; 740 741 w->first_this_inode = true; 742 w->have_inodes = true; 743 return 0; 744 } 745 746 static struct inode_walker_entry * 747 lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k) 748 { 749 bool is_whiteout = k.k->type == KEY_TYPE_whiteout; 750 751 struct inode_walker_entry *i; 752 __darray_for_each(w->inodes, i) 753 if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot)) 754 goto found; 755 756 return NULL; 757 found: 758 BUG_ON(k.k->p.snapshot > i->snapshot); 759 760 if (k.k->p.snapshot != i->snapshot && !is_whiteout) { 761 struct inode_walker_entry new = *i; 762 763 new.snapshot = k.k->p.snapshot; 764 new.count = 0; 765 766 struct printbuf buf = PRINTBUF; 767 bch2_bkey_val_to_text(&buf, c, k); 768 769 bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n" 770 "unexpected because we should always update the inode when we update a key in that inode\n" 771 "%s", 772 w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf); 773 printbuf_exit(&buf); 774 775 while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot) 776 --i; 777 778 size_t pos = i - w->inodes.data; 779 int ret = darray_insert_item(&w->inodes, pos, new); 780 if (ret) 781 return ERR_PTR(ret); 782 783 i = w->inodes.data + pos; 784 } 785 786 return i; 787 } 788 789 static struct inode_walker_entry *walk_inode(struct btree_trans *trans, 790 struct inode_walker *w, 791 struct bkey_s_c k) 792 { 793 if (w->last_pos.inode != k.k->p.inode) { 794 int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode); 795 if (ret) 796 return ERR_PTR(ret); 797 } 798 799 w->last_pos = k.k->p; 800 801 return lookup_inode_for_snapshot(trans->c, w, k); 802 } 803 804 static int get_visible_inodes(struct btree_trans *trans, 805 struct inode_walker *w, 806 struct snapshots_seen *s, 807 u64 inum) 808 { 809 struct bch_fs *c = trans->c; 810 struct btree_iter iter; 811 struct bkey_s_c k; 812 int ret; 813 814 w->inodes.nr = 0; 815 816 for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), 817 BTREE_ITER_all_snapshots, k, ret) { 818 if (k.k->p.offset != inum) 819 break; 820 821 if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) 822 continue; 823 824 if (bkey_is_inode(k.k)) 825 add_inode(c, w, k); 826 827 if (k.k->p.snapshot >= s->pos.snapshot) 828 break; 829 } 830 bch2_trans_iter_exit(trans, &iter); 831 832 return ret; 833 } 834 835 static int hash_redo_key(struct btree_trans *trans, 836 const struct bch_hash_desc desc, 837 struct bch_hash_info *hash_info, 838 struct btree_iter *k_iter, struct bkey_s_c k) 839 { 840 struct bkey_i *delete; 841 struct bkey_i *tmp; 842 843 delete = bch2_trans_kmalloc(trans, sizeof(*delete)); 844 if (IS_ERR(delete)) 845 return PTR_ERR(delete); 846 847 tmp = bch2_bkey_make_mut_noupdate(trans, k); 848 if (IS_ERR(tmp)) 849 return PTR_ERR(tmp); 850 851 bkey_init(&delete->k); 852 delete->k.p = k_iter->pos; 853 return bch2_btree_iter_traverse(k_iter) ?: 854 bch2_trans_update(trans, k_iter, delete, 0) ?: 855 bch2_hash_set_in_snapshot(trans, desc, hash_info, 856 (subvol_inum) { 0, k.k->p.inode }, 857 k.k->p.snapshot, tmp, 858 STR_HASH_must_create| 859 BTREE_UPDATE_internal_snapshot_node) ?: 860 bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); 861 } 862 863 static int hash_check_key(struct btree_trans *trans, 864 const struct bch_hash_desc desc, 865 struct bch_hash_info *hash_info, 866 struct btree_iter *k_iter, struct bkey_s_c hash_k) 867 { 868 struct bch_fs *c = trans->c; 869 struct btree_iter iter = { NULL }; 870 struct printbuf buf = PRINTBUF; 871 struct bkey_s_c k; 872 u64 hash; 873 int ret = 0; 874 875 if (hash_k.k->type != desc.key_type) 876 return 0; 877 878 hash = desc.hash_bkey(hash_info, hash_k); 879 880 if (likely(hash == hash_k.k->p.offset)) 881 return 0; 882 883 if (hash_k.k->p.offset < hash) 884 goto bad_hash; 885 886 for_each_btree_key_norestart(trans, iter, desc.btree_id, 887 SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), 888 BTREE_ITER_slots, k, ret) { 889 if (bkey_eq(k.k->p, hash_k.k->p)) 890 break; 891 892 if (fsck_err_on(k.k->type == desc.key_type && 893 !desc.cmp_bkey(k, hash_k), 894 trans, hash_table_key_duplicate, 895 "duplicate hash table keys:\n%s", 896 (printbuf_reset(&buf), 897 bch2_bkey_val_to_text(&buf, c, hash_k), 898 buf.buf))) { 899 ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1; 900 break; 901 } 902 903 if (bkey_deleted(k.k)) { 904 bch2_trans_iter_exit(trans, &iter); 905 goto bad_hash; 906 } 907 } 908 out: 909 bch2_trans_iter_exit(trans, &iter); 910 printbuf_exit(&buf); 911 return ret; 912 bad_hash: 913 if (fsck_err(trans, hash_table_key_wrong_offset, 914 "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s", 915 bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, 916 (printbuf_reset(&buf), 917 bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { 918 ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); 919 bch_err_fn(c, ret); 920 if (ret) 921 return ret; 922 ret = -BCH_ERR_transaction_restart_nested; 923 } 924 fsck_err: 925 goto out; 926 } 927 928 static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, 929 struct btree_iter *iter, 930 struct bpos pos) 931 { 932 return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); 933 } 934 935 static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans, 936 struct btree_iter *iter, 937 struct bch_inode_unpacked *inode, 938 u32 *snapshot) 939 { 940 if (inode->bi_subvol) { 941 u64 inum; 942 int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum); 943 if (ret) 944 return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) }); 945 } 946 947 return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot)); 948 } 949 950 static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p) 951 { 952 struct btree_iter iter; 953 struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0); 954 int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set; 955 bch2_trans_iter_exit(trans, &iter); 956 return ret; 957 } 958 959 static int check_inode_dirent_inode(struct btree_trans *trans, 960 struct bch_inode_unpacked *inode, 961 bool *write_inode) 962 { 963 struct bch_fs *c = trans->c; 964 struct printbuf buf = PRINTBUF; 965 966 u32 inode_snapshot = inode->bi_snapshot; 967 struct btree_iter dirent_iter = {}; 968 struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot); 969 int ret = bkey_err(d); 970 if (ret && !bch2_err_matches(ret, ENOENT)) 971 return ret; 972 973 if (fsck_err_on(ret, 974 trans, inode_points_to_missing_dirent, 975 "inode points to missing dirent\n%s", 976 (bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) || 977 fsck_err_on(!ret && dirent_points_to_inode_nowarn(d, inode), 978 trans, inode_points_to_wrong_dirent, 979 "%s", 980 (printbuf_reset(&buf), 981 dirent_inode_mismatch_msg(&buf, c, d, inode), 982 buf.buf))) { 983 /* 984 * We just clear the backpointer fields for now. If we find a 985 * dirent that points to this inode in check_dirents(), we'll 986 * update it then; then when we get to check_path() if the 987 * backpointer is still 0 we'll reattach it. 988 */ 989 inode->bi_dir = 0; 990 inode->bi_dir_offset = 0; 991 inode->bi_flags &= ~BCH_INODE_backptr_untrusted; 992 *write_inode = true; 993 } 994 995 ret = 0; 996 fsck_err: 997 bch2_trans_iter_exit(trans, &dirent_iter); 998 printbuf_exit(&buf); 999 bch_err_fn(c, ret); 1000 return ret; 1001 } 1002 1003 static bool bch2_inode_is_open(struct bch_fs *c, struct bpos p) 1004 { 1005 subvol_inum inum = { 1006 .subvol = snapshot_t(c, p.snapshot)->subvol, 1007 .inum = p.offset, 1008 }; 1009 1010 /* snapshot tree corruption, can't safely delete */ 1011 if (!inum.subvol) { 1012 bch_warn_ratelimited(c, "%s(): snapshot %u has no subvol, unlinked but can't safely delete", __func__, p.snapshot); 1013 return true; 1014 } 1015 1016 return __bch2_inode_hash_find(c, inum) != NULL; 1017 } 1018 1019 static int check_inode(struct btree_trans *trans, 1020 struct btree_iter *iter, 1021 struct bkey_s_c k, 1022 struct bch_inode_unpacked *prev, 1023 struct snapshots_seen *s, 1024 bool full) 1025 { 1026 struct bch_fs *c = trans->c; 1027 struct bch_inode_unpacked u; 1028 bool do_update = false; 1029 int ret; 1030 1031 ret = bch2_check_key_has_snapshot(trans, iter, k); 1032 if (ret < 0) 1033 goto err; 1034 if (ret) 1035 return 0; 1036 1037 ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); 1038 if (ret) 1039 goto err; 1040 1041 if (!bkey_is_inode(k.k)) 1042 return 0; 1043 1044 BUG_ON(bch2_inode_unpack(k, &u)); 1045 1046 if (!full && 1047 !(u.bi_flags & (BCH_INODE_i_size_dirty| 1048 BCH_INODE_i_sectors_dirty| 1049 BCH_INODE_unlinked))) 1050 return 0; 1051 1052 if (prev->bi_inum != u.bi_inum) 1053 *prev = u; 1054 1055 if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed || 1056 inode_d_type(prev) != inode_d_type(&u), 1057 trans, inode_snapshot_mismatch, 1058 "inodes in different snapshots don't match")) { 1059 bch_err(c, "repair not implemented yet"); 1060 return -BCH_ERR_fsck_repair_unimplemented; 1061 } 1062 1063 if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) && 1064 bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) { 1065 struct bpos new_min_pos; 1066 1067 ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos); 1068 if (ret) 1069 goto err; 1070 1071 u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked; 1072 1073 ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot); 1074 1075 bch_err_msg(c, ret, "in fsck updating inode"); 1076 if (ret) 1077 return ret; 1078 1079 if (!bpos_eq(new_min_pos, POS_MIN)) 1080 bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos)); 1081 return 0; 1082 } 1083 1084 if (u.bi_flags & BCH_INODE_unlinked) { 1085 if (!test_bit(BCH_FS_started, &c->flags)) { 1086 /* 1087 * If we're not in online fsck, don't delete unlinked 1088 * inodes, just make sure they're on the deleted list. 1089 * 1090 * They might be referred to by a logged operation - 1091 * i.e. we might have crashed in the middle of a 1092 * truncate on an unlinked but open file - so we want to 1093 * let the delete_dead_inodes kill it after resuming 1094 * logged ops. 1095 */ 1096 ret = check_inode_deleted_list(trans, k.k->p); 1097 if (ret < 0) 1098 return ret; 1099 1100 fsck_err_on(!ret, 1101 trans, unlinked_inode_not_on_deleted_list, 1102 "inode %llu:%u unlinked, but not on deleted list", 1103 u.bi_inum, k.k->p.snapshot); 1104 1105 ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, k.k->p, 1); 1106 if (ret) 1107 goto err; 1108 } else { 1109 if (fsck_err_on(bch2_inode_is_open(c, k.k->p), 1110 trans, inode_unlinked_and_not_open, 1111 "inode %llu%u unlinked and not open", 1112 u.bi_inum, u.bi_snapshot)) { 1113 ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); 1114 bch_err_msg(c, ret, "in fsck deleting inode"); 1115 return ret; 1116 } 1117 } 1118 } 1119 1120 /* i_size_dirty is vestigal, since we now have logged ops for truncate * */ 1121 if (u.bi_flags & BCH_INODE_i_size_dirty && 1122 (!test_bit(BCH_FS_clean_recovery, &c->flags) || 1123 fsck_err(trans, inode_i_size_dirty_but_clean, 1124 "filesystem marked clean, but inode %llu has i_size dirty", 1125 u.bi_inum))) { 1126 bch_verbose(c, "truncating inode %llu", u.bi_inum); 1127 1128 /* 1129 * XXX: need to truncate partial blocks too here - or ideally 1130 * just switch units to bytes and that issue goes away 1131 */ 1132 ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, 1133 SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9, 1134 iter->pos.snapshot), 1135 POS(u.bi_inum, U64_MAX), 1136 0, NULL); 1137 bch_err_msg(c, ret, "in fsck truncating inode"); 1138 if (ret) 1139 return ret; 1140 1141 /* 1142 * We truncated without our normal sector accounting hook, just 1143 * make sure we recalculate it: 1144 */ 1145 u.bi_flags |= BCH_INODE_i_sectors_dirty; 1146 1147 u.bi_flags &= ~BCH_INODE_i_size_dirty; 1148 do_update = true; 1149 } 1150 1151 /* i_sectors_dirty is vestigal, i_sectors is always updated transactionally */ 1152 if (u.bi_flags & BCH_INODE_i_sectors_dirty && 1153 (!test_bit(BCH_FS_clean_recovery, &c->flags) || 1154 fsck_err(trans, inode_i_sectors_dirty_but_clean, 1155 "filesystem marked clean, but inode %llu has i_sectors dirty", 1156 u.bi_inum))) { 1157 s64 sectors; 1158 1159 bch_verbose(c, "recounting sectors for inode %llu", 1160 u.bi_inum); 1161 1162 sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot); 1163 if (sectors < 0) { 1164 bch_err_msg(c, sectors, "in fsck recounting inode sectors"); 1165 return sectors; 1166 } 1167 1168 u.bi_sectors = sectors; 1169 u.bi_flags &= ~BCH_INODE_i_sectors_dirty; 1170 do_update = true; 1171 } 1172 1173 if (u.bi_flags & BCH_INODE_backptr_untrusted) { 1174 u.bi_dir = 0; 1175 u.bi_dir_offset = 0; 1176 u.bi_flags &= ~BCH_INODE_backptr_untrusted; 1177 do_update = true; 1178 } 1179 1180 if (u.bi_dir || u.bi_dir_offset) { 1181 ret = check_inode_dirent_inode(trans, &u, &do_update); 1182 if (ret) 1183 goto err; 1184 } 1185 1186 if (fsck_err_on(u.bi_parent_subvol && 1187 (u.bi_subvol == 0 || 1188 u.bi_subvol == BCACHEFS_ROOT_SUBVOL), 1189 trans, inode_bi_parent_nonzero, 1190 "inode %llu:%u has subvol %u but nonzero parent subvol %u", 1191 u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) { 1192 u.bi_parent_subvol = 0; 1193 do_update = true; 1194 } 1195 1196 if (u.bi_subvol) { 1197 struct bch_subvolume s; 1198 1199 ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s); 1200 if (ret && !bch2_err_matches(ret, ENOENT)) 1201 goto err; 1202 1203 if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) { 1204 ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum); 1205 goto do_update; 1206 } 1207 1208 if (fsck_err_on(ret, 1209 trans, inode_bi_subvol_missing, 1210 "inode %llu:%u bi_subvol points to missing subvolume %u", 1211 u.bi_inum, k.k->p.snapshot, u.bi_subvol) || 1212 fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum || 1213 !bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot), 1214 k.k->p.snapshot), 1215 trans, inode_bi_subvol_wrong, 1216 "inode %llu:%u points to subvol %u, but subvol points to %llu:%u", 1217 u.bi_inum, k.k->p.snapshot, u.bi_subvol, 1218 le64_to_cpu(s.inode), 1219 le32_to_cpu(s.snapshot))) { 1220 u.bi_subvol = 0; 1221 u.bi_parent_subvol = 0; 1222 do_update = true; 1223 } 1224 } 1225 do_update: 1226 if (do_update) { 1227 ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot); 1228 bch_err_msg(c, ret, "in fsck updating inode"); 1229 if (ret) 1230 return ret; 1231 } 1232 err: 1233 fsck_err: 1234 bch_err_fn(c, ret); 1235 return ret; 1236 } 1237 1238 int bch2_check_inodes(struct bch_fs *c) 1239 { 1240 bool full = c->opts.fsck; 1241 struct bch_inode_unpacked prev = { 0 }; 1242 struct snapshots_seen s; 1243 1244 snapshots_seen_init(&s); 1245 1246 int ret = bch2_trans_run(c, 1247 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, 1248 POS_MIN, 1249 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, 1250 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 1251 check_inode(trans, &iter, k, &prev, &s, full))); 1252 1253 snapshots_seen_exit(&s); 1254 bch_err_fn(c, ret); 1255 return ret; 1256 } 1257 1258 static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode) 1259 { 1260 switch (btree) { 1261 case BTREE_ID_extents: 1262 return S_ISREG(mode) || S_ISLNK(mode); 1263 case BTREE_ID_dirents: 1264 return S_ISDIR(mode); 1265 case BTREE_ID_xattrs: 1266 return true; 1267 default: 1268 BUG(); 1269 } 1270 } 1271 1272 static int check_key_has_inode(struct btree_trans *trans, 1273 struct btree_iter *iter, 1274 struct inode_walker *inode, 1275 struct inode_walker_entry *i, 1276 struct bkey_s_c k) 1277 { 1278 struct bch_fs *c = trans->c; 1279 struct printbuf buf = PRINTBUF; 1280 int ret = PTR_ERR_OR_ZERO(i); 1281 if (ret) 1282 return ret; 1283 1284 if (k.k->type == KEY_TYPE_whiteout) 1285 goto out; 1286 1287 if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) { 1288 ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?: 1289 bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); 1290 if (ret) 1291 goto err; 1292 1293 inode->last_pos.inode--; 1294 ret = -BCH_ERR_transaction_restart_nested; 1295 goto err; 1296 } 1297 1298 if (fsck_err_on(!i, 1299 trans, key_in_missing_inode, 1300 "key in missing inode:\n %s", 1301 (printbuf_reset(&buf), 1302 bch2_bkey_val_to_text(&buf, c, k), buf.buf))) 1303 goto delete; 1304 1305 if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode), 1306 trans, key_in_wrong_inode_type, 1307 "key for wrong inode mode %o:\n %s", 1308 i->inode.bi_mode, 1309 (printbuf_reset(&buf), 1310 bch2_bkey_val_to_text(&buf, c, k), buf.buf))) 1311 goto delete; 1312 out: 1313 err: 1314 fsck_err: 1315 printbuf_exit(&buf); 1316 bch_err_fn(c, ret); 1317 return ret; 1318 delete: 1319 ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node); 1320 goto out; 1321 } 1322 1323 static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w) 1324 { 1325 struct bch_fs *c = trans->c; 1326 int ret = 0; 1327 s64 count2; 1328 1329 darray_for_each(w->inodes, i) { 1330 if (i->inode.bi_sectors == i->count) 1331 continue; 1332 1333 count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot); 1334 1335 if (w->recalculate_sums) 1336 i->count = count2; 1337 1338 if (i->count != count2) { 1339 bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", 1340 w->last_pos.inode, i->snapshot, i->count, count2); 1341 return -BCH_ERR_internal_fsck_err; 1342 } 1343 1344 if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty), 1345 trans, inode_i_sectors_wrong, 1346 "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", 1347 w->last_pos.inode, i->snapshot, 1348 i->inode.bi_sectors, i->count)) { 1349 i->inode.bi_sectors = i->count; 1350 ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot); 1351 if (ret) 1352 break; 1353 } 1354 } 1355 fsck_err: 1356 bch_err_fn(c, ret); 1357 return ret; 1358 } 1359 1360 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) 1361 { 1362 u32 restart_count = trans->restart_count; 1363 return check_i_sectors_notnested(trans, w) ?: 1364 trans_was_restarted(trans, restart_count); 1365 } 1366 1367 struct extent_end { 1368 u32 snapshot; 1369 u64 offset; 1370 struct snapshots_seen seen; 1371 }; 1372 1373 struct extent_ends { 1374 struct bpos last_pos; 1375 DARRAY(struct extent_end) e; 1376 }; 1377 1378 static void extent_ends_reset(struct extent_ends *extent_ends) 1379 { 1380 darray_for_each(extent_ends->e, i) 1381 snapshots_seen_exit(&i->seen); 1382 extent_ends->e.nr = 0; 1383 } 1384 1385 static void extent_ends_exit(struct extent_ends *extent_ends) 1386 { 1387 extent_ends_reset(extent_ends); 1388 darray_exit(&extent_ends->e); 1389 } 1390 1391 static void extent_ends_init(struct extent_ends *extent_ends) 1392 { 1393 memset(extent_ends, 0, sizeof(*extent_ends)); 1394 } 1395 1396 static int extent_ends_at(struct bch_fs *c, 1397 struct extent_ends *extent_ends, 1398 struct snapshots_seen *seen, 1399 struct bkey_s_c k) 1400 { 1401 struct extent_end *i, n = (struct extent_end) { 1402 .offset = k.k->p.offset, 1403 .snapshot = k.k->p.snapshot, 1404 .seen = *seen, 1405 }; 1406 1407 n.seen.ids.data = kmemdup(seen->ids.data, 1408 sizeof(seen->ids.data[0]) * seen->ids.size, 1409 GFP_KERNEL); 1410 if (!n.seen.ids.data) 1411 return -BCH_ERR_ENOMEM_fsck_extent_ends_at; 1412 1413 __darray_for_each(extent_ends->e, i) { 1414 if (i->snapshot == k.k->p.snapshot) { 1415 snapshots_seen_exit(&i->seen); 1416 *i = n; 1417 return 0; 1418 } 1419 1420 if (i->snapshot >= k.k->p.snapshot) 1421 break; 1422 } 1423 1424 return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n); 1425 } 1426 1427 static int overlapping_extents_found(struct btree_trans *trans, 1428 enum btree_id btree, 1429 struct bpos pos1, struct snapshots_seen *pos1_seen, 1430 struct bkey pos2, 1431 bool *fixed, 1432 struct extent_end *extent_end) 1433 { 1434 struct bch_fs *c = trans->c; 1435 struct printbuf buf = PRINTBUF; 1436 struct btree_iter iter1, iter2 = { NULL }; 1437 struct bkey_s_c k1, k2; 1438 int ret; 1439 1440 BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2))); 1441 1442 bch2_trans_iter_init(trans, &iter1, btree, pos1, 1443 BTREE_ITER_all_snapshots| 1444 BTREE_ITER_not_extents); 1445 k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX)); 1446 ret = bkey_err(k1); 1447 if (ret) 1448 goto err; 1449 1450 prt_str(&buf, "\n "); 1451 bch2_bkey_val_to_text(&buf, c, k1); 1452 1453 if (!bpos_eq(pos1, k1.k->p)) { 1454 prt_str(&buf, "\n wanted\n "); 1455 bch2_bpos_to_text(&buf, pos1); 1456 prt_str(&buf, "\n "); 1457 bch2_bkey_to_text(&buf, &pos2); 1458 1459 bch_err(c, "%s: error finding first overlapping extent when repairing, got%s", 1460 __func__, buf.buf); 1461 ret = -BCH_ERR_internal_fsck_err; 1462 goto err; 1463 } 1464 1465 bch2_trans_copy_iter(&iter2, &iter1); 1466 1467 while (1) { 1468 bch2_btree_iter_advance(&iter2); 1469 1470 k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX)); 1471 ret = bkey_err(k2); 1472 if (ret) 1473 goto err; 1474 1475 if (bpos_ge(k2.k->p, pos2.p)) 1476 break; 1477 } 1478 1479 prt_str(&buf, "\n "); 1480 bch2_bkey_val_to_text(&buf, c, k2); 1481 1482 if (bpos_gt(k2.k->p, pos2.p) || 1483 pos2.size != k2.k->size) { 1484 bch_err(c, "%s: error finding seconding overlapping extent when repairing%s", 1485 __func__, buf.buf); 1486 ret = -BCH_ERR_internal_fsck_err; 1487 goto err; 1488 } 1489 1490 prt_printf(&buf, "\n overwriting %s extent", 1491 pos1.snapshot >= pos2.p.snapshot ? "first" : "second"); 1492 1493 if (fsck_err(trans, extent_overlapping, 1494 "overlapping extents%s", buf.buf)) { 1495 struct btree_iter *old_iter = &iter1; 1496 struct disk_reservation res = { 0 }; 1497 1498 if (pos1.snapshot < pos2.p.snapshot) { 1499 old_iter = &iter2; 1500 swap(k1, k2); 1501 } 1502 1503 trans->extra_disk_res += bch2_bkey_sectors_compressed(k2); 1504 1505 ret = bch2_trans_update_extent_overwrite(trans, old_iter, 1506 BTREE_UPDATE_internal_snapshot_node, 1507 k1, k2) ?: 1508 bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc); 1509 bch2_disk_reservation_put(c, &res); 1510 1511 if (ret) 1512 goto err; 1513 1514 *fixed = true; 1515 1516 if (pos1.snapshot == pos2.p.snapshot) { 1517 /* 1518 * We overwrote the first extent, and did the overwrite 1519 * in the same snapshot: 1520 */ 1521 extent_end->offset = bkey_start_offset(&pos2); 1522 } else if (pos1.snapshot > pos2.p.snapshot) { 1523 /* 1524 * We overwrote the first extent in pos2's snapshot: 1525 */ 1526 ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot); 1527 } else { 1528 /* 1529 * We overwrote the second extent - restart 1530 * check_extent() from the top: 1531 */ 1532 ret = -BCH_ERR_transaction_restart_nested; 1533 } 1534 } 1535 fsck_err: 1536 err: 1537 bch2_trans_iter_exit(trans, &iter2); 1538 bch2_trans_iter_exit(trans, &iter1); 1539 printbuf_exit(&buf); 1540 return ret; 1541 } 1542 1543 static int check_overlapping_extents(struct btree_trans *trans, 1544 struct snapshots_seen *seen, 1545 struct extent_ends *extent_ends, 1546 struct bkey_s_c k, 1547 struct btree_iter *iter, 1548 bool *fixed) 1549 { 1550 struct bch_fs *c = trans->c; 1551 int ret = 0; 1552 1553 /* transaction restart, running again */ 1554 if (bpos_eq(extent_ends->last_pos, k.k->p)) 1555 return 0; 1556 1557 if (extent_ends->last_pos.inode != k.k->p.inode) 1558 extent_ends_reset(extent_ends); 1559 1560 darray_for_each(extent_ends->e, i) { 1561 if (i->offset <= bkey_start_offset(k.k)) 1562 continue; 1563 1564 if (!ref_visible2(c, 1565 k.k->p.snapshot, seen, 1566 i->snapshot, &i->seen)) 1567 continue; 1568 1569 ret = overlapping_extents_found(trans, iter->btree_id, 1570 SPOS(iter->pos.inode, 1571 i->offset, 1572 i->snapshot), 1573 &i->seen, 1574 *k.k, fixed, i); 1575 if (ret) 1576 goto err; 1577 } 1578 1579 extent_ends->last_pos = k.k->p; 1580 err: 1581 return ret; 1582 } 1583 1584 static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter, 1585 struct bkey_s_c k) 1586 { 1587 struct bch_fs *c = trans->c; 1588 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 1589 struct bch_extent_crc_unpacked crc; 1590 const union bch_extent_entry *i; 1591 unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9; 1592 1593 bkey_for_each_crc(k.k, ptrs, crc, i) 1594 if (crc_is_encoded(crc) && 1595 crc.uncompressed_size > encoded_extent_max_sectors) { 1596 struct printbuf buf = PRINTBUF; 1597 1598 bch2_bkey_val_to_text(&buf, c, k); 1599 bch_err(c, "overbig encoded extent, please report this:\n %s", buf.buf); 1600 printbuf_exit(&buf); 1601 } 1602 1603 return 0; 1604 } 1605 1606 static int check_extent(struct btree_trans *trans, struct btree_iter *iter, 1607 struct bkey_s_c k, 1608 struct inode_walker *inode, 1609 struct snapshots_seen *s, 1610 struct extent_ends *extent_ends, 1611 struct disk_reservation *res) 1612 { 1613 struct bch_fs *c = trans->c; 1614 struct printbuf buf = PRINTBUF; 1615 int ret = 0; 1616 1617 ret = bch2_check_key_has_snapshot(trans, iter, k); 1618 if (ret) { 1619 ret = ret < 0 ? ret : 0; 1620 goto out; 1621 } 1622 1623 if (inode->last_pos.inode != k.k->p.inode && inode->have_inodes) { 1624 ret = check_i_sectors(trans, inode); 1625 if (ret) 1626 goto err; 1627 } 1628 1629 ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); 1630 if (ret) 1631 goto err; 1632 1633 struct inode_walker_entry *extent_i = walk_inode(trans, inode, k); 1634 ret = PTR_ERR_OR_ZERO(extent_i); 1635 if (ret) 1636 goto err; 1637 1638 ret = check_key_has_inode(trans, iter, inode, extent_i, k); 1639 if (ret) 1640 goto err; 1641 1642 if (k.k->type != KEY_TYPE_whiteout) { 1643 ret = check_overlapping_extents(trans, s, extent_ends, k, iter, 1644 &inode->recalculate_sums); 1645 if (ret) 1646 goto err; 1647 1648 /* 1649 * Check inodes in reverse order, from oldest snapshots to 1650 * newest, starting from the inode that matches this extent's 1651 * snapshot. If we didn't have one, iterate over all inodes: 1652 */ 1653 for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); 1654 inode->inodes.data && i >= inode->inodes.data; 1655 --i) { 1656 if (i->snapshot > k.k->p.snapshot || 1657 !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) 1658 continue; 1659 1660 if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) && 1661 k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && 1662 !bkey_extent_is_reservation(k), 1663 trans, extent_past_end_of_inode, 1664 "extent type past end of inode %llu:%u, i_size %llu\n %s", 1665 i->inode.bi_inum, i->snapshot, i->inode.bi_size, 1666 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { 1667 struct btree_iter iter2; 1668 1669 bch2_trans_copy_iter(&iter2, iter); 1670 bch2_btree_iter_set_snapshot(&iter2, i->snapshot); 1671 ret = bch2_btree_iter_traverse(&iter2) ?: 1672 bch2_btree_delete_at(trans, &iter2, 1673 BTREE_UPDATE_internal_snapshot_node); 1674 bch2_trans_iter_exit(trans, &iter2); 1675 if (ret) 1676 goto err; 1677 1678 iter->k.type = KEY_TYPE_whiteout; 1679 break; 1680 } 1681 } 1682 } 1683 1684 ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc); 1685 if (ret) 1686 goto err; 1687 1688 if (bkey_extent_is_allocation(k.k)) { 1689 for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); 1690 inode->inodes.data && i >= inode->inodes.data; 1691 --i) { 1692 if (i->snapshot > k.k->p.snapshot || 1693 !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) 1694 continue; 1695 1696 i->count += k.k->size; 1697 } 1698 } 1699 1700 if (k.k->type != KEY_TYPE_whiteout) { 1701 ret = extent_ends_at(c, extent_ends, s, k); 1702 if (ret) 1703 goto err; 1704 } 1705 out: 1706 err: 1707 fsck_err: 1708 printbuf_exit(&buf); 1709 bch_err_fn(c, ret); 1710 return ret; 1711 } 1712 1713 /* 1714 * Walk extents: verify that extents have a corresponding S_ISREG inode, and 1715 * that i_size an i_sectors are consistent 1716 */ 1717 int bch2_check_extents(struct bch_fs *c) 1718 { 1719 struct inode_walker w = inode_walker_init(); 1720 struct snapshots_seen s; 1721 struct extent_ends extent_ends; 1722 struct disk_reservation res = { 0 }; 1723 1724 snapshots_seen_init(&s); 1725 extent_ends_init(&extent_ends); 1726 1727 int ret = bch2_trans_run(c, 1728 for_each_btree_key(trans, iter, BTREE_ID_extents, 1729 POS(BCACHEFS_ROOT_INO, 0), 1730 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ 1731 bch2_disk_reservation_put(c, &res); 1732 check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?: 1733 check_extent_overbig(trans, &iter, k); 1734 })) ?: 1735 check_i_sectors_notnested(trans, &w)); 1736 1737 bch2_disk_reservation_put(c, &res); 1738 extent_ends_exit(&extent_ends); 1739 inode_walker_exit(&w); 1740 snapshots_seen_exit(&s); 1741 1742 bch_err_fn(c, ret); 1743 return ret; 1744 } 1745 1746 int bch2_check_indirect_extents(struct bch_fs *c) 1747 { 1748 struct disk_reservation res = { 0 }; 1749 1750 int ret = bch2_trans_run(c, 1751 for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, 1752 POS_MIN, 1753 BTREE_ITER_prefetch, k, 1754 &res, NULL, 1755 BCH_TRANS_COMMIT_no_enospc, ({ 1756 bch2_disk_reservation_put(c, &res); 1757 check_extent_overbig(trans, &iter, k); 1758 }))); 1759 1760 bch2_disk_reservation_put(c, &res); 1761 bch_err_fn(c, ret); 1762 return ret; 1763 } 1764 1765 static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w) 1766 { 1767 struct bch_fs *c = trans->c; 1768 int ret = 0; 1769 s64 count2; 1770 1771 darray_for_each(w->inodes, i) { 1772 if (i->inode.bi_nlink == i->count) 1773 continue; 1774 1775 count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot); 1776 if (count2 < 0) 1777 return count2; 1778 1779 if (i->count != count2) { 1780 bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu", 1781 w->last_pos.inode, i->snapshot, i->count, count2); 1782 i->count = count2; 1783 if (i->inode.bi_nlink == i->count) 1784 continue; 1785 } 1786 1787 if (fsck_err_on(i->inode.bi_nlink != i->count, 1788 trans, inode_dir_wrong_nlink, 1789 "directory %llu:%u with wrong i_nlink: got %u, should be %llu", 1790 w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) { 1791 i->inode.bi_nlink = i->count; 1792 ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot); 1793 if (ret) 1794 break; 1795 } 1796 } 1797 fsck_err: 1798 bch_err_fn(c, ret); 1799 return ret; 1800 } 1801 1802 static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) 1803 { 1804 u32 restart_count = trans->restart_count; 1805 return check_subdir_count_notnested(trans, w) ?: 1806 trans_was_restarted(trans, restart_count); 1807 } 1808 1809 noinline_for_stack 1810 static int check_dirent_inode_dirent(struct btree_trans *trans, 1811 struct btree_iter *iter, 1812 struct bkey_s_c_dirent d, 1813 struct bch_inode_unpacked *target, 1814 u32 target_snapshot) 1815 { 1816 struct bch_fs *c = trans->c; 1817 struct printbuf buf = PRINTBUF; 1818 struct btree_iter bp_iter = { NULL }; 1819 int ret = 0; 1820 1821 if (inode_points_to_dirent(target, d)) 1822 return 0; 1823 1824 if (bch2_inode_should_have_bp(target) && 1825 !fsck_err(trans, inode_wrong_backpointer, 1826 "dirent points to inode that does not point back:\n %s", 1827 (bch2_bkey_val_to_text(&buf, c, d.s_c), 1828 prt_printf(&buf, "\n "), 1829 bch2_inode_unpacked_to_text(&buf, target), 1830 buf.buf))) 1831 goto err; 1832 1833 if (!target->bi_dir && 1834 !target->bi_dir_offset) { 1835 target->bi_dir = d.k->p.inode; 1836 target->bi_dir_offset = d.k->p.offset; 1837 return __bch2_fsck_write_inode(trans, target, target_snapshot); 1838 } 1839 1840 struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter, 1841 SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot)); 1842 ret = bkey_err(bp_dirent); 1843 if (ret && !bch2_err_matches(ret, ENOENT)) 1844 goto err; 1845 1846 bool backpointer_exists = !ret; 1847 ret = 0; 1848 1849 if (fsck_err_on(!backpointer_exists, 1850 trans, inode_wrong_backpointer, 1851 "inode %llu:%u has wrong backpointer:\n" 1852 "got %llu:%llu\n" 1853 "should be %llu:%llu", 1854 target->bi_inum, target_snapshot, 1855 target->bi_dir, 1856 target->bi_dir_offset, 1857 d.k->p.inode, 1858 d.k->p.offset)) { 1859 target->bi_dir = d.k->p.inode; 1860 target->bi_dir_offset = d.k->p.offset; 1861 ret = __bch2_fsck_write_inode(trans, target, target_snapshot); 1862 goto out; 1863 } 1864 1865 bch2_bkey_val_to_text(&buf, c, d.s_c); 1866 prt_newline(&buf); 1867 if (backpointer_exists) 1868 bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); 1869 1870 if (fsck_err_on(backpointer_exists && 1871 (S_ISDIR(target->bi_mode) || 1872 target->bi_subvol), 1873 trans, inode_dir_multiple_links, 1874 "%s %llu:%u with multiple links\n%s", 1875 S_ISDIR(target->bi_mode) ? "directory" : "subvolume", 1876 target->bi_inum, target_snapshot, buf.buf)) { 1877 ret = __remove_dirent(trans, d.k->p); 1878 goto out; 1879 } 1880 1881 /* 1882 * hardlinked file with nlink 0: 1883 * We're just adjusting nlink here so check_nlinks() will pick 1884 * it up, it ignores inodes with nlink 0 1885 */ 1886 if (fsck_err_on(backpointer_exists && !target->bi_nlink, 1887 trans, inode_multiple_links_but_nlink_0, 1888 "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", 1889 target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { 1890 target->bi_nlink++; 1891 target->bi_flags &= ~BCH_INODE_unlinked; 1892 ret = __bch2_fsck_write_inode(trans, target, target_snapshot); 1893 if (ret) 1894 goto err; 1895 } 1896 out: 1897 err: 1898 fsck_err: 1899 bch2_trans_iter_exit(trans, &bp_iter); 1900 printbuf_exit(&buf); 1901 bch_err_fn(c, ret); 1902 return ret; 1903 } 1904 1905 noinline_for_stack 1906 static int check_dirent_target(struct btree_trans *trans, 1907 struct btree_iter *iter, 1908 struct bkey_s_c_dirent d, 1909 struct bch_inode_unpacked *target, 1910 u32 target_snapshot) 1911 { 1912 struct bch_fs *c = trans->c; 1913 struct bkey_i_dirent *n; 1914 struct printbuf buf = PRINTBUF; 1915 int ret = 0; 1916 1917 ret = check_dirent_inode_dirent(trans, iter, d, target, target_snapshot); 1918 if (ret) 1919 goto err; 1920 1921 if (fsck_err_on(d.v->d_type != inode_d_type(target), 1922 trans, dirent_d_type_wrong, 1923 "incorrect d_type: got %s, should be %s:\n%s", 1924 bch2_d_type_str(d.v->d_type), 1925 bch2_d_type_str(inode_d_type(target)), 1926 (printbuf_reset(&buf), 1927 bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { 1928 n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); 1929 ret = PTR_ERR_OR_ZERO(n); 1930 if (ret) 1931 goto err; 1932 1933 bkey_reassemble(&n->k_i, d.s_c); 1934 n->v.d_type = inode_d_type(target); 1935 if (n->v.d_type == DT_SUBVOL) { 1936 n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); 1937 n->v.d_child_subvol = cpu_to_le32(target->bi_subvol); 1938 } else { 1939 n->v.d_inum = cpu_to_le64(target->bi_inum); 1940 } 1941 1942 ret = bch2_trans_update(trans, iter, &n->k_i, 0); 1943 if (ret) 1944 goto err; 1945 1946 d = dirent_i_to_s_c(n); 1947 } 1948 err: 1949 fsck_err: 1950 printbuf_exit(&buf); 1951 bch_err_fn(c, ret); 1952 return ret; 1953 } 1954 1955 /* find a subvolume that's a descendent of @snapshot: */ 1956 static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid) 1957 { 1958 struct btree_iter iter; 1959 struct bkey_s_c k; 1960 int ret; 1961 1962 for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) { 1963 if (k.k->type != KEY_TYPE_subvolume) 1964 continue; 1965 1966 struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); 1967 if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) { 1968 bch2_trans_iter_exit(trans, &iter); 1969 *subvolid = k.k->p.offset; 1970 goto found; 1971 } 1972 } 1973 if (!ret) 1974 ret = -ENOENT; 1975 found: 1976 bch2_trans_iter_exit(trans, &iter); 1977 return ret; 1978 } 1979 1980 noinline_for_stack 1981 static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter, 1982 struct bkey_s_c_dirent d) 1983 { 1984 struct bch_fs *c = trans->c; 1985 struct btree_iter subvol_iter = {}; 1986 struct bch_inode_unpacked subvol_root; 1987 u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol); 1988 u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); 1989 u32 parent_snapshot; 1990 u32 new_parent_subvol = 0; 1991 u64 parent_inum; 1992 struct printbuf buf = PRINTBUF; 1993 int ret = 0; 1994 1995 ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum); 1996 if (ret && !bch2_err_matches(ret, ENOENT)) 1997 return ret; 1998 1999 if (ret || 2000 (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) { 2001 int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol); 2002 if (ret2 && !bch2_err_matches(ret, ENOENT)) 2003 return ret2; 2004 } 2005 2006 if (ret && 2007 !new_parent_subvol && 2008 (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) { 2009 /* 2010 * Couldn't find a subvol for dirent's snapshot - but we lost 2011 * subvols, so we need to reconstruct: 2012 */ 2013 ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0); 2014 if (ret) 2015 return ret; 2016 2017 parent_snapshot = d.k->p.snapshot; 2018 } 2019 2020 if (fsck_err_on(ret, 2021 trans, dirent_to_missing_parent_subvol, 2022 "dirent parent_subvol points to missing subvolume\n%s", 2023 (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) || 2024 fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot), 2025 trans, dirent_not_visible_in_parent_subvol, 2026 "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s", 2027 parent_snapshot, 2028 (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { 2029 if (!new_parent_subvol) { 2030 bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot); 2031 return -BCH_ERR_fsck_repair_unimplemented; 2032 } 2033 2034 struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent); 2035 ret = PTR_ERR_OR_ZERO(new_dirent); 2036 if (ret) 2037 goto err; 2038 2039 new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol); 2040 } 2041 2042 struct bkey_s_c_subvolume s = 2043 bch2_bkey_get_iter_typed(trans, &subvol_iter, 2044 BTREE_ID_subvolumes, POS(0, target_subvol), 2045 0, subvolume); 2046 ret = bkey_err(s.s_c); 2047 if (ret && !bch2_err_matches(ret, ENOENT)) 2048 return ret; 2049 2050 if (ret) { 2051 if (fsck_err(trans, dirent_to_missing_subvol, 2052 "dirent points to missing subvolume\n%s", 2053 (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) 2054 return __remove_dirent(trans, d.k->p); 2055 ret = 0; 2056 goto out; 2057 } 2058 2059 if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol, 2060 trans, subvol_fs_path_parent_wrong, 2061 "subvol with wrong fs_path_parent, should be be %u\n%s", 2062 parent_subvol, 2063 (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { 2064 struct bkey_i_subvolume *n = 2065 bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume); 2066 ret = PTR_ERR_OR_ZERO(n); 2067 if (ret) 2068 goto err; 2069 2070 n->v.fs_path_parent = cpu_to_le32(parent_subvol); 2071 } 2072 2073 u64 target_inum = le64_to_cpu(s.v->inode); 2074 u32 target_snapshot = le32_to_cpu(s.v->snapshot); 2075 2076 ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot); 2077 if (ret && !bch2_err_matches(ret, ENOENT)) 2078 goto err; 2079 2080 if (ret) { 2081 bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum); 2082 ret = -BCH_ERR_fsck_repair_unimplemented; 2083 goto err; 2084 } 2085 2086 if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol, 2087 trans, inode_bi_parent_wrong, 2088 "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u", 2089 target_inum, 2090 subvol_root.bi_parent_subvol, parent_subvol)) { 2091 subvol_root.bi_parent_subvol = parent_subvol; 2092 ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot); 2093 if (ret) 2094 goto err; 2095 } 2096 2097 ret = check_dirent_target(trans, iter, d, &subvol_root, 2098 target_snapshot); 2099 if (ret) 2100 goto err; 2101 out: 2102 err: 2103 fsck_err: 2104 bch2_trans_iter_exit(trans, &subvol_iter); 2105 printbuf_exit(&buf); 2106 return ret; 2107 } 2108 2109 static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, 2110 struct bkey_s_c k, 2111 struct bch_hash_info *hash_info, 2112 struct inode_walker *dir, 2113 struct inode_walker *target, 2114 struct snapshots_seen *s) 2115 { 2116 struct bch_fs *c = trans->c; 2117 struct inode_walker_entry *i; 2118 struct printbuf buf = PRINTBUF; 2119 int ret = 0; 2120 2121 ret = bch2_check_key_has_snapshot(trans, iter, k); 2122 if (ret) { 2123 ret = ret < 0 ? ret : 0; 2124 goto out; 2125 } 2126 2127 ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); 2128 if (ret) 2129 goto err; 2130 2131 if (k.k->type == KEY_TYPE_whiteout) 2132 goto out; 2133 2134 if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) { 2135 ret = check_subdir_count(trans, dir); 2136 if (ret) 2137 goto err; 2138 } 2139 2140 i = walk_inode(trans, dir, k); 2141 ret = PTR_ERR_OR_ZERO(i); 2142 if (ret < 0) 2143 goto err; 2144 2145 ret = check_key_has_inode(trans, iter, dir, i, k); 2146 if (ret) 2147 goto err; 2148 2149 if (!i) 2150 goto out; 2151 2152 if (dir->first_this_inode) 2153 *hash_info = bch2_hash_info_init(c, &i->inode); 2154 dir->first_this_inode = false; 2155 2156 ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k); 2157 if (ret < 0) 2158 goto err; 2159 if (ret) { 2160 /* dirent has been deleted */ 2161 ret = 0; 2162 goto out; 2163 } 2164 2165 if (k.k->type != KEY_TYPE_dirent) 2166 goto out; 2167 2168 struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); 2169 2170 if (d.v->d_type == DT_SUBVOL) { 2171 ret = check_dirent_to_subvol(trans, iter, d); 2172 if (ret) 2173 goto err; 2174 } else { 2175 ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum)); 2176 if (ret) 2177 goto err; 2178 2179 if (fsck_err_on(!target->inodes.nr, 2180 trans, dirent_to_missing_inode, 2181 "dirent points to missing inode:\n%s", 2182 (printbuf_reset(&buf), 2183 bch2_bkey_val_to_text(&buf, c, k), 2184 buf.buf))) { 2185 ret = __remove_dirent(trans, d.k->p); 2186 if (ret) 2187 goto err; 2188 } 2189 2190 darray_for_each(target->inodes, i) { 2191 ret = check_dirent_target(trans, iter, d, 2192 &i->inode, i->snapshot); 2193 if (ret) 2194 goto err; 2195 } 2196 } 2197 2198 ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); 2199 if (ret) 2200 goto err; 2201 2202 if (d.v->d_type == DT_DIR) 2203 for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) 2204 i->count++; 2205 out: 2206 err: 2207 fsck_err: 2208 printbuf_exit(&buf); 2209 bch_err_fn(c, ret); 2210 return ret; 2211 } 2212 2213 /* 2214 * Walk dirents: verify that they all have a corresponding S_ISDIR inode, 2215 * validate d_type 2216 */ 2217 int bch2_check_dirents(struct bch_fs *c) 2218 { 2219 struct inode_walker dir = inode_walker_init(); 2220 struct inode_walker target = inode_walker_init(); 2221 struct snapshots_seen s; 2222 struct bch_hash_info hash_info; 2223 2224 snapshots_seen_init(&s); 2225 2226 int ret = bch2_trans_run(c, 2227 for_each_btree_key(trans, iter, BTREE_ID_dirents, 2228 POS(BCACHEFS_ROOT_INO, 0), 2229 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, 2230 check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?: 2231 check_subdir_count_notnested(trans, &dir)); 2232 2233 snapshots_seen_exit(&s); 2234 inode_walker_exit(&dir); 2235 inode_walker_exit(&target); 2236 bch_err_fn(c, ret); 2237 return ret; 2238 } 2239 2240 static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, 2241 struct bkey_s_c k, 2242 struct bch_hash_info *hash_info, 2243 struct inode_walker *inode) 2244 { 2245 struct bch_fs *c = trans->c; 2246 struct inode_walker_entry *i; 2247 int ret; 2248 2249 ret = bch2_check_key_has_snapshot(trans, iter, k); 2250 if (ret < 0) 2251 return ret; 2252 if (ret) 2253 return 0; 2254 2255 i = walk_inode(trans, inode, k); 2256 ret = PTR_ERR_OR_ZERO(i); 2257 if (ret) 2258 return ret; 2259 2260 ret = check_key_has_inode(trans, iter, inode, i, k); 2261 if (ret) 2262 return ret; 2263 2264 if (!i) 2265 return 0; 2266 2267 if (inode->first_this_inode) 2268 *hash_info = bch2_hash_info_init(c, &i->inode); 2269 inode->first_this_inode = false; 2270 2271 ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); 2272 bch_err_fn(c, ret); 2273 return ret; 2274 } 2275 2276 /* 2277 * Walk xattrs: verify that they all have a corresponding inode 2278 */ 2279 int bch2_check_xattrs(struct bch_fs *c) 2280 { 2281 struct inode_walker inode = inode_walker_init(); 2282 struct bch_hash_info hash_info; 2283 int ret = 0; 2284 2285 ret = bch2_trans_run(c, 2286 for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, 2287 POS(BCACHEFS_ROOT_INO, 0), 2288 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, 2289 k, 2290 NULL, NULL, 2291 BCH_TRANS_COMMIT_no_enospc, 2292 check_xattr(trans, &iter, k, &hash_info, &inode))); 2293 2294 inode_walker_exit(&inode); 2295 bch_err_fn(c, ret); 2296 return ret; 2297 } 2298 2299 static int check_root_trans(struct btree_trans *trans) 2300 { 2301 struct bch_fs *c = trans->c; 2302 struct bch_inode_unpacked root_inode; 2303 u32 snapshot; 2304 u64 inum; 2305 int ret; 2306 2307 ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum); 2308 if (ret && !bch2_err_matches(ret, ENOENT)) 2309 return ret; 2310 2311 if (mustfix_fsck_err_on(ret, trans, root_subvol_missing, 2312 "root subvol missing")) { 2313 struct bkey_i_subvolume *root_subvol = 2314 bch2_trans_kmalloc(trans, sizeof(*root_subvol)); 2315 ret = PTR_ERR_OR_ZERO(root_subvol); 2316 if (ret) 2317 goto err; 2318 2319 snapshot = U32_MAX; 2320 inum = BCACHEFS_ROOT_INO; 2321 2322 bkey_subvolume_init(&root_subvol->k_i); 2323 root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL; 2324 root_subvol->v.flags = 0; 2325 root_subvol->v.snapshot = cpu_to_le32(snapshot); 2326 root_subvol->v.inode = cpu_to_le64(inum); 2327 ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0); 2328 bch_err_msg(c, ret, "writing root subvol"); 2329 if (ret) 2330 goto err; 2331 } 2332 2333 ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot); 2334 if (ret && !bch2_err_matches(ret, ENOENT)) 2335 return ret; 2336 2337 if (mustfix_fsck_err_on(ret, 2338 trans, root_dir_missing, 2339 "root directory missing") || 2340 mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), 2341 trans, root_inode_not_dir, 2342 "root inode not a directory")) { 2343 bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 2344 0, NULL); 2345 root_inode.bi_inum = inum; 2346 2347 ret = __bch2_fsck_write_inode(trans, &root_inode, snapshot); 2348 bch_err_msg(c, ret, "writing root inode"); 2349 } 2350 err: 2351 fsck_err: 2352 return ret; 2353 } 2354 2355 /* Get root directory, create if it doesn't exist: */ 2356 int bch2_check_root(struct bch_fs *c) 2357 { 2358 int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 2359 check_root_trans(trans)); 2360 bch_err_fn(c, ret); 2361 return ret; 2362 } 2363 2364 typedef DARRAY(u32) darray_u32; 2365 2366 static bool darray_u32_has(darray_u32 *d, u32 v) 2367 { 2368 darray_for_each(*d, i) 2369 if (*i == v) 2370 return true; 2371 return false; 2372 } 2373 2374 static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) 2375 { 2376 struct bch_fs *c = trans->c; 2377 struct btree_iter parent_iter = {}; 2378 darray_u32 subvol_path = {}; 2379 struct printbuf buf = PRINTBUF; 2380 int ret = 0; 2381 2382 if (k.k->type != KEY_TYPE_subvolume) 2383 return 0; 2384 2385 while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) { 2386 ret = darray_push(&subvol_path, k.k->p.offset); 2387 if (ret) 2388 goto err; 2389 2390 struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); 2391 2392 struct bch_inode_unpacked subvol_root; 2393 ret = bch2_inode_find_by_inum_trans(trans, 2394 (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) }, 2395 &subvol_root); 2396 if (ret) 2397 break; 2398 2399 /* 2400 * We've checked that inode backpointers point to valid dirents; 2401 * here, it's sufficient to check that the subvolume root has a 2402 * dirent: 2403 */ 2404 if (fsck_err_on(!subvol_root.bi_dir, 2405 trans, subvol_unreachable, 2406 "unreachable subvolume %s", 2407 (bch2_bkey_val_to_text(&buf, c, s.s_c), 2408 prt_newline(&buf), 2409 bch2_inode_unpacked_to_text(&buf, &subvol_root), 2410 buf.buf))) { 2411 ret = reattach_subvol(trans, s); 2412 break; 2413 } 2414 2415 u32 parent = le32_to_cpu(s.v->fs_path_parent); 2416 2417 if (darray_u32_has(&subvol_path, parent)) { 2418 if (fsck_err(c, subvol_loop, "subvolume loop")) 2419 ret = reattach_subvol(trans, s); 2420 break; 2421 } 2422 2423 bch2_trans_iter_exit(trans, &parent_iter); 2424 bch2_trans_iter_init(trans, &parent_iter, 2425 BTREE_ID_subvolumes, POS(0, parent), 0); 2426 k = bch2_btree_iter_peek_slot(&parent_iter); 2427 ret = bkey_err(k); 2428 if (ret) 2429 goto err; 2430 2431 if (fsck_err_on(k.k->type != KEY_TYPE_subvolume, 2432 trans, subvol_unreachable, 2433 "unreachable subvolume %s", 2434 (bch2_bkey_val_to_text(&buf, c, s.s_c), 2435 buf.buf))) { 2436 ret = reattach_subvol(trans, s); 2437 break; 2438 } 2439 } 2440 fsck_err: 2441 err: 2442 printbuf_exit(&buf); 2443 darray_exit(&subvol_path); 2444 bch2_trans_iter_exit(trans, &parent_iter); 2445 return ret; 2446 } 2447 2448 int bch2_check_subvolume_structure(struct bch_fs *c) 2449 { 2450 int ret = bch2_trans_run(c, 2451 for_each_btree_key_commit(trans, iter, 2452 BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, 2453 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 2454 check_subvol_path(trans, &iter, k))); 2455 bch_err_fn(c, ret); 2456 return ret; 2457 } 2458 2459 struct pathbuf_entry { 2460 u64 inum; 2461 u32 snapshot; 2462 }; 2463 2464 typedef DARRAY(struct pathbuf_entry) pathbuf; 2465 2466 static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) 2467 { 2468 darray_for_each(*p, i) 2469 if (i->inum == inum && 2470 i->snapshot == snapshot) 2471 return true; 2472 return false; 2473 } 2474 2475 /* 2476 * Check that a given inode is reachable from its subvolume root - we already 2477 * verified subvolume connectivity: 2478 * 2479 * XXX: we should also be verifying that inodes are in the right subvolumes 2480 */ 2481 static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k) 2482 { 2483 struct bch_fs *c = trans->c; 2484 struct btree_iter inode_iter = {}; 2485 struct bch_inode_unpacked inode; 2486 struct printbuf buf = PRINTBUF; 2487 u32 snapshot = inode_k.k->p.snapshot; 2488 int ret = 0; 2489 2490 p->nr = 0; 2491 2492 BUG_ON(bch2_inode_unpack(inode_k, &inode)); 2493 2494 while (!inode.bi_subvol) { 2495 struct btree_iter dirent_iter; 2496 struct bkey_s_c_dirent d; 2497 u32 parent_snapshot = snapshot; 2498 2499 d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot); 2500 ret = bkey_err(d.s_c); 2501 if (ret && !bch2_err_matches(ret, ENOENT)) 2502 break; 2503 2504 if (!ret && (ret = dirent_points_to_inode(c, d, &inode))) 2505 bch2_trans_iter_exit(trans, &dirent_iter); 2506 2507 if (bch2_err_matches(ret, ENOENT)) { 2508 ret = 0; 2509 if (fsck_err(trans, inode_unreachable, 2510 "unreachable inode\n%s", 2511 (printbuf_reset(&buf), 2512 bch2_bkey_val_to_text(&buf, c, inode_k), 2513 buf.buf))) 2514 ret = reattach_inode(trans, &inode, snapshot); 2515 goto out; 2516 } 2517 2518 bch2_trans_iter_exit(trans, &dirent_iter); 2519 2520 if (!S_ISDIR(inode.bi_mode)) 2521 break; 2522 2523 ret = darray_push(p, ((struct pathbuf_entry) { 2524 .inum = inode.bi_inum, 2525 .snapshot = snapshot, 2526 })); 2527 if (ret) 2528 return ret; 2529 2530 snapshot = parent_snapshot; 2531 2532 bch2_trans_iter_exit(trans, &inode_iter); 2533 inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, 2534 SPOS(0, inode.bi_dir, snapshot), 0); 2535 ret = bkey_err(inode_k) ?: 2536 !bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode 2537 : bch2_inode_unpack(inode_k, &inode); 2538 if (ret) { 2539 /* Should have been caught in dirents pass */ 2540 bch_err_msg(c, ret, "error looking up parent directory"); 2541 break; 2542 } 2543 2544 snapshot = inode_k.k->p.snapshot; 2545 2546 if (path_is_dup(p, inode.bi_inum, snapshot)) { 2547 /* XXX print path */ 2548 bch_err(c, "directory structure loop"); 2549 2550 darray_for_each(*p, i) 2551 pr_err("%llu:%u", i->inum, i->snapshot); 2552 pr_err("%llu:%u", inode.bi_inum, snapshot); 2553 2554 if (fsck_err(trans, dir_loop, "directory structure loop")) { 2555 ret = remove_backpointer(trans, &inode); 2556 bch_err_msg(c, ret, "removing dirent"); 2557 if (ret) 2558 break; 2559 2560 ret = reattach_inode(trans, &inode, snapshot); 2561 bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); 2562 } 2563 break; 2564 } 2565 } 2566 out: 2567 fsck_err: 2568 bch2_trans_iter_exit(trans, &inode_iter); 2569 printbuf_exit(&buf); 2570 bch_err_fn(c, ret); 2571 return ret; 2572 } 2573 2574 /* 2575 * Check for unreachable inodes, as well as loops in the directory structure: 2576 * After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's 2577 * unreachable: 2578 */ 2579 int bch2_check_directory_structure(struct bch_fs *c) 2580 { 2581 pathbuf path = { 0, }; 2582 int ret; 2583 2584 ret = bch2_trans_run(c, 2585 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN, 2586 BTREE_ITER_intent| 2587 BTREE_ITER_prefetch| 2588 BTREE_ITER_all_snapshots, k, 2589 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ 2590 if (!bkey_is_inode(k.k)) 2591 continue; 2592 2593 if (bch2_inode_flags(k) & BCH_INODE_unlinked) 2594 continue; 2595 2596 check_path(trans, &path, k); 2597 }))); 2598 darray_exit(&path); 2599 2600 bch_err_fn(c, ret); 2601 return ret; 2602 } 2603 2604 struct nlink_table { 2605 size_t nr; 2606 size_t size; 2607 2608 struct nlink { 2609 u64 inum; 2610 u32 snapshot; 2611 u32 count; 2612 } *d; 2613 }; 2614 2615 static int add_nlink(struct bch_fs *c, struct nlink_table *t, 2616 u64 inum, u32 snapshot) 2617 { 2618 if (t->nr == t->size) { 2619 size_t new_size = max_t(size_t, 128UL, t->size * 2); 2620 void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL); 2621 2622 if (!d) { 2623 bch_err(c, "fsck: error allocating memory for nlink_table, size %zu", 2624 new_size); 2625 return -BCH_ERR_ENOMEM_fsck_add_nlink; 2626 } 2627 2628 if (t->d) 2629 memcpy(d, t->d, t->size * sizeof(t->d[0])); 2630 kvfree(t->d); 2631 2632 t->d = d; 2633 t->size = new_size; 2634 } 2635 2636 2637 t->d[t->nr++] = (struct nlink) { 2638 .inum = inum, 2639 .snapshot = snapshot, 2640 }; 2641 2642 return 0; 2643 } 2644 2645 static int nlink_cmp(const void *_l, const void *_r) 2646 { 2647 const struct nlink *l = _l; 2648 const struct nlink *r = _r; 2649 2650 return cmp_int(l->inum, r->inum); 2651 } 2652 2653 static void inc_link(struct bch_fs *c, struct snapshots_seen *s, 2654 struct nlink_table *links, 2655 u64 range_start, u64 range_end, u64 inum, u32 snapshot) 2656 { 2657 struct nlink *link, key = { 2658 .inum = inum, .snapshot = U32_MAX, 2659 }; 2660 2661 if (inum < range_start || inum >= range_end) 2662 return; 2663 2664 link = __inline_bsearch(&key, links->d, links->nr, 2665 sizeof(links->d[0]), nlink_cmp); 2666 if (!link) 2667 return; 2668 2669 while (link > links->d && link[0].inum == link[-1].inum) 2670 --link; 2671 2672 for (; link < links->d + links->nr && link->inum == inum; link++) 2673 if (ref_visible(c, s, snapshot, link->snapshot)) { 2674 link->count++; 2675 if (link->snapshot >= snapshot) 2676 break; 2677 } 2678 } 2679 2680 noinline_for_stack 2681 static int check_nlinks_find_hardlinks(struct bch_fs *c, 2682 struct nlink_table *t, 2683 u64 start, u64 *end) 2684 { 2685 int ret = bch2_trans_run(c, 2686 for_each_btree_key(trans, iter, BTREE_ID_inodes, 2687 POS(0, start), 2688 BTREE_ITER_intent| 2689 BTREE_ITER_prefetch| 2690 BTREE_ITER_all_snapshots, k, ({ 2691 if (!bkey_is_inode(k.k)) 2692 continue; 2693 2694 /* Should never fail, checked by bch2_inode_invalid: */ 2695 struct bch_inode_unpacked u; 2696 BUG_ON(bch2_inode_unpack(k, &u)); 2697 2698 /* 2699 * Backpointer and directory structure checks are sufficient for 2700 * directories, since they can't have hardlinks: 2701 */ 2702 if (S_ISDIR(u.bi_mode)) 2703 continue; 2704 2705 if (!u.bi_nlink) 2706 continue; 2707 2708 ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot); 2709 if (ret) { 2710 *end = k.k->p.offset; 2711 ret = 0; 2712 break; 2713 } 2714 0; 2715 }))); 2716 2717 bch_err_fn(c, ret); 2718 return ret; 2719 } 2720 2721 noinline_for_stack 2722 static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links, 2723 u64 range_start, u64 range_end) 2724 { 2725 struct snapshots_seen s; 2726 2727 snapshots_seen_init(&s); 2728 2729 int ret = bch2_trans_run(c, 2730 for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN, 2731 BTREE_ITER_intent| 2732 BTREE_ITER_prefetch| 2733 BTREE_ITER_all_snapshots, k, ({ 2734 ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p); 2735 if (ret) 2736 break; 2737 2738 if (k.k->type == KEY_TYPE_dirent) { 2739 struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); 2740 2741 if (d.v->d_type != DT_DIR && 2742 d.v->d_type != DT_SUBVOL) 2743 inc_link(c, &s, links, range_start, range_end, 2744 le64_to_cpu(d.v->d_inum), d.k->p.snapshot); 2745 } 2746 0; 2747 }))); 2748 2749 snapshots_seen_exit(&s); 2750 2751 bch_err_fn(c, ret); 2752 return ret; 2753 } 2754 2755 static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter, 2756 struct bkey_s_c k, 2757 struct nlink_table *links, 2758 size_t *idx, u64 range_end) 2759 { 2760 struct bch_inode_unpacked u; 2761 struct nlink *link = &links->d[*idx]; 2762 int ret = 0; 2763 2764 if (k.k->p.offset >= range_end) 2765 return 1; 2766 2767 if (!bkey_is_inode(k.k)) 2768 return 0; 2769 2770 BUG_ON(bch2_inode_unpack(k, &u)); 2771 2772 if (S_ISDIR(u.bi_mode)) 2773 return 0; 2774 2775 if (!u.bi_nlink) 2776 return 0; 2777 2778 while ((cmp_int(link->inum, k.k->p.offset) ?: 2779 cmp_int(link->snapshot, k.k->p.snapshot)) < 0) { 2780 BUG_ON(*idx == links->nr); 2781 link = &links->d[++*idx]; 2782 } 2783 2784 if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, 2785 trans, inode_wrong_nlink, 2786 "inode %llu type %s has wrong i_nlink (%u, should be %u)", 2787 u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], 2788 bch2_inode_nlink_get(&u), link->count)) { 2789 bch2_inode_nlink_set(&u, link->count); 2790 ret = __bch2_fsck_write_inode(trans, &u, k.k->p.snapshot); 2791 } 2792 fsck_err: 2793 return ret; 2794 } 2795 2796 noinline_for_stack 2797 static int check_nlinks_update_hardlinks(struct bch_fs *c, 2798 struct nlink_table *links, 2799 u64 range_start, u64 range_end) 2800 { 2801 size_t idx = 0; 2802 2803 int ret = bch2_trans_run(c, 2804 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, 2805 POS(0, range_start), 2806 BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, 2807 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 2808 check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end))); 2809 if (ret < 0) { 2810 bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret)); 2811 return ret; 2812 } 2813 2814 return 0; 2815 } 2816 2817 int bch2_check_nlinks(struct bch_fs *c) 2818 { 2819 struct nlink_table links = { 0 }; 2820 u64 this_iter_range_start, next_iter_range_start = 0; 2821 int ret = 0; 2822 2823 do { 2824 this_iter_range_start = next_iter_range_start; 2825 next_iter_range_start = U64_MAX; 2826 2827 ret = check_nlinks_find_hardlinks(c, &links, 2828 this_iter_range_start, 2829 &next_iter_range_start); 2830 2831 ret = check_nlinks_walk_dirents(c, &links, 2832 this_iter_range_start, 2833 next_iter_range_start); 2834 if (ret) 2835 break; 2836 2837 ret = check_nlinks_update_hardlinks(c, &links, 2838 this_iter_range_start, 2839 next_iter_range_start); 2840 if (ret) 2841 break; 2842 2843 links.nr = 0; 2844 } while (next_iter_range_start != U64_MAX); 2845 2846 kvfree(links.d); 2847 bch_err_fn(c, ret); 2848 return ret; 2849 } 2850 2851 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, 2852 struct bkey_s_c k) 2853 { 2854 struct bkey_s_c_reflink_p p; 2855 struct bkey_i_reflink_p *u; 2856 2857 if (k.k->type != KEY_TYPE_reflink_p) 2858 return 0; 2859 2860 p = bkey_s_c_to_reflink_p(k); 2861 2862 if (!p.v->front_pad && !p.v->back_pad) 2863 return 0; 2864 2865 u = bch2_trans_kmalloc(trans, sizeof(*u)); 2866 int ret = PTR_ERR_OR_ZERO(u); 2867 if (ret) 2868 return ret; 2869 2870 bkey_reassemble(&u->k_i, k); 2871 u->v.front_pad = 0; 2872 u->v.back_pad = 0; 2873 2874 return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun); 2875 } 2876 2877 int bch2_fix_reflink_p(struct bch_fs *c) 2878 { 2879 if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) 2880 return 0; 2881 2882 int ret = bch2_trans_run(c, 2883 for_each_btree_key_commit(trans, iter, 2884 BTREE_ID_extents, POS_MIN, 2885 BTREE_ITER_intent|BTREE_ITER_prefetch| 2886 BTREE_ITER_all_snapshots, k, 2887 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 2888 fix_reflink_p_key(trans, &iter, k))); 2889 bch_err_fn(c, ret); 2890 return ret; 2891 } 2892