1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "bcachefs.h" 4 #include "bkey_buf.h" 5 #include "btree_cache.h" 6 #include "btree_update.h" 7 #include "buckets.h" 8 #include "darray.h" 9 #include "dirent.h" 10 #include "error.h" 11 #include "fs.h" 12 #include "fs-common.h" 13 #include "fsck.h" 14 #include "inode.h" 15 #include "keylist.h" 16 #include "recovery_passes.h" 17 #include "snapshot.h" 18 #include "super.h" 19 #include "xattr.h" 20 21 #include <linux/bsearch.h> 22 #include <linux/dcache.h> /* struct qstr */ 23 24 static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, 25 struct bkey_s_c_dirent d) 26 { 27 return inode->bi_dir == d.k->p.inode && 28 inode->bi_dir_offset == d.k->p.offset; 29 } 30 31 static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d, 32 struct bch_inode_unpacked *inode) 33 { 34 if (d.v->d_type == DT_SUBVOL 35 ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol 36 : le64_to_cpu(d.v->d_inum) == inode->bi_inum) 37 return 0; 38 return -BCH_ERR_ENOENT_dirent_doesnt_match_inode; 39 } 40 41 static void dirent_inode_mismatch_msg(struct printbuf *out, 42 struct bch_fs *c, 43 struct bkey_s_c_dirent dirent, 44 struct bch_inode_unpacked *inode) 45 { 46 prt_str(out, "inode points to dirent that does not point back:"); 47 prt_newline(out); 48 bch2_bkey_val_to_text(out, c, dirent.s_c); 49 prt_newline(out); 50 bch2_inode_unpacked_to_text(out, inode); 51 } 52 53 static int dirent_points_to_inode(struct bch_fs *c, 54 struct bkey_s_c_dirent dirent, 55 struct bch_inode_unpacked *inode) 56 { 57 int ret = dirent_points_to_inode_nowarn(dirent, inode); 58 if (ret) { 59 struct printbuf buf = PRINTBUF; 60 dirent_inode_mismatch_msg(&buf, c, dirent, inode); 61 bch_warn(c, "%s", buf.buf); 62 printbuf_exit(&buf); 63 } 64 return ret; 65 } 66 67 /* 68 * XXX: this is handling transaction restarts without returning 69 * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore: 70 */ 71 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum, 72 u32 snapshot) 73 { 74 u64 sectors = 0; 75 76 int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_extents, 77 SPOS(inum, 0, snapshot), 78 POS(inum, U64_MAX), 79 0, k, ({ 80 if (bkey_extent_is_allocation(k.k)) 81 sectors += k.k->size; 82 0; 83 })); 84 85 return ret ?: sectors; 86 } 87 88 static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum, 89 u32 snapshot) 90 { 91 u64 subdirs = 0; 92 93 int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_dirents, 94 SPOS(inum, 0, snapshot), 95 POS(inum, U64_MAX), 96 0, k, ({ 97 if (k.k->type == KEY_TYPE_dirent && 98 bkey_s_c_to_dirent(k).v->d_type == DT_DIR) 99 subdirs++; 100 0; 101 })); 102 103 return ret ?: subdirs; 104 } 105 106 static int subvol_lookup(struct btree_trans *trans, u32 subvol, 107 u32 *snapshot, u64 *inum) 108 { 109 struct bch_subvolume s; 110 int ret = bch2_subvolume_get(trans, subvol, false, 0, &s); 111 112 *snapshot = le32_to_cpu(s.snapshot); 113 *inum = le64_to_cpu(s.inode); 114 return ret; 115 } 116 117 static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, 118 struct bch_inode_unpacked *inode) 119 { 120 struct btree_iter iter; 121 struct bkey_s_c k; 122 int ret; 123 124 for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr), 125 BTREE_ITER_all_snapshots, k, ret) { 126 if (k.k->p.offset != inode_nr) 127 break; 128 if (!bkey_is_inode(k.k)) 129 continue; 130 ret = bch2_inode_unpack(k, inode); 131 goto found; 132 } 133 ret = -BCH_ERR_ENOENT_inode; 134 found: 135 bch_err_msg(trans->c, 
ret, "fetching inode %llu", inode_nr); 136 bch2_trans_iter_exit(trans, &iter); 137 return ret; 138 } 139 140 static int lookup_inode(struct btree_trans *trans, u64 inode_nr, u32 snapshot, 141 struct bch_inode_unpacked *inode) 142 { 143 struct btree_iter iter; 144 struct bkey_s_c k; 145 int ret; 146 147 k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, 148 SPOS(0, inode_nr, snapshot), 0); 149 ret = bkey_err(k); 150 if (ret) 151 goto err; 152 153 ret = bkey_is_inode(k.k) 154 ? bch2_inode_unpack(k, inode) 155 : -BCH_ERR_ENOENT_inode; 156 err: 157 bch2_trans_iter_exit(trans, &iter); 158 return ret; 159 } 160 161 static int lookup_dirent_in_snapshot(struct btree_trans *trans, 162 struct bch_hash_info hash_info, 163 subvol_inum dir, struct qstr *name, 164 u64 *target, unsigned *type, u32 snapshot) 165 { 166 struct btree_iter iter; 167 struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc, 168 &hash_info, dir, name, 0, snapshot); 169 int ret = bkey_err(k); 170 if (ret) 171 return ret; 172 173 struct bkey_s_c_dirent d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter)); 174 *target = le64_to_cpu(d.v->d_inum); 175 *type = d.v->d_type; 176 bch2_trans_iter_exit(trans, &iter); 177 return 0; 178 } 179 180 static int __remove_dirent(struct btree_trans *trans, struct bpos pos) 181 { 182 struct bch_fs *c = trans->c; 183 struct btree_iter iter; 184 struct bch_inode_unpacked dir_inode; 185 struct bch_hash_info dir_hash_info; 186 int ret; 187 188 ret = lookup_first_inode(trans, pos.inode, &dir_inode); 189 if (ret) 190 goto err; 191 192 dir_hash_info = bch2_hash_info_init(c, &dir_inode); 193 194 bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent); 195 196 ret = bch2_btree_iter_traverse(&iter) ?: 197 bch2_hash_delete_at(trans, bch2_dirent_hash_desc, 198 &dir_hash_info, &iter, 199 BTREE_UPDATE_internal_snapshot_node); 200 bch2_trans_iter_exit(trans, &iter); 201 err: 202 bch_err_fn(c, ret); 203 return ret; 204 } 205 206 /* Get lost+found, create if it doesn't exist: */ 207 static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, 208 struct bch_inode_unpacked *lostfound, 209 u64 reattaching_inum) 210 { 211 struct bch_fs *c = trans->c; 212 struct qstr lostfound_str = QSTR("lost+found"); 213 u64 inum = 0; 214 unsigned d_type = 0; 215 int ret; 216 217 struct bch_snapshot_tree st; 218 ret = bch2_snapshot_tree_lookup(trans, 219 bch2_snapshot_tree(c, snapshot), &st); 220 if (ret) 221 return ret; 222 223 subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) }; 224 225 struct bch_subvolume subvol; 226 ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol), 227 false, 0, &subvol); 228 bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u", 229 le32_to_cpu(st.master_subvol), snapshot); 230 if (ret) 231 return ret; 232 233 if (!subvol.inode) { 234 struct btree_iter iter; 235 struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter, 236 BTREE_ID_subvolumes, POS(0, le32_to_cpu(st.master_subvol)), 237 0, subvolume); 238 ret = PTR_ERR_OR_ZERO(subvol); 239 if (ret) 240 return ret; 241 242 subvol->v.inode = cpu_to_le64(reattaching_inum); 243 bch2_trans_iter_exit(trans, &iter); 244 } 245 246 root_inum.inum = le64_to_cpu(subvol.inode); 247 248 struct bch_inode_unpacked root_inode; 249 struct bch_hash_info root_hash_info; 250 ret = lookup_inode(trans, root_inum.inum, snapshot, &root_inode); 251 bch_err_msg(c, ret, "looking up root inode %llu for subvol %u", 252 root_inum.inum, le32_to_cpu(st.master_subvol)); 253 if 
(ret) 254 return ret; 255 256 root_hash_info = bch2_hash_info_init(c, &root_inode); 257 258 ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum, 259 &lostfound_str, &inum, &d_type, snapshot); 260 if (bch2_err_matches(ret, ENOENT)) 261 goto create_lostfound; 262 263 bch_err_fn(c, ret); 264 if (ret) 265 return ret; 266 267 if (d_type != DT_DIR) { 268 bch_err(c, "error looking up lost+found: not a directory"); 269 return -BCH_ERR_ENOENT_not_directory; 270 } 271 272 /* 273 * The bch2_check_dirents pass has already run, dangling dirents 274 * shouldn't exist here: 275 */ 276 ret = lookup_inode(trans, inum, snapshot, lostfound); 277 bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)", 278 inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot)); 279 return ret; 280 281 create_lostfound: 282 /* 283 * we always create lost+found in the root snapshot; we don't want 284 * different branches of the snapshot tree to have different lost+found 285 */ 286 snapshot = le32_to_cpu(st.root_snapshot); 287 /* 288 * XXX: we could have a nicer log message here if we had a nice way to 289 * walk backpointers to print a path 290 */ 291 bch_notice(c, "creating lost+found in subvol %llu snapshot %u", 292 root_inum.subvol, le32_to_cpu(st.root_snapshot)); 293 294 u64 now = bch2_current_time(c); 295 struct btree_iter lostfound_iter = { NULL }; 296 u64 cpu = raw_smp_processor_id(); 297 298 bch2_inode_init_early(c, lostfound); 299 bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode); 300 lostfound->bi_dir = root_inode.bi_inum; 301 lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot); 302 303 root_inode.bi_nlink++; 304 305 ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu); 306 if (ret) 307 goto err; 308 309 bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot); 310 ret = bch2_btree_iter_traverse(&lostfound_iter); 311 if (ret) 312 goto err; 313 314 ret = bch2_dirent_create_snapshot(trans, 315 0, root_inode.bi_inum, snapshot, &root_hash_info, 316 mode_to_type(lostfound->bi_mode), 317 &lostfound_str, 318 lostfound->bi_inum, 319 &lostfound->bi_dir_offset, 320 STR_HASH_must_create) ?: 321 bch2_inode_write_flags(trans, &lostfound_iter, lostfound, 322 BTREE_UPDATE_internal_snapshot_node); 323 err: 324 bch_err_msg(c, ret, "creating lost+found"); 325 bch2_trans_iter_exit(trans, &lostfound_iter); 326 return ret; 327 } 328 329 static inline bool inode_should_reattach(struct bch_inode_unpacked *inode) 330 { 331 if (inode->bi_inum == BCACHEFS_ROOT_INO && 332 inode->bi_subvol == BCACHEFS_ROOT_SUBVOL) 333 return false; 334 335 return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked); 336 } 337 338 static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot) 339 { 340 struct btree_iter iter; 341 struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents, 342 SPOS(d_pos.inode, d_pos.offset, snapshot), 343 BTREE_ITER_intent| 344 BTREE_ITER_with_updates); 345 int ret = bkey_err(k); 346 if (ret) 347 return ret; 348 349 if (bpos_eq(k.k->p, d_pos)) { 350 /* 351 * delet_at() doesn't work because the update path doesn't 352 * internally use BTREE_ITER_with_updates yet 353 */ 354 struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); 355 ret = PTR_ERR_OR_ZERO(k); 356 if (ret) 357 goto err; 358 359 bkey_init(&k->k); 360 k->k.type = KEY_TYPE_whiteout; 361 k->k.p = iter.pos; 362 ret = bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node); 363 } 364 err: 365 
bch2_trans_iter_exit(trans, &iter); 366 return ret; 367 } 368 369 static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode) 370 { 371 struct bch_fs *c = trans->c; 372 struct bch_inode_unpacked lostfound; 373 char name_buf[20]; 374 int ret; 375 376 u32 dirent_snapshot = inode->bi_snapshot; 377 if (inode->bi_subvol) { 378 inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL; 379 380 u64 root_inum; 381 ret = subvol_lookup(trans, inode->bi_parent_subvol, 382 &dirent_snapshot, &root_inum); 383 if (ret) 384 return ret; 385 386 snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol); 387 } else { 388 snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); 389 } 390 391 ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum); 392 if (ret) 393 return ret; 394 395 lostfound.bi_nlink += S_ISDIR(inode->bi_mode); 396 397 /* ensure lost+found inode is also present in inode snapshot */ 398 if (!inode->bi_subvol) { 399 BUG_ON(!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, lostfound.bi_snapshot)); 400 lostfound.bi_snapshot = inode->bi_snapshot; 401 } 402 403 ret = __bch2_fsck_write_inode(trans, &lostfound); 404 if (ret) 405 return ret; 406 407 struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound); 408 struct qstr name = (struct qstr) QSTR(name_buf); 409 410 inode->bi_dir = lostfound.bi_inum; 411 412 ret = bch2_dirent_create_snapshot(trans, 413 inode->bi_parent_subvol, lostfound.bi_inum, 414 dirent_snapshot, 415 &dir_hash, 416 inode_d_type(inode), 417 &name, 418 inode->bi_subvol ?: inode->bi_inum, 419 &inode->bi_dir_offset, 420 STR_HASH_must_create); 421 if (ret) { 422 bch_err_msg(c, ret, "error creating dirent"); 423 return ret; 424 } 425 426 ret = __bch2_fsck_write_inode(trans, inode); 427 if (ret) 428 return ret; 429 430 /* 431 * Fix up inodes in child snapshots: if they should also be reattached 432 * update the backpointer field, if they should not be we need to emit 433 * whiteouts for the dirent we just created. 
434 */ 435 if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) { 436 snapshot_id_list whiteouts_done; 437 struct btree_iter iter; 438 struct bkey_s_c k; 439 440 darray_init(&whiteouts_done); 441 442 for_each_btree_key_reverse_norestart(trans, iter, 443 BTREE_ID_inodes, SPOS(0, inode->bi_inum, inode->bi_snapshot - 1), 444 BTREE_ITER_all_snapshots|BTREE_ITER_intent, k, ret) { 445 if (k.k->p.offset != inode->bi_inum) 446 break; 447 448 if (!bkey_is_inode(k.k) || 449 !bch2_snapshot_is_ancestor(c, k.k->p.snapshot, inode->bi_snapshot) || 450 snapshot_list_has_ancestor(c, &whiteouts_done, k.k->p.snapshot)) 451 continue; 452 453 struct bch_inode_unpacked child_inode; 454 bch2_inode_unpack(k, &child_inode); 455 456 if (!inode_should_reattach(&child_inode)) { 457 ret = maybe_delete_dirent(trans, 458 SPOS(lostfound.bi_inum, inode->bi_dir_offset, 459 dirent_snapshot), 460 k.k->p.snapshot); 461 if (ret) 462 break; 463 464 ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot); 465 if (ret) 466 break; 467 } else { 468 iter.snapshot = k.k->p.snapshot; 469 child_inode.bi_dir = inode->bi_dir; 470 child_inode.bi_dir_offset = inode->bi_dir_offset; 471 472 ret = bch2_inode_write_flags(trans, &iter, &child_inode, 473 BTREE_UPDATE_internal_snapshot_node); 474 if (ret) 475 break; 476 } 477 } 478 darray_exit(&whiteouts_done); 479 bch2_trans_iter_exit(trans, &iter); 480 } 481 482 return ret; 483 } 484 485 static int remove_backpointer(struct btree_trans *trans, 486 struct bch_inode_unpacked *inode) 487 { 488 if (!inode->bi_dir) 489 return 0; 490 491 struct bch_fs *c = trans->c; 492 struct btree_iter iter; 493 struct bkey_s_c_dirent d = 494 bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents, 495 SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot), 0, 496 dirent); 497 int ret = bkey_err(d) ?: 498 dirent_points_to_inode(c, d, inode) ?: 499 __remove_dirent(trans, d.k->p); 500 bch2_trans_iter_exit(trans, &iter); 501 return ret; 502 } 503 504 static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s) 505 { 506 struct bch_fs *c = trans->c; 507 508 struct bch_inode_unpacked inode; 509 int ret = bch2_inode_find_by_inum_trans(trans, 510 (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) }, 511 &inode); 512 if (ret) 513 return ret; 514 515 ret = remove_backpointer(trans, &inode); 516 if (!bch2_err_matches(ret, ENOENT)) 517 bch_err_msg(c, ret, "removing dirent"); 518 if (ret) 519 return ret; 520 521 ret = reattach_inode(trans, &inode); 522 bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); 523 return ret; 524 } 525 526 static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum) 527 { 528 struct bch_fs *c = trans->c; 529 530 if (!bch2_snapshot_is_leaf(c, snapshotid)) { 531 bch_err(c, "need to reconstruct subvol, but have interior node snapshot"); 532 return -BCH_ERR_fsck_repair_unimplemented; 533 } 534 535 /* 536 * If inum isn't set, that means we're being called from check_dirents, 537 * not check_inodes - the root of this subvolume doesn't exist or we 538 * would have found it there: 539 */ 540 if (!inum) { 541 struct btree_iter inode_iter = {}; 542 struct bch_inode_unpacked new_inode; 543 u64 cpu = raw_smp_processor_id(); 544 545 bch2_inode_init_early(c, &new_inode); 546 bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL); 547 548 new_inode.bi_subvol = subvolid; 549 550 int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?: 551 
bch2_btree_iter_traverse(&inode_iter) ?: 552 bch2_inode_write(trans, &inode_iter, &new_inode); 553 bch2_trans_iter_exit(trans, &inode_iter); 554 if (ret) 555 return ret; 556 557 inum = new_inode.bi_inum; 558 } 559 560 bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum); 561 562 struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol)); 563 int ret = PTR_ERR_OR_ZERO(new_subvol); 564 if (ret) 565 return ret; 566 567 bkey_subvolume_init(&new_subvol->k_i); 568 new_subvol->k.p.offset = subvolid; 569 new_subvol->v.snapshot = cpu_to_le32(snapshotid); 570 new_subvol->v.inode = cpu_to_le64(inum); 571 ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0); 572 if (ret) 573 return ret; 574 575 struct btree_iter iter; 576 struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter, 577 BTREE_ID_snapshots, POS(0, snapshotid), 578 0, snapshot); 579 ret = PTR_ERR_OR_ZERO(s); 580 bch_err_msg(c, ret, "getting snapshot %u", snapshotid); 581 if (ret) 582 return ret; 583 584 u32 snapshot_tree = le32_to_cpu(s->v.tree); 585 586 s->v.subvol = cpu_to_le32(subvolid); 587 SET_BCH_SNAPSHOT_SUBVOL(&s->v, true); 588 bch2_trans_iter_exit(trans, &iter); 589 590 struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter, 591 BTREE_ID_snapshot_trees, POS(0, snapshot_tree), 592 0, snapshot_tree); 593 ret = PTR_ERR_OR_ZERO(st); 594 bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree); 595 if (ret) 596 return ret; 597 598 if (!st->v.master_subvol) 599 st->v.master_subvol = cpu_to_le32(subvolid); 600 601 bch2_trans_iter_exit(trans, &iter); 602 return 0; 603 } 604 605 static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum) 606 { 607 struct bch_fs *c = trans->c; 608 unsigned i_mode = S_IFREG; 609 u64 i_size = 0; 610 611 switch (btree) { 612 case BTREE_ID_extents: { 613 struct btree_iter iter = {}; 614 615 bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0); 616 struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter); 617 bch2_trans_iter_exit(trans, &iter); 618 int ret = bkey_err(k); 619 if (ret) 620 return ret; 621 622 i_size = k.k->p.offset << 9; 623 break; 624 } 625 case BTREE_ID_dirents: 626 i_mode = S_IFDIR; 627 break; 628 case BTREE_ID_xattrs: 629 break; 630 default: 631 BUG(); 632 } 633 634 struct bch_inode_unpacked new_inode; 635 bch2_inode_init_early(c, &new_inode); 636 bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL); 637 new_inode.bi_size = i_size; 638 new_inode.bi_inum = inum; 639 new_inode.bi_snapshot = snapshot; 640 641 return __bch2_fsck_write_inode(trans, &new_inode); 642 } 643 644 struct snapshots_seen { 645 struct bpos pos; 646 snapshot_id_list ids; 647 }; 648 649 static inline void snapshots_seen_exit(struct snapshots_seen *s) 650 { 651 darray_exit(&s->ids); 652 } 653 654 static inline void snapshots_seen_init(struct snapshots_seen *s) 655 { 656 memset(s, 0, sizeof(*s)); 657 } 658 659 static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id) 660 { 661 u32 *i; 662 __darray_for_each(s->ids, i) { 663 if (*i == id) 664 return 0; 665 if (*i > id) 666 break; 667 } 668 669 int ret = darray_insert_item(&s->ids, i - s->ids.data, id); 670 if (ret) 671 bch_err(c, "error reallocating snapshots_seen table (size %zu)", 672 s->ids.size); 673 return ret; 674 } 675 676 static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, 677 enum btree_id btree_id, struct bpos 
pos)
{
	if (!bkey_eq(s->pos, pos))
		s->ids.nr = 0;
	s->pos = pos;

	return snapshot_list_add_nodup(c, &s->ids, pos.snapshot);
}

/**
 * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
 * and @ancestor hasn't been overwritten in @seen
 *
 * @c:		filesystem handle
 * @seen:	list of snapshot ids already seen at current position
 * @id:		descendent snapshot id
 * @ancestor:	ancestor snapshot id
 *
 * Returns:	whether key in @ancestor snapshot is visible in @id snapshot
 */
static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
				    u32 id, u32 ancestor)
{
	ssize_t i;

	EBUG_ON(id > ancestor);

	/* @ancestor should be the snapshot most recently added to @seen */
	EBUG_ON(ancestor != seen->pos.snapshot);
	EBUG_ON(ancestor != darray_last(seen->ids));

	if (id == ancestor)
		return true;

	if (!bch2_snapshot_is_ancestor(c, id, ancestor))
		return false;

	/*
	 * We know that @id is a descendant of @ancestor, we're checking if
	 * we've seen a key that overwrote @ancestor - i.e. also a descendent of
	 * @ancestor and with @id as a descendent.
	 *
	 * But we already know that we're scanning IDs between @id and @ancestor
	 * numerically, since snapshot ID lists are kept sorted, so if we find
	 * an id that's an ancestor of @id we're done:
	 */

	for (i = seen->ids.nr - 2;
	     i >= 0 && seen->ids.data[i] >= id;
	     --i)
		if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]))
			return false;

	return true;
}

/**
 * ref_visible - given a key with snapshot id @src that points to a key with
 * snapshot id @dst, test whether there is some snapshot in which @dst is
 * visible.
 *
 * @c:		filesystem handle
 * @s:		list of snapshot IDs already seen at @src
 * @src:	snapshot ID of src key
 * @dst:	snapshot ID of dst key
 * Returns:	true if there is some snapshot in which @dst is visible
 *
 * Assumes we're visiting @src keys in natural key order
 */
static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
			u32 src, u32 dst)
{
	return dst <= src
		?
key_visible_in_snapshot(c, s, dst, src) 751 : bch2_snapshot_is_ancestor(c, src, dst); 752 } 753 754 static int ref_visible2(struct bch_fs *c, 755 u32 src, struct snapshots_seen *src_seen, 756 u32 dst, struct snapshots_seen *dst_seen) 757 { 758 if (dst > src) { 759 swap(dst, src); 760 swap(dst_seen, src_seen); 761 } 762 return key_visible_in_snapshot(c, src_seen, dst, src); 763 } 764 765 #define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ 766 for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \ 767 (_i)->snapshot <= (_snapshot); _i++) \ 768 if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot)) 769 770 struct inode_walker_entry { 771 struct bch_inode_unpacked inode; 772 u32 snapshot; 773 u64 count; 774 }; 775 776 struct inode_walker { 777 bool first_this_inode; 778 bool have_inodes; 779 bool recalculate_sums; 780 struct bpos last_pos; 781 782 DARRAY(struct inode_walker_entry) inodes; 783 }; 784 785 static void inode_walker_exit(struct inode_walker *w) 786 { 787 darray_exit(&w->inodes); 788 } 789 790 static struct inode_walker inode_walker_init(void) 791 { 792 return (struct inode_walker) { 0, }; 793 } 794 795 static int add_inode(struct bch_fs *c, struct inode_walker *w, 796 struct bkey_s_c inode) 797 { 798 struct bch_inode_unpacked u; 799 800 BUG_ON(bch2_inode_unpack(inode, &u)); 801 802 return darray_push(&w->inodes, ((struct inode_walker_entry) { 803 .inode = u, 804 .snapshot = inode.k->p.snapshot, 805 })); 806 } 807 808 static int get_inodes_all_snapshots(struct btree_trans *trans, 809 struct inode_walker *w, u64 inum) 810 { 811 struct bch_fs *c = trans->c; 812 struct btree_iter iter; 813 struct bkey_s_c k; 814 int ret; 815 816 /* 817 * We no longer have inodes for w->last_pos; clear this to avoid 818 * screwing up check_i_sectors/check_subdir_count if we take a 819 * transaction restart here: 820 */ 821 w->have_inodes = false; 822 w->recalculate_sums = false; 823 w->inodes.nr = 0; 824 825 for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), 826 BTREE_ITER_all_snapshots, k, ret) { 827 if (k.k->p.offset != inum) 828 break; 829 830 if (bkey_is_inode(k.k)) 831 add_inode(c, w, k); 832 } 833 bch2_trans_iter_exit(trans, &iter); 834 835 if (ret) 836 return ret; 837 838 w->first_this_inode = true; 839 w->have_inodes = true; 840 return 0; 841 } 842 843 static struct inode_walker_entry * 844 lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k) 845 { 846 bool is_whiteout = k.k->type == KEY_TYPE_whiteout; 847 848 struct inode_walker_entry *i; 849 __darray_for_each(w->inodes, i) 850 if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot)) 851 goto found; 852 853 return NULL; 854 found: 855 BUG_ON(k.k->p.snapshot > i->snapshot); 856 857 if (k.k->p.snapshot != i->snapshot && !is_whiteout) { 858 struct inode_walker_entry new = *i; 859 860 new.snapshot = k.k->p.snapshot; 861 new.count = 0; 862 863 struct printbuf buf = PRINTBUF; 864 bch2_bkey_val_to_text(&buf, c, k); 865 866 bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n" 867 "unexpected because we should always update the inode when we update a key in that inode\n" 868 "%s", 869 w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf); 870 printbuf_exit(&buf); 871 872 while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot) 873 --i; 874 875 size_t pos = i - w->inodes.data; 876 int ret = darray_insert_item(&w->inodes, pos, new); 877 if (ret) 878 return ERR_PTR(ret); 879 880 i = w->inodes.data + pos; 881 } 882 883 
return i; 884 } 885 886 static struct inode_walker_entry *walk_inode(struct btree_trans *trans, 887 struct inode_walker *w, 888 struct bkey_s_c k) 889 { 890 if (w->last_pos.inode != k.k->p.inode) { 891 int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode); 892 if (ret) 893 return ERR_PTR(ret); 894 } 895 896 w->last_pos = k.k->p; 897 898 return lookup_inode_for_snapshot(trans->c, w, k); 899 } 900 901 static int get_visible_inodes(struct btree_trans *trans, 902 struct inode_walker *w, 903 struct snapshots_seen *s, 904 u64 inum) 905 { 906 struct bch_fs *c = trans->c; 907 struct btree_iter iter; 908 struct bkey_s_c k; 909 int ret; 910 911 w->inodes.nr = 0; 912 913 for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), 914 BTREE_ITER_all_snapshots, k, ret) { 915 if (k.k->p.offset != inum) 916 break; 917 918 if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) 919 continue; 920 921 if (bkey_is_inode(k.k)) 922 add_inode(c, w, k); 923 924 if (k.k->p.snapshot >= s->pos.snapshot) 925 break; 926 } 927 bch2_trans_iter_exit(trans, &iter); 928 929 return ret; 930 } 931 932 static int hash_redo_key(struct btree_trans *trans, 933 const struct bch_hash_desc desc, 934 struct bch_hash_info *hash_info, 935 struct btree_iter *k_iter, struct bkey_s_c k) 936 { 937 struct bkey_i *delete; 938 struct bkey_i *tmp; 939 940 delete = bch2_trans_kmalloc(trans, sizeof(*delete)); 941 if (IS_ERR(delete)) 942 return PTR_ERR(delete); 943 944 tmp = bch2_bkey_make_mut_noupdate(trans, k); 945 if (IS_ERR(tmp)) 946 return PTR_ERR(tmp); 947 948 bkey_init(&delete->k); 949 delete->k.p = k_iter->pos; 950 return bch2_btree_iter_traverse(k_iter) ?: 951 bch2_trans_update(trans, k_iter, delete, 0) ?: 952 bch2_hash_set_in_snapshot(trans, desc, hash_info, 953 (subvol_inum) { 0, k.k->p.inode }, 954 k.k->p.snapshot, tmp, 955 STR_HASH_must_create| 956 BTREE_UPDATE_internal_snapshot_node) ?: 957 bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); 958 } 959 960 static int hash_check_key(struct btree_trans *trans, 961 const struct bch_hash_desc desc, 962 struct bch_hash_info *hash_info, 963 struct btree_iter *k_iter, struct bkey_s_c hash_k) 964 { 965 struct bch_fs *c = trans->c; 966 struct btree_iter iter = { NULL }; 967 struct printbuf buf = PRINTBUF; 968 struct bkey_s_c k; 969 u64 hash; 970 int ret = 0; 971 972 if (hash_k.k->type != desc.key_type) 973 return 0; 974 975 hash = desc.hash_bkey(hash_info, hash_k); 976 977 if (likely(hash == hash_k.k->p.offset)) 978 return 0; 979 980 if (hash_k.k->p.offset < hash) 981 goto bad_hash; 982 983 for_each_btree_key_norestart(trans, iter, desc.btree_id, 984 SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), 985 BTREE_ITER_slots, k, ret) { 986 if (bkey_eq(k.k->p, hash_k.k->p)) 987 break; 988 989 if (fsck_err_on(k.k->type == desc.key_type && 990 !desc.cmp_bkey(k, hash_k), 991 trans, hash_table_key_duplicate, 992 "duplicate hash table keys:\n%s", 993 (printbuf_reset(&buf), 994 bch2_bkey_val_to_text(&buf, c, hash_k), 995 buf.buf))) { 996 ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1; 997 break; 998 } 999 1000 if (bkey_deleted(k.k)) { 1001 bch2_trans_iter_exit(trans, &iter); 1002 goto bad_hash; 1003 } 1004 } 1005 out: 1006 bch2_trans_iter_exit(trans, &iter); 1007 printbuf_exit(&buf); 1008 return ret; 1009 bad_hash: 1010 if (fsck_err(trans, hash_table_key_wrong_offset, 1011 "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s", 1012 bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, 1013 
(printbuf_reset(&buf), 1014 bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { 1015 ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); 1016 bch_err_fn(c, ret); 1017 if (ret) 1018 return ret; 1019 ret = -BCH_ERR_transaction_restart_nested; 1020 } 1021 fsck_err: 1022 goto out; 1023 } 1024 1025 static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, 1026 struct btree_iter *iter, 1027 struct bpos pos) 1028 { 1029 return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); 1030 } 1031 1032 static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans, 1033 struct btree_iter *iter, 1034 struct bch_inode_unpacked *inode, 1035 u32 *snapshot) 1036 { 1037 if (inode->bi_subvol) { 1038 u64 inum; 1039 int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum); 1040 if (ret) 1041 return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) }); 1042 } 1043 1044 return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot)); 1045 } 1046 1047 static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p) 1048 { 1049 struct btree_iter iter; 1050 struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0); 1051 int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set; 1052 bch2_trans_iter_exit(trans, &iter); 1053 return ret; 1054 } 1055 1056 static int check_inode_dirent_inode(struct btree_trans *trans, 1057 struct bch_inode_unpacked *inode, 1058 bool *write_inode) 1059 { 1060 struct bch_fs *c = trans->c; 1061 struct printbuf buf = PRINTBUF; 1062 1063 u32 inode_snapshot = inode->bi_snapshot; 1064 struct btree_iter dirent_iter = {}; 1065 struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot); 1066 int ret = bkey_err(d); 1067 if (ret && !bch2_err_matches(ret, ENOENT)) 1068 return ret; 1069 1070 if (fsck_err_on(ret, 1071 trans, inode_points_to_missing_dirent, 1072 "inode points to missing dirent\n%s", 1073 (bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) || 1074 fsck_err_on(!ret && dirent_points_to_inode_nowarn(d, inode), 1075 trans, inode_points_to_wrong_dirent, 1076 "%s", 1077 (printbuf_reset(&buf), 1078 dirent_inode_mismatch_msg(&buf, c, d, inode), 1079 buf.buf))) { 1080 /* 1081 * We just clear the backpointer fields for now. If we find a 1082 * dirent that points to this inode in check_dirents(), we'll 1083 * update it then; then when we get to check_path() if the 1084 * backpointer is still 0 we'll reattach it. 
1085 */ 1086 inode->bi_dir = 0; 1087 inode->bi_dir_offset = 0; 1088 *write_inode = true; 1089 } 1090 1091 ret = 0; 1092 fsck_err: 1093 bch2_trans_iter_exit(trans, &dirent_iter); 1094 printbuf_exit(&buf); 1095 bch_err_fn(c, ret); 1096 return ret; 1097 } 1098 1099 static int check_inode(struct btree_trans *trans, 1100 struct btree_iter *iter, 1101 struct bkey_s_c k, 1102 struct bch_inode_unpacked *prev, 1103 struct snapshots_seen *s) 1104 { 1105 struct bch_fs *c = trans->c; 1106 struct printbuf buf = PRINTBUF; 1107 struct bch_inode_unpacked u; 1108 bool do_update = false; 1109 int ret; 1110 1111 ret = bch2_check_key_has_snapshot(trans, iter, k); 1112 if (ret < 0) 1113 goto err; 1114 if (ret) 1115 return 0; 1116 1117 ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); 1118 if (ret) 1119 goto err; 1120 1121 if (!bkey_is_inode(k.k)) 1122 return 0; 1123 1124 BUG_ON(bch2_inode_unpack(k, &u)); 1125 1126 if (prev->bi_inum != u.bi_inum) 1127 *prev = u; 1128 1129 if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed || 1130 inode_d_type(prev) != inode_d_type(&u), 1131 trans, inode_snapshot_mismatch, 1132 "inodes in different snapshots don't match")) { 1133 bch_err(c, "repair not implemented yet"); 1134 ret = -BCH_ERR_fsck_repair_unimplemented; 1135 goto err_noprint; 1136 } 1137 1138 if (u.bi_dir || u.bi_dir_offset) { 1139 ret = check_inode_dirent_inode(trans, &u, &do_update); 1140 if (ret) 1141 goto err; 1142 } 1143 1144 if (fsck_err_on(u.bi_dir && (u.bi_flags & BCH_INODE_unlinked), 1145 trans, inode_unlinked_but_has_dirent, 1146 "inode unlinked but has dirent\n%s", 1147 (printbuf_reset(&buf), 1148 bch2_inode_unpacked_to_text(&buf, &u), 1149 buf.buf))) { 1150 u.bi_flags &= ~BCH_INODE_unlinked; 1151 do_update = true; 1152 } 1153 1154 if (S_ISDIR(u.bi_mode) && (u.bi_flags & BCH_INODE_unlinked)) { 1155 /* Check for this early so that check_unreachable_inode() will reattach it */ 1156 1157 ret = bch2_empty_dir_snapshot(trans, k.k->p.offset, 0, k.k->p.snapshot); 1158 if (ret && ret != -BCH_ERR_ENOTEMPTY_dir_not_empty) 1159 goto err; 1160 1161 fsck_err_on(ret, trans, inode_dir_unlinked_but_not_empty, 1162 "dir unlinked but not empty\n%s", 1163 (printbuf_reset(&buf), 1164 bch2_inode_unpacked_to_text(&buf, &u), 1165 buf.buf)); 1166 u.bi_flags &= ~BCH_INODE_unlinked; 1167 do_update = true; 1168 ret = 0; 1169 } 1170 1171 ret = bch2_inode_has_child_snapshots(trans, k.k->p); 1172 if (ret < 0) 1173 goto err; 1174 1175 if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot), 1176 trans, inode_has_child_snapshots_wrong, 1177 "inode has_child_snapshots flag wrong (should be %u)\n%s", 1178 ret, 1179 (printbuf_reset(&buf), 1180 bch2_inode_unpacked_to_text(&buf, &u), 1181 buf.buf))) { 1182 if (ret) 1183 u.bi_flags |= BCH_INODE_has_child_snapshot; 1184 else 1185 u.bi_flags &= ~BCH_INODE_has_child_snapshot; 1186 do_update = true; 1187 } 1188 ret = 0; 1189 1190 if ((u.bi_flags & BCH_INODE_unlinked) && 1191 !(u.bi_flags & BCH_INODE_has_child_snapshot)) { 1192 if (!test_bit(BCH_FS_started, &c->flags)) { 1193 /* 1194 * If we're not in online fsck, don't delete unlinked 1195 * inodes, just make sure they're on the deleted list. 1196 * 1197 * They might be referred to by a logged operation - 1198 * i.e. we might have crashed in the middle of a 1199 * truncate on an unlinked but open file - so we want to 1200 * let the delete_dead_inodes kill it after resuming 1201 * logged ops. 
			 */
			ret = check_inode_deleted_list(trans, k.k->p);
			if (ret < 0)
				goto err_noprint;

			fsck_err_on(!ret,
				    trans, unlinked_inode_not_on_deleted_list,
				    "inode %llu:%u unlinked, but not on deleted list",
				    u.bi_inum, k.k->p.snapshot);

			ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, k.k->p, 1);
			if (ret)
				goto err;
		} else {
			ret = bch2_inode_or_descendents_is_open(trans, k.k->p);
			if (ret < 0)
				goto err;

			if (fsck_err_on(!ret,
					trans, inode_unlinked_and_not_open,
					"inode %llu:%u unlinked and not open",
					u.bi_inum, u.bi_snapshot)) {
				ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
				bch_err_msg(c, ret, "in fsck deleting inode");
				goto err_noprint;
			}
			ret = 0;
		}
	}

	if (fsck_err_on(u.bi_parent_subvol &&
			(u.bi_subvol == 0 ||
			 u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
			trans, inode_bi_parent_nonzero,
			"inode %llu:%u has subvol %u but nonzero parent subvol %u",
			u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
		u.bi_parent_subvol = 0;
		do_update = true;
	}

	if (u.bi_subvol) {
		struct bch_subvolume s;

		ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s);
		if (ret && !bch2_err_matches(ret, ENOENT))
			goto err;

		if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
			ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum);
			goto do_update;
		}

		if (fsck_err_on(ret,
				trans, inode_bi_subvol_missing,
				"inode %llu:%u bi_subvol points to missing subvolume %u",
				u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
		    fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
				!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
							   k.k->p.snapshot),
				trans, inode_bi_subvol_wrong,
				"inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
				u.bi_inum, k.k->p.snapshot, u.bi_subvol,
				le64_to_cpu(s.inode),
				le32_to_cpu(s.snapshot))) {
			u.bi_subvol = 0;
			u.bi_parent_subvol = 0;
			do_update = true;
		}
	}
do_update:
	if (do_update) {
		ret = __bch2_fsck_write_inode(trans, &u);
		bch_err_msg(c, ret, "in fsck updating inode");
		if (ret)
			goto err_noprint;
	}
err:
fsck_err:
	bch_err_fn(c, ret);
err_noprint:
	printbuf_exit(&buf);
	return ret;
}

int bch2_check_inodes(struct bch_fs *c)
{
	struct bch_inode_unpacked prev = { 0 };
	struct snapshots_seen s;

	snapshots_seen_init(&s);

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
				POS_MIN,
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_inode(trans, &iter, k, &prev, &s)));

	snapshots_seen_exit(&s);
	bch_err_fn(c, ret);
	return ret;
}

static int find_oldest_inode_needs_reattach(struct btree_trans *trans,
					    struct bch_inode_unpacked *inode)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	/*
	 * We look for inodes to reattach in natural key order, leaves first,
	 * but we should do the reattach at the oldest version that needs to be
	 * reattached:
	 */
	for_each_btree_key_norestart(trans, iter,
				     BTREE_ID_inodes,
				     SPOS(0, inode->bi_inum, inode->bi_snapshot + 1),
				     BTREE_ITER_all_snapshots,
k, ret) { 1322 if (k.k->p.offset != inode->bi_inum) 1323 break; 1324 1325 if (!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, k.k->p.snapshot)) 1326 continue; 1327 1328 if (!bkey_is_inode(k.k)) 1329 break; 1330 1331 struct bch_inode_unpacked parent_inode; 1332 bch2_inode_unpack(k, &parent_inode); 1333 1334 if (!inode_should_reattach(&parent_inode)) 1335 break; 1336 1337 *inode = parent_inode; 1338 } 1339 bch2_trans_iter_exit(trans, &iter); 1340 1341 return ret; 1342 } 1343 1344 static int check_unreachable_inode(struct btree_trans *trans, 1345 struct btree_iter *iter, 1346 struct bkey_s_c k) 1347 { 1348 struct printbuf buf = PRINTBUF; 1349 int ret = 0; 1350 1351 if (!bkey_is_inode(k.k)) 1352 return 0; 1353 1354 struct bch_inode_unpacked inode; 1355 BUG_ON(bch2_inode_unpack(k, &inode)); 1356 1357 if (!inode_should_reattach(&inode)) 1358 return 0; 1359 1360 ret = find_oldest_inode_needs_reattach(trans, &inode); 1361 if (ret) 1362 return ret; 1363 1364 if (fsck_err(trans, inode_unreachable, 1365 "unreachable inode:\n%s", 1366 (bch2_inode_unpacked_to_text(&buf, &inode), 1367 buf.buf))) 1368 ret = reattach_inode(trans, &inode); 1369 fsck_err: 1370 printbuf_exit(&buf); 1371 return ret; 1372 } 1373 1374 /* 1375 * Reattach unreachable (but not unlinked) inodes 1376 * 1377 * Run after check_inodes() and check_dirents(), so we node that inode 1378 * backpointer fields point to valid dirents, and every inode that has a dirent 1379 * that points to it has its backpointer field set - so we're just looking for 1380 * non-unlinked inodes without backpointers: 1381 * 1382 * XXX: this is racy w.r.t. hardlink removal in online fsck 1383 */ 1384 int bch2_check_unreachable_inodes(struct bch_fs *c) 1385 { 1386 int ret = bch2_trans_run(c, 1387 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, 1388 POS_MIN, 1389 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, 1390 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 1391 check_unreachable_inode(trans, &iter, k))); 1392 bch_err_fn(c, ret); 1393 return ret; 1394 } 1395 1396 static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode) 1397 { 1398 switch (btree) { 1399 case BTREE_ID_extents: 1400 return S_ISREG(mode) || S_ISLNK(mode); 1401 case BTREE_ID_dirents: 1402 return S_ISDIR(mode); 1403 case BTREE_ID_xattrs: 1404 return true; 1405 default: 1406 BUG(); 1407 } 1408 } 1409 1410 static int check_key_has_inode(struct btree_trans *trans, 1411 struct btree_iter *iter, 1412 struct inode_walker *inode, 1413 struct inode_walker_entry *i, 1414 struct bkey_s_c k) 1415 { 1416 struct bch_fs *c = trans->c; 1417 struct printbuf buf = PRINTBUF; 1418 int ret = PTR_ERR_OR_ZERO(i); 1419 if (ret) 1420 return ret; 1421 1422 if (k.k->type == KEY_TYPE_whiteout) 1423 goto out; 1424 1425 if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) { 1426 ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?: 1427 bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); 1428 if (ret) 1429 goto err; 1430 1431 inode->last_pos.inode--; 1432 ret = -BCH_ERR_transaction_restart_nested; 1433 goto err; 1434 } 1435 1436 if (fsck_err_on(!i, 1437 trans, key_in_missing_inode, 1438 "key in missing inode:\n %s", 1439 (printbuf_reset(&buf), 1440 bch2_bkey_val_to_text(&buf, c, k), buf.buf))) 1441 goto delete; 1442 1443 if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode), 1444 trans, key_in_wrong_inode_type, 1445 "key for wrong inode mode %o:\n %s", 1446 i->inode.bi_mode, 1447 (printbuf_reset(&buf), 1448 
bch2_bkey_val_to_text(&buf, c, k), buf.buf))) 1449 goto delete; 1450 out: 1451 err: 1452 fsck_err: 1453 printbuf_exit(&buf); 1454 bch_err_fn(c, ret); 1455 return ret; 1456 delete: 1457 ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node); 1458 goto out; 1459 } 1460 1461 static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w) 1462 { 1463 struct bch_fs *c = trans->c; 1464 int ret = 0; 1465 s64 count2; 1466 1467 darray_for_each(w->inodes, i) { 1468 if (i->inode.bi_sectors == i->count) 1469 continue; 1470 1471 count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot); 1472 1473 if (w->recalculate_sums) 1474 i->count = count2; 1475 1476 if (i->count != count2) { 1477 bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", 1478 w->last_pos.inode, i->snapshot, i->count, count2); 1479 return -BCH_ERR_internal_fsck_err; 1480 } 1481 1482 if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty), 1483 trans, inode_i_sectors_wrong, 1484 "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", 1485 w->last_pos.inode, i->snapshot, 1486 i->inode.bi_sectors, i->count)) { 1487 i->inode.bi_sectors = i->count; 1488 ret = bch2_fsck_write_inode(trans, &i->inode); 1489 if (ret) 1490 break; 1491 } 1492 } 1493 fsck_err: 1494 bch_err_fn(c, ret); 1495 return ret; 1496 } 1497 1498 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) 1499 { 1500 u32 restart_count = trans->restart_count; 1501 return check_i_sectors_notnested(trans, w) ?: 1502 trans_was_restarted(trans, restart_count); 1503 } 1504 1505 struct extent_end { 1506 u32 snapshot; 1507 u64 offset; 1508 struct snapshots_seen seen; 1509 }; 1510 1511 struct extent_ends { 1512 struct bpos last_pos; 1513 DARRAY(struct extent_end) e; 1514 }; 1515 1516 static void extent_ends_reset(struct extent_ends *extent_ends) 1517 { 1518 darray_for_each(extent_ends->e, i) 1519 snapshots_seen_exit(&i->seen); 1520 extent_ends->e.nr = 0; 1521 } 1522 1523 static void extent_ends_exit(struct extent_ends *extent_ends) 1524 { 1525 extent_ends_reset(extent_ends); 1526 darray_exit(&extent_ends->e); 1527 } 1528 1529 static void extent_ends_init(struct extent_ends *extent_ends) 1530 { 1531 memset(extent_ends, 0, sizeof(*extent_ends)); 1532 } 1533 1534 static int extent_ends_at(struct bch_fs *c, 1535 struct extent_ends *extent_ends, 1536 struct snapshots_seen *seen, 1537 struct bkey_s_c k) 1538 { 1539 struct extent_end *i, n = (struct extent_end) { 1540 .offset = k.k->p.offset, 1541 .snapshot = k.k->p.snapshot, 1542 .seen = *seen, 1543 }; 1544 1545 n.seen.ids.data = kmemdup(seen->ids.data, 1546 sizeof(seen->ids.data[0]) * seen->ids.size, 1547 GFP_KERNEL); 1548 if (!n.seen.ids.data) 1549 return -BCH_ERR_ENOMEM_fsck_extent_ends_at; 1550 1551 __darray_for_each(extent_ends->e, i) { 1552 if (i->snapshot == k.k->p.snapshot) { 1553 snapshots_seen_exit(&i->seen); 1554 *i = n; 1555 return 0; 1556 } 1557 1558 if (i->snapshot >= k.k->p.snapshot) 1559 break; 1560 } 1561 1562 return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n); 1563 } 1564 1565 static int overlapping_extents_found(struct btree_trans *trans, 1566 enum btree_id btree, 1567 struct bpos pos1, struct snapshots_seen *pos1_seen, 1568 struct bkey pos2, 1569 bool *fixed, 1570 struct extent_end *extent_end) 1571 { 1572 struct bch_fs *c = trans->c; 1573 struct printbuf buf = PRINTBUF; 1574 struct btree_iter iter1, iter2 = { NULL }; 1575 struct bkey_s_c k1, k2; 1576 int ret; 
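	/*
	 * We have two extents that overlap: re-lookup both of them (pos1 came
	 * from extent_ends, pos2 is the key currently being checked), pick
	 * which one to overwrite based on their snapshot IDs, then do the
	 * overwrite and commit before returning to check_extent().
	 */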
	BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));

	bch2_trans_iter_init(trans, &iter1, btree, pos1,
			     BTREE_ITER_all_snapshots|
			     BTREE_ITER_not_extents);
	k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX));
	ret = bkey_err(k1);
	if (ret)
		goto err;

	prt_str(&buf, "\n ");
	bch2_bkey_val_to_text(&buf, c, k1);

	if (!bpos_eq(pos1, k1.k->p)) {
		prt_str(&buf, "\n wanted\n ");
		bch2_bpos_to_text(&buf, pos1);
		prt_str(&buf, "\n ");
		bch2_bkey_to_text(&buf, &pos2);

		bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
			__func__, buf.buf);
		ret = -BCH_ERR_internal_fsck_err;
		goto err;
	}

	bch2_trans_copy_iter(&iter2, &iter1);

	while (1) {
		bch2_btree_iter_advance(&iter2);

		k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX));
		ret = bkey_err(k2);
		if (ret)
			goto err;

		if (bpos_ge(k2.k->p, pos2.p))
			break;
	}

	prt_str(&buf, "\n ");
	bch2_bkey_val_to_text(&buf, c, k2);

	if (bpos_gt(k2.k->p, pos2.p) ||
	    pos2.size != k2.k->size) {
		bch_err(c, "%s: error finding second overlapping extent when repairing%s",
			__func__, buf.buf);
		ret = -BCH_ERR_internal_fsck_err;
		goto err;
	}

	prt_printf(&buf, "\n overwriting %s extent",
		   pos1.snapshot >= pos2.p.snapshot ? "first" : "second");

	if (fsck_err(trans, extent_overlapping,
		     "overlapping extents%s", buf.buf)) {
		struct btree_iter *old_iter = &iter1;
		struct disk_reservation res = { 0 };

		if (pos1.snapshot < pos2.p.snapshot) {
			old_iter = &iter2;
			swap(k1, k2);
		}

		trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);

		ret = bch2_trans_update_extent_overwrite(trans, old_iter,
				BTREE_UPDATE_internal_snapshot_node,
				k1, k2) ?:
			bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
		bch2_disk_reservation_put(c, &res);

		if (ret)
			goto err;

		*fixed = true;

		if (pos1.snapshot == pos2.p.snapshot) {
			/*
			 * We overwrote the first extent, and did the overwrite
			 * in the same snapshot:
			 */
			extent_end->offset = bkey_start_offset(&pos2);
		} else if (pos1.snapshot > pos2.p.snapshot) {
			/*
			 * We overwrote the first extent in pos2's snapshot:
			 */
			ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
		} else {
			/*
			 * We overwrote the second extent - restart
			 * check_extent() from the top:
			 */
			ret = -BCH_ERR_transaction_restart_nested;
		}
	}
fsck_err:
err:
	bch2_trans_iter_exit(trans, &iter2);
	bch2_trans_iter_exit(trans, &iter1);
	printbuf_exit(&buf);
	return ret;
}

static int check_overlapping_extents(struct btree_trans *trans,
				     struct snapshots_seen *seen,
				     struct extent_ends *extent_ends,
				     struct bkey_s_c k,
				     struct btree_iter *iter,
				     bool *fixed)
{
	struct bch_fs *c = trans->c;
	int ret = 0;

	/* transaction restart, running again */
	if (bpos_eq(extent_ends->last_pos, k.k->p))
		return 0;

	if (extent_ends->last_pos.inode != k.k->p.inode)
		extent_ends_reset(extent_ends);

	darray_for_each(extent_ends->e, i) {
		if (i->offset <= bkey_start_offset(k.k))
			continue;

		if (!ref_visible2(c,
				  k.k->p.snapshot, seen,
				  i->snapshot, &i->seen))
			continue;

		ret =
overlapping_extents_found(trans, iter->btree_id, 1708 SPOS(iter->pos.inode, 1709 i->offset, 1710 i->snapshot), 1711 &i->seen, 1712 *k.k, fixed, i); 1713 if (ret) 1714 goto err; 1715 } 1716 1717 extent_ends->last_pos = k.k->p; 1718 err: 1719 return ret; 1720 } 1721 1722 static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter, 1723 struct bkey_s_c k) 1724 { 1725 struct bch_fs *c = trans->c; 1726 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 1727 struct bch_extent_crc_unpacked crc; 1728 const union bch_extent_entry *i; 1729 unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9; 1730 1731 bkey_for_each_crc(k.k, ptrs, crc, i) 1732 if (crc_is_encoded(crc) && 1733 crc.uncompressed_size > encoded_extent_max_sectors) { 1734 struct printbuf buf = PRINTBUF; 1735 1736 bch2_bkey_val_to_text(&buf, c, k); 1737 bch_err(c, "overbig encoded extent, please report this:\n %s", buf.buf); 1738 printbuf_exit(&buf); 1739 } 1740 1741 return 0; 1742 } 1743 1744 static int check_extent(struct btree_trans *trans, struct btree_iter *iter, 1745 struct bkey_s_c k, 1746 struct inode_walker *inode, 1747 struct snapshots_seen *s, 1748 struct extent_ends *extent_ends, 1749 struct disk_reservation *res) 1750 { 1751 struct bch_fs *c = trans->c; 1752 struct printbuf buf = PRINTBUF; 1753 int ret = 0; 1754 1755 ret = bch2_check_key_has_snapshot(trans, iter, k); 1756 if (ret) { 1757 ret = ret < 0 ? ret : 0; 1758 goto out; 1759 } 1760 1761 if (inode->last_pos.inode != k.k->p.inode && inode->have_inodes) { 1762 ret = check_i_sectors(trans, inode); 1763 if (ret) 1764 goto err; 1765 } 1766 1767 ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); 1768 if (ret) 1769 goto err; 1770 1771 struct inode_walker_entry *extent_i = walk_inode(trans, inode, k); 1772 ret = PTR_ERR_OR_ZERO(extent_i); 1773 if (ret) 1774 goto err; 1775 1776 ret = check_key_has_inode(trans, iter, inode, extent_i, k); 1777 if (ret) 1778 goto err; 1779 1780 if (k.k->type != KEY_TYPE_whiteout) { 1781 ret = check_overlapping_extents(trans, s, extent_ends, k, iter, 1782 &inode->recalculate_sums); 1783 if (ret) 1784 goto err; 1785 1786 /* 1787 * Check inodes in reverse order, from oldest snapshots to 1788 * newest, starting from the inode that matches this extent's 1789 * snapshot. 
If we didn't have one, iterate over all inodes: 1790 */ 1791 for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); 1792 inode->inodes.data && i >= inode->inodes.data; 1793 --i) { 1794 if (i->snapshot > k.k->p.snapshot || 1795 !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) 1796 continue; 1797 1798 if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && 1799 !bkey_extent_is_reservation(k), 1800 trans, extent_past_end_of_inode, 1801 "extent type past end of inode %llu:%u, i_size %llu\n %s", 1802 i->inode.bi_inum, i->snapshot, i->inode.bi_size, 1803 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { 1804 struct btree_iter iter2; 1805 1806 bch2_trans_copy_iter(&iter2, iter); 1807 bch2_btree_iter_set_snapshot(&iter2, i->snapshot); 1808 ret = bch2_btree_iter_traverse(&iter2) ?: 1809 bch2_btree_delete_at(trans, &iter2, 1810 BTREE_UPDATE_internal_snapshot_node); 1811 bch2_trans_iter_exit(trans, &iter2); 1812 if (ret) 1813 goto err; 1814 1815 iter->k.type = KEY_TYPE_whiteout; 1816 break; 1817 } 1818 } 1819 } 1820 1821 ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc); 1822 if (ret) 1823 goto err; 1824 1825 if (bkey_extent_is_allocation(k.k)) { 1826 for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); 1827 inode->inodes.data && i >= inode->inodes.data; 1828 --i) { 1829 if (i->snapshot > k.k->p.snapshot || 1830 !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) 1831 continue; 1832 1833 i->count += k.k->size; 1834 } 1835 } 1836 1837 if (k.k->type != KEY_TYPE_whiteout) { 1838 ret = extent_ends_at(c, extent_ends, s, k); 1839 if (ret) 1840 goto err; 1841 } 1842 out: 1843 err: 1844 fsck_err: 1845 printbuf_exit(&buf); 1846 bch_err_fn(c, ret); 1847 return ret; 1848 } 1849 1850 /* 1851 * Walk extents: verify that extents have a corresponding S_ISREG inode, and 1852 * that i_size an i_sectors are consistent 1853 */ 1854 int bch2_check_extents(struct bch_fs *c) 1855 { 1856 struct inode_walker w = inode_walker_init(); 1857 struct snapshots_seen s; 1858 struct extent_ends extent_ends; 1859 struct disk_reservation res = { 0 }; 1860 1861 snapshots_seen_init(&s); 1862 extent_ends_init(&extent_ends); 1863 1864 int ret = bch2_trans_run(c, 1865 for_each_btree_key(trans, iter, BTREE_ID_extents, 1866 POS(BCACHEFS_ROOT_INO, 0), 1867 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ 1868 bch2_disk_reservation_put(c, &res); 1869 check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?: 1870 check_extent_overbig(trans, &iter, k); 1871 })) ?: 1872 check_i_sectors_notnested(trans, &w)); 1873 1874 bch2_disk_reservation_put(c, &res); 1875 extent_ends_exit(&extent_ends); 1876 inode_walker_exit(&w); 1877 snapshots_seen_exit(&s); 1878 1879 bch_err_fn(c, ret); 1880 return ret; 1881 } 1882 1883 int bch2_check_indirect_extents(struct bch_fs *c) 1884 { 1885 struct disk_reservation res = { 0 }; 1886 1887 int ret = bch2_trans_run(c, 1888 for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, 1889 POS_MIN, 1890 BTREE_ITER_prefetch, k, 1891 &res, NULL, 1892 BCH_TRANS_COMMIT_no_enospc, ({ 1893 bch2_disk_reservation_put(c, &res); 1894 check_extent_overbig(trans, &iter, k); 1895 }))); 1896 1897 bch2_disk_reservation_put(c, &res); 1898 bch_err_fn(c, ret); 1899 return ret; 1900 } 1901 1902 static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w) 1903 { 1904 struct bch_fs *c = trans->c; 1905 int ret = 0; 1906 s64 count2; 1907 1908 darray_for_each(w->inodes, i) { 1909 if (i->inode.bi_nlink 
== i->count) 1910 continue; 1911 1912 count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot); 1913 if (count2 < 0) 1914 return count2; 1915 1916 if (i->count != count2) { 1917 bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu", 1918 w->last_pos.inode, i->snapshot, i->count, count2); 1919 i->count = count2; 1920 if (i->inode.bi_nlink == i->count) 1921 continue; 1922 } 1923 1924 if (fsck_err_on(i->inode.bi_nlink != i->count, 1925 trans, inode_dir_wrong_nlink, 1926 "directory %llu:%u with wrong i_nlink: got %u, should be %llu", 1927 w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) { 1928 i->inode.bi_nlink = i->count; 1929 ret = bch2_fsck_write_inode(trans, &i->inode); 1930 if (ret) 1931 break; 1932 } 1933 } 1934 fsck_err: 1935 bch_err_fn(c, ret); 1936 return ret; 1937 } 1938 1939 static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) 1940 { 1941 u32 restart_count = trans->restart_count; 1942 return check_subdir_count_notnested(trans, w) ?: 1943 trans_was_restarted(trans, restart_count); 1944 } 1945 1946 noinline_for_stack 1947 static int check_dirent_inode_dirent(struct btree_trans *trans, 1948 struct btree_iter *iter, 1949 struct bkey_s_c_dirent d, 1950 struct bch_inode_unpacked *target) 1951 { 1952 struct bch_fs *c = trans->c; 1953 struct printbuf buf = PRINTBUF; 1954 struct btree_iter bp_iter = { NULL }; 1955 int ret = 0; 1956 1957 if (inode_points_to_dirent(target, d)) 1958 return 0; 1959 1960 if (!target->bi_dir && 1961 !target->bi_dir_offset) { 1962 fsck_err_on(S_ISDIR(target->bi_mode), 1963 trans, inode_dir_missing_backpointer, 1964 "directory with missing backpointer\n%s", 1965 (printbuf_reset(&buf), 1966 bch2_bkey_val_to_text(&buf, c, d.s_c), 1967 prt_printf(&buf, "\n"), 1968 bch2_inode_unpacked_to_text(&buf, target), 1969 buf.buf)); 1970 1971 fsck_err_on(target->bi_flags & BCH_INODE_unlinked, 1972 trans, inode_unlinked_but_has_dirent, 1973 "inode unlinked but has dirent\n%s", 1974 (printbuf_reset(&buf), 1975 bch2_bkey_val_to_text(&buf, c, d.s_c), 1976 prt_printf(&buf, "\n"), 1977 bch2_inode_unpacked_to_text(&buf, target), 1978 buf.buf)); 1979 1980 target->bi_flags &= ~BCH_INODE_unlinked; 1981 target->bi_dir = d.k->p.inode; 1982 target->bi_dir_offset = d.k->p.offset; 1983 return __bch2_fsck_write_inode(trans, target); 1984 } 1985 1986 if (bch2_inode_should_have_bp(target) && 1987 !fsck_err(trans, inode_wrong_backpointer, 1988 "dirent points to inode that does not point back:\n %s", 1989 (bch2_bkey_val_to_text(&buf, c, d.s_c), 1990 prt_printf(&buf, "\n "), 1991 bch2_inode_unpacked_to_text(&buf, target), 1992 buf.buf))) 1993 goto err; 1994 1995 struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter, 1996 SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot)); 1997 ret = bkey_err(bp_dirent); 1998 if (ret && !bch2_err_matches(ret, ENOENT)) 1999 goto err; 2000 2001 bool backpointer_exists = !ret; 2002 ret = 0; 2003 2004 if (fsck_err_on(!backpointer_exists, 2005 trans, inode_wrong_backpointer, 2006 "inode %llu:%u has wrong backpointer:\n" 2007 "got %llu:%llu\n" 2008 "should be %llu:%llu", 2009 target->bi_inum, target->bi_snapshot, 2010 target->bi_dir, 2011 target->bi_dir_offset, 2012 d.k->p.inode, 2013 d.k->p.offset)) { 2014 target->bi_dir = d.k->p.inode; 2015 target->bi_dir_offset = d.k->p.offset; 2016 ret = __bch2_fsck_write_inode(trans, target); 2017 goto out; 2018 } 2019 2020 bch2_bkey_val_to_text(&buf, c, d.s_c); 2021 prt_newline(&buf); 2022 if 
noinline_for_stack
static int check_dirent_inode_dirent(struct btree_trans *trans,
				     struct btree_iter *iter,
				     struct bkey_s_c_dirent d,
				     struct bch_inode_unpacked *target)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	struct btree_iter bp_iter = { NULL };
	int ret = 0;

	if (inode_points_to_dirent(target, d))
		return 0;

	if (!target->bi_dir &&
	    !target->bi_dir_offset) {
		fsck_err_on(S_ISDIR(target->bi_mode),
			    trans, inode_dir_missing_backpointer,
			    "directory with missing backpointer\n%s",
			    (printbuf_reset(&buf),
			     bch2_bkey_val_to_text(&buf, c, d.s_c),
			     prt_printf(&buf, "\n"),
			     bch2_inode_unpacked_to_text(&buf, target),
			     buf.buf));

		fsck_err_on(target->bi_flags & BCH_INODE_unlinked,
			    trans, inode_unlinked_but_has_dirent,
			    "inode unlinked but has dirent\n%s",
			    (printbuf_reset(&buf),
			     bch2_bkey_val_to_text(&buf, c, d.s_c),
			     prt_printf(&buf, "\n"),
			     bch2_inode_unpacked_to_text(&buf, target),
			     buf.buf));

		target->bi_flags &= ~BCH_INODE_unlinked;
		target->bi_dir = d.k->p.inode;
		target->bi_dir_offset = d.k->p.offset;
		return __bch2_fsck_write_inode(trans, target);
	}

	if (bch2_inode_should_have_bp(target) &&
	    !fsck_err(trans, inode_wrong_backpointer,
		      "dirent points to inode that does not point back:\n %s",
		      (bch2_bkey_val_to_text(&buf, c, d.s_c),
		       prt_printf(&buf, "\n "),
		       bch2_inode_unpacked_to_text(&buf, target),
		       buf.buf)))
		goto err;

	struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
			SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot));
	ret = bkey_err(bp_dirent);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	bool backpointer_exists = !ret;
	ret = 0;

	if (fsck_err_on(!backpointer_exists,
			trans, inode_wrong_backpointer,
			"inode %llu:%u has wrong backpointer:\n"
			"got %llu:%llu\n"
			"should be %llu:%llu",
			target->bi_inum, target->bi_snapshot,
			target->bi_dir,
			target->bi_dir_offset,
			d.k->p.inode,
			d.k->p.offset)) {
		target->bi_dir = d.k->p.inode;
		target->bi_dir_offset = d.k->p.offset;
		ret = __bch2_fsck_write_inode(trans, target);
		goto out;
	}

	bch2_bkey_val_to_text(&buf, c, d.s_c);
	prt_newline(&buf);
	if (backpointer_exists)
		bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);

	if (fsck_err_on(backpointer_exists &&
			(S_ISDIR(target->bi_mode) ||
			 target->bi_subvol),
			trans, inode_dir_multiple_links,
			"%s %llu:%u with multiple links\n%s",
			S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
			target->bi_inum, target->bi_snapshot, buf.buf)) {
		ret = __remove_dirent(trans, d.k->p);
		goto out;
	}

	/*
	 * hardlinked file with nlink 0:
	 * We're just adjusting nlink here so check_nlinks() will pick
	 * it up, it ignores inodes with nlink 0
	 */
	if (fsck_err_on(backpointer_exists && !target->bi_nlink,
			trans, inode_multiple_links_but_nlink_0,
			"inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
			target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
		target->bi_nlink++;
		target->bi_flags &= ~BCH_INODE_unlinked;
		ret = __bch2_fsck_write_inode(trans, target);
		if (ret)
			goto err;
	}
out:
err:
fsck_err:
	bch2_trans_iter_exit(trans, &bp_iter);
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}

noinline_for_stack
static int check_dirent_target(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct bkey_s_c_dirent d,
			       struct bch_inode_unpacked *target)
{
	struct bch_fs *c = trans->c;
	struct bkey_i_dirent *n;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	ret = check_dirent_inode_dirent(trans, iter, d, target);
	if (ret)
		goto err;

	if (fsck_err_on(d.v->d_type != inode_d_type(target),
			trans, dirent_d_type_wrong,
			"incorrect d_type: got %s, should be %s:\n%s",
			bch2_d_type_str(d.v->d_type),
			bch2_d_type_str(inode_d_type(target)),
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
		n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
		ret = PTR_ERR_OR_ZERO(n);
		if (ret)
			goto err;

		bkey_reassemble(&n->k_i, d.s_c);
		n->v.d_type = inode_d_type(target);
		if (n->v.d_type == DT_SUBVOL) {
			n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
			n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
		} else {
			n->v.d_inum = cpu_to_le64(target->bi_inum);
		}

		ret = bch2_trans_update(trans, iter, &n->k_i, 0);
		if (ret)
			goto err;

		d = dirent_i_to_s_c(n);
	}
err:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}

/* find a subvolume that's a descendant of @snapshot: */
static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) {
		if (k.k->type != KEY_TYPE_subvolume)
			continue;

		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
		if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) {
			bch2_trans_iter_exit(trans, &iter);
			*subvolid = k.k->p.offset;
			goto found;
		}
	}
	if (!ret)
		ret = -ENOENT;
found:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
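
/*
 * DT_SUBVOL dirents: verify that the parent subvolume exists and that this
 * dirent is visible in it, that the target subvolume exists, and that the
 * target's fs_path_parent and root inode's bi_parent_subvol point back at
 * the parent:
 */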
noinline_for_stack
static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
				  struct bkey_s_c_dirent d)
{
	struct bch_fs *c = trans->c;
	struct btree_iter subvol_iter = {};
	struct bch_inode_unpacked subvol_root;
	u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
	u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
	u32 parent_snapshot;
	u32 new_parent_subvol = 0;
	u64 parent_inum;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (ret ||
	    (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) {
		int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
		if (ret2 && !bch2_err_matches(ret2, ENOENT))
			return ret2;
	}

	if (ret &&
	    !new_parent_subvol &&
	    (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
		/*
		 * Couldn't find a subvol for dirent's snapshot - but we lost
		 * subvols, so we need to reconstruct:
		 */
		ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0);
		if (ret)
			return ret;

		parent_snapshot = d.k->p.snapshot;
	}

	if (fsck_err_on(ret,
			trans, dirent_to_missing_parent_subvol,
			"dirent parent_subvol points to missing subvolume\n%s",
			(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
	    fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
			trans, dirent_not_visible_in_parent_subvol,
			"dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
			parent_snapshot,
			(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
		if (!new_parent_subvol) {
			bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
			return -BCH_ERR_fsck_repair_unimplemented;
		}

		struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
		ret = PTR_ERR_OR_ZERO(new_dirent);
		if (ret)
			goto err;

		new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol);
	}

	struct bkey_s_c_subvolume s =
		bch2_bkey_get_iter_typed(trans, &subvol_iter,
					 BTREE_ID_subvolumes, POS(0, target_subvol),
					 0, subvolume);
	ret = bkey_err(s.s_c);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (ret) {
		if (fsck_err(trans, dirent_to_missing_subvol,
			     "dirent points to missing subvolume\n%s",
			     (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
			return __remove_dirent(trans, d.k->p);
		ret = 0;
		goto out;
	}

	if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol,
			trans, subvol_fs_path_parent_wrong,
			"subvol with wrong fs_path_parent, should be %u\n%s",
			parent_subvol,
			(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
		struct bkey_i_subvolume *n =
			bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
		ret = PTR_ERR_OR_ZERO(n);
		if (ret)
			goto err;

		n->v.fs_path_parent = cpu_to_le32(parent_subvol);
	}

	u64 target_inum = le64_to_cpu(s.v->inode);
	u32 target_snapshot = le32_to_cpu(s.v->snapshot);

	ret = lookup_inode(trans, target_inum, target_snapshot, &subvol_root);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	if (ret) {
		bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
		ret = -BCH_ERR_fsck_repair_unimplemented;
		goto err;
	}

	if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol,
			trans, inode_bi_parent_wrong,
			"subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
			target_inum,
			subvol_root.bi_parent_subvol, parent_subvol)) {
		subvol_root.bi_parent_subvol = parent_subvol;
		subvol_root.bi_snapshot = le32_to_cpu(s.v->snapshot);
		ret = __bch2_fsck_write_inode(trans, &subvol_root);
		if (ret)
			goto err;
	}

	ret = check_dirent_target(trans, iter, d, &subvol_root);
	if (ret)
		goto err;
out:
err:
fsck_err:
	bch2_trans_iter_exit(trans, &subvol_iter);
	printbuf_exit(&buf);
	return ret;
}
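
/*
 * Per-dirent pass: check snapshot validity and hash table placement, then
 * verify the target - subvolume or inode - exists and points back at this
 * dirent:
 */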
static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
			struct bkey_s_c k,
			struct bch_hash_info *hash_info,
			struct inode_walker *dir,
			struct inode_walker *target,
			struct snapshots_seen *s)
{
	struct bch_fs *c = trans->c;
	struct inode_walker_entry *i;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret) {
		ret = ret < 0 ? ret : 0;
		goto out;
	}

	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
	if (ret)
		goto err;

	if (k.k->type == KEY_TYPE_whiteout)
		goto out;

	if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) {
		ret = check_subdir_count(trans, dir);
		if (ret)
			goto err;
	}

	i = walk_inode(trans, dir, k);
	ret = PTR_ERR_OR_ZERO(i);
	if (ret < 0)
		goto err;

	ret = check_key_has_inode(trans, iter, dir, i, k);
	if (ret)
		goto err;

	if (!i)
		goto out;

	if (dir->first_this_inode)
		*hash_info = bch2_hash_info_init(c, &i->inode);
	dir->first_this_inode = false;

	ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k);
	if (ret < 0)
		goto err;
	if (ret) {
		/* dirent has been deleted */
		ret = 0;
		goto out;
	}

	if (k.k->type != KEY_TYPE_dirent)
		goto out;

	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);

	if (d.v->d_type == DT_SUBVOL) {
		ret = check_dirent_to_subvol(trans, iter, d);
		if (ret)
			goto err;
	} else {
		ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
		if (ret)
			goto err;

		if (fsck_err_on(!target->inodes.nr,
				trans, dirent_to_missing_inode,
				"dirent points to missing inode:\n%s",
				(printbuf_reset(&buf),
				 bch2_bkey_val_to_text(&buf, c, k),
				 buf.buf))) {
			ret = __remove_dirent(trans, d.k->p);
			if (ret)
				goto err;
		}

		darray_for_each(target->inodes, i) {
			ret = check_dirent_target(trans, iter, d, &i->inode);
			if (ret)
				goto err;
		}
	}

	ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
	if (ret)
		goto err;

	if (d.v->d_type == DT_DIR)
		for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
			i->count++;
out:
err:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
 * validate d_type
 */
int bch2_check_dirents(struct bch_fs *c)
{
	struct inode_walker dir = inode_walker_init();
	struct inode_walker target = inode_walker_init();
	struct snapshots_seen s;
	struct bch_hash_info hash_info;

	snapshots_seen_init(&s);

	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_dirents,
				POS(BCACHEFS_ROOT_INO, 0),
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
			check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
		check_subdir_count_notnested(trans, &dir));

	snapshots_seen_exit(&s);
	inode_walker_exit(&dir);
	inode_walker_exit(&target);
	bch_err_fn(c, ret);
	return ret;
}

static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
		       struct bkey_s_c k,
		       struct bch_hash_info *hash_info,
		       struct inode_walker *inode)
{
	struct bch_fs *c = trans->c;
	struct inode_walker_entry *i;
	int ret;

	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret < 0)
		return ret;
	if (ret)
		return 0;

	i = walk_inode(trans, inode, k);
	ret = PTR_ERR_OR_ZERO(i);
	if (ret)
		return ret;

	ret = check_key_has_inode(trans, iter, inode, i, k);
	if (ret)
		return ret;

	if (!i)
		return 0;

	if (inode->first_this_inode)
		*hash_info = bch2_hash_info_init(c, &i->inode);
	inode->first_this_inode = false;

	ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Walk xattrs: verify that they all have a corresponding inode
 */
int bch2_check_xattrs(struct bch_fs *c)
{
	struct inode_walker inode = inode_walker_init();
	struct bch_hash_info hash_info;
	int ret = 0;

	ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
			POS(BCACHEFS_ROOT_INO, 0),
			BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
			k,
			NULL, NULL,
			BCH_TRANS_COMMIT_no_enospc,
			check_xattr(trans, &iter, k, &hash_info, &inode)));

	inode_walker_exit(&inode);
	bch_err_fn(c, ret);
	return ret;
}
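
/*
 * Check that the root subvolume and the root directory inode exist,
 * recreating them if missing:
 */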
static int check_root_trans(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct bch_inode_unpacked root_inode;
	u32 snapshot;
	u64 inum;
	int ret;

	ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (mustfix_fsck_err_on(ret, trans, root_subvol_missing,
				"root subvol missing")) {
		struct bkey_i_subvolume *root_subvol =
			bch2_trans_kmalloc(trans, sizeof(*root_subvol));
		ret = PTR_ERR_OR_ZERO(root_subvol);
		if (ret)
			goto err;

		snapshot = U32_MAX;
		inum = BCACHEFS_ROOT_INO;

		bkey_subvolume_init(&root_subvol->k_i);
		root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL;
		root_subvol->v.flags	= 0;
		root_subvol->v.snapshot	= cpu_to_le32(snapshot);
		root_subvol->v.inode	= cpu_to_le64(inum);
		ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0);
		bch_err_msg(c, ret, "writing root subvol");
		if (ret)
			goto err;
	}

	ret = lookup_inode(trans, BCACHEFS_ROOT_INO, snapshot, &root_inode);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (mustfix_fsck_err_on(ret,
				trans, root_dir_missing,
				"root directory missing") ||
	    mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
				trans, root_inode_not_dir,
				"root inode not a directory")) {
		bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
				0, NULL);
		root_inode.bi_inum = inum;
		root_inode.bi_snapshot = snapshot;

		ret = __bch2_fsck_write_inode(trans, &root_inode);
		bch_err_msg(c, ret, "writing root inode");
	}
err:
fsck_err:
	return ret;
}

/* Get root directory, create if it doesn't exist: */
int bch2_check_root(struct bch_fs *c)
{
	int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_root_trans(trans));
	bch_err_fn(c, ret);
	return ret;
}

typedef DARRAY(u32) darray_u32;

static bool darray_u32_has(darray_u32 *d, u32 v)
{
	darray_for_each(*d, i)
		if (*i == v)
			return true;
	return false;
}

static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct btree_iter parent_iter = {};
	darray_u32 subvol_path = {};
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (k.k->type != KEY_TYPE_subvolume)
		return 0;

	while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) {
		ret = darray_push(&subvol_path, k.k->p.offset);
		if (ret)
			goto err;

		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);

		struct bch_inode_unpacked subvol_root;
		ret = bch2_inode_find_by_inum_trans(trans,
					(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
					&subvol_root);
		if (ret)
			break;

		u32 parent = le32_to_cpu(s.v->fs_path_parent);

		if (darray_u32_has(&subvol_path, parent)) {
			if (fsck_err(trans, subvol_loop, "subvolume loop"))
				ret = reattach_subvol(trans, s);
			break;
		}

		bch2_trans_iter_exit(trans, &parent_iter);
		bch2_trans_iter_init(trans, &parent_iter,
				     BTREE_ID_subvolumes, POS(0, parent), 0);
		k = bch2_btree_iter_peek_slot(&parent_iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
				trans, subvol_unreachable,
				"unreachable subvolume %s",
				(bch2_bkey_val_to_text(&buf, c, s.s_c),
				 buf.buf))) {
			ret = reattach_subvol(trans, s);
			break;
		}
	}
fsck_err:
err:
	printbuf_exit(&buf);
	darray_exit(&subvol_path);
	bch2_trans_iter_exit(trans, &parent_iter);
	return ret;
}

int bch2_check_subvolume_structure(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
				BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_subvol_path(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}

struct pathbuf_entry {
	u64	inum;
	u32	snapshot;
};

typedef DARRAY(struct pathbuf_entry) pathbuf;

static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
{
	darray_for_each(*p, i)
		if (i->inum == inum &&
		    i->snapshot == snapshot)
			return true;
	return false;
}
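
/*
 * Walk upwards from an inode towards the root via bi_dir backpointers,
 * using the path accumulated in @p to detect directory structure loops:
 */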
static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k)
{
	struct bch_fs *c = trans->c;
	struct btree_iter inode_iter = {};
	struct bch_inode_unpacked inode;
	struct printbuf buf = PRINTBUF;
	u32 snapshot = inode_k.k->p.snapshot;
	int ret = 0;

	p->nr = 0;

	BUG_ON(bch2_inode_unpack(inode_k, &inode));

	if (!S_ISDIR(inode.bi_mode))
		return 0;

	while (!inode.bi_subvol) {
		struct btree_iter dirent_iter;
		struct bkey_s_c_dirent d;
		u32 parent_snapshot = snapshot;

		d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot);
		ret = bkey_err(d.s_c);
		if (ret && !bch2_err_matches(ret, ENOENT))
			break;

		if (!ret && (ret = dirent_points_to_inode(c, d, &inode)))
			bch2_trans_iter_exit(trans, &dirent_iter);

		if (bch2_err_matches(ret, ENOENT)) {
			printbuf_reset(&buf);
			bch2_bkey_val_to_text(&buf, c, inode_k);
			bch_err(c, "unreachable inode in check_directory_structure: %s\n%s",
				bch2_err_str(ret), buf.buf);
			goto out;
		}

		bch2_trans_iter_exit(trans, &dirent_iter);

		ret = darray_push(p, ((struct pathbuf_entry) {
			.inum		= inode.bi_inum,
			.snapshot	= snapshot,
		}));
		if (ret)
			return ret;

		snapshot = parent_snapshot;

		bch2_trans_iter_exit(trans, &inode_iter);
		inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
					     SPOS(0, inode.bi_dir, snapshot), 0);
		ret = bkey_err(inode_k) ?:
			!bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
			: bch2_inode_unpack(inode_k, &inode);
		if (ret) {
			/* Should have been caught in dirents pass */
			bch_err_msg(c, ret, "error looking up parent directory");
			break;
		}

		snapshot = inode_k.k->p.snapshot;

		if (path_is_dup(p, inode.bi_inum, snapshot)) {
			/* XXX print path */
			bch_err(c, "directory structure loop");

			darray_for_each(*p, i)
				pr_err("%llu:%u", i->inum, i->snapshot);
			pr_err("%llu:%u", inode.bi_inum, snapshot);

			if (fsck_err(trans, dir_loop, "directory structure loop")) {
				ret = remove_backpointer(trans, &inode);
				bch_err_msg(c, ret, "removing dirent");
				if (ret)
					break;

				ret = reattach_inode(trans, &inode);
				bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
			}
			break;
		}
	}
out:
fsck_err:
	bch2_trans_iter_exit(trans, &inode_iter);
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Check for loops in the directory structure: all other connectivity issues
 * have been fixed by prior passes
 */
int bch2_check_directory_structure(struct bch_fs *c)
{
	pathbuf path = { 0, };
	int ret;

	ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
					  BTREE_ITER_intent|
					  BTREE_ITER_prefetch|
					  BTREE_ITER_all_snapshots, k,
					  NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
			if (!bkey_is_inode(k.k))
				continue;

			if (bch2_inode_flags(k) & BCH_INODE_unlinked)
				continue;

			check_path(trans, &path, k);
		})));
	darray_exit(&path);

	bch_err_fn(c, ret);
	return ret;
}

struct nlink_table {
	size_t		nr;
	size_t		size;

	struct nlink {
		u64	inum;
		u32	snapshot;
		u32	count;
	} *d;
};

static int add_nlink(struct bch_fs *c, struct nlink_table *t,
		     u64 inum, u32 snapshot)
{
	if (t->nr == t->size) {
		size_t new_size = max_t(size_t, 128UL, t->size * 2);
		void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);

		if (!d) {
			bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
				new_size);
			return -BCH_ERR_ENOMEM_fsck_add_nlink;
		}

		if (t->d)
			memcpy(d, t->d, t->size * sizeof(t->d[0]));
		kvfree(t->d);

		t->d = d;
		t->size = new_size;
	}

	t->d[t->nr++] = (struct nlink) {
		.inum		= inum,
		.snapshot	= snapshot,
	};

	return 0;
}
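
/*
 * The nlink table is kept sorted by inode number: look up the first entry
 * for an inum with bsearch, rewind to the first snapshot variant, then walk
 * forward bumping the count of each link visible from the dirent's snapshot:
 */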
static int nlink_cmp(const void *_l, const void *_r)
{
	const struct nlink *l = _l;
	const struct nlink *r = _r;

	return cmp_int(l->inum, r->inum);
}

static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
		     struct nlink_table *links,
		     u64 range_start, u64 range_end, u64 inum, u32 snapshot)
{
	struct nlink *link, key = {
		.inum = inum, .snapshot = U32_MAX,
	};

	if (inum < range_start || inum >= range_end)
		return;

	link = __inline_bsearch(&key, links->d, links->nr,
				sizeof(links->d[0]), nlink_cmp);
	if (!link)
		return;

	while (link > links->d && link[0].inum == link[-1].inum)
		--link;

	for (; link < links->d + links->nr && link->inum == inum; link++)
		if (ref_visible(c, s, snapshot, link->snapshot)) {
			link->count++;
			if (link->snapshot >= snapshot)
				break;
		}
}

noinline_for_stack
static int check_nlinks_find_hardlinks(struct bch_fs *c,
				       struct nlink_table *t,
				       u64 start, u64 *end)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_inodes,
				   POS(0, start),
				   BTREE_ITER_intent|
				   BTREE_ITER_prefetch|
				   BTREE_ITER_all_snapshots, k, ({
			if (!bkey_is_inode(k.k))
				continue;

			/* Should never fail, checked by bch2_inode_invalid: */
			struct bch_inode_unpacked u;
			BUG_ON(bch2_inode_unpack(k, &u));

			/*
			 * Backpointer and directory structure checks are sufficient for
			 * directories, since they can't have hardlinks:
			 */
			if (S_ISDIR(u.bi_mode))
				continue;

			/*
			 * Previous passes ensured that bi_nlink is nonzero if
			 * it had multiple hardlinks:
			 */
			if (!u.bi_nlink)
				continue;

			ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
			if (ret) {
				*end = k.k->p.offset;
				ret = 0;
				break;
			}
			0;
		})));

	bch_err_fn(c, ret);
	return ret;
}

noinline_for_stack
static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
				     u64 range_start, u64 range_end)
{
	struct snapshots_seen s;

	snapshots_seen_init(&s);

	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
				   BTREE_ITER_intent|
				   BTREE_ITER_prefetch|
				   BTREE_ITER_all_snapshots, k, ({
			ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
			if (ret)
				break;

			if (k.k->type == KEY_TYPE_dirent) {
				struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);

				if (d.v->d_type != DT_DIR &&
				    d.v->d_type != DT_SUBVOL)
					inc_link(c, &s, links, range_start, range_end,
						 le64_to_cpu(d.v->d_inum), d.k->p.snapshot);
			}
			0;
		})));

	snapshots_seen_exit(&s);

	bch_err_fn(c, ret);
	return ret;
}
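
/*
 * Final pass: walk inodes again, advancing through the sorted link table in
 * lockstep and fixing any i_nlink that doesn't match the counted links:
 */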
static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
				     struct bkey_s_c k,
				     struct nlink_table *links,
				     size_t *idx, u64 range_end)
{
	struct bch_inode_unpacked u;
	struct nlink *link = &links->d[*idx];
	int ret = 0;

	if (k.k->p.offset >= range_end)
		return 1;

	if (!bkey_is_inode(k.k))
		return 0;

	BUG_ON(bch2_inode_unpack(k, &u));

	if (S_ISDIR(u.bi_mode))
		return 0;

	if (!u.bi_nlink)
		return 0;

	while ((cmp_int(link->inum, k.k->p.offset) ?:
		cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
		BUG_ON(*idx == links->nr);
		link = &links->d[++*idx];
	}

	if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
			trans, inode_wrong_nlink,
			"inode %llu type %s has wrong i_nlink (%u, should be %u)",
			u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
			bch2_inode_nlink_get(&u), link->count)) {
		bch2_inode_nlink_set(&u, link->count);
		ret = __bch2_fsck_write_inode(trans, &u);
	}
fsck_err:
	return ret;
}

noinline_for_stack
static int check_nlinks_update_hardlinks(struct bch_fs *c,
					 struct nlink_table *links,
					 u64 range_start, u64 range_end)
{
	size_t idx = 0;

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
				POS(0, range_start),
				BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
	if (ret < 0) {
		bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
		return ret;
	}

	return 0;
}

int bch2_check_nlinks(struct bch_fs *c)
{
	struct nlink_table links = { 0 };
	u64 this_iter_range_start, next_iter_range_start = 0;
	int ret = 0;

	do {
		this_iter_range_start = next_iter_range_start;
		next_iter_range_start = U64_MAX;

		ret = check_nlinks_find_hardlinks(c, &links,
						  this_iter_range_start,
						  &next_iter_range_start);

		ret = check_nlinks_walk_dirents(c, &links,
						this_iter_range_start,
						next_iter_range_start);
		if (ret)
			break;

		ret = check_nlinks_update_hardlinks(c, &links,
						    this_iter_range_start,
						    next_iter_range_start);
		if (ret)
			break;

		links.nr = 0;
	} while (next_iter_range_start != U64_MAX);

	kvfree(links.d);
	bch_err_fn(c, ret);
	return ret;
}

static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
			     struct bkey_s_c k)
{
	struct bkey_s_c_reflink_p p;
	struct bkey_i_reflink_p *u;

	if (k.k->type != KEY_TYPE_reflink_p)
		return 0;

	p = bkey_s_c_to_reflink_p(k);

	if (!p.v->front_pad && !p.v->back_pad)
		return 0;

	u = bch2_trans_kmalloc(trans, sizeof(*u));
	int ret = PTR_ERR_OR_ZERO(u);
	if (ret)
		return ret;

	bkey_reassemble(&u->k_i, k);
	u->v.front_pad	= 0;
	u->v.back_pad	= 0;

	return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun);
}

int bch2_fix_reflink_p(struct bch_fs *c)
{
	if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
		return 0;

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
				BTREE_ID_extents, POS_MIN,
				BTREE_ITER_intent|BTREE_ITER_prefetch|
				BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			fix_reflink_p_key(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}