1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "bcachefs.h" 4 #include "bkey_buf.h" 5 #include "btree_cache.h" 6 #include "btree_update.h" 7 #include "buckets.h" 8 #include "darray.h" 9 #include "dirent.h" 10 #include "error.h" 11 #include "fs.h" 12 #include "fs-common.h" 13 #include "fsck.h" 14 #include "inode.h" 15 #include "keylist.h" 16 #include "recovery_passes.h" 17 #include "snapshot.h" 18 #include "super.h" 19 #include "xattr.h" 20 21 #include <linux/bsearch.h> 22 #include <linux/dcache.h> /* struct qstr */ 23 24 static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, 25 struct bkey_s_c_dirent d) 26 { 27 return inode->bi_dir == d.k->p.inode && 28 inode->bi_dir_offset == d.k->p.offset; 29 } 30 31 static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d, 32 struct bch_inode_unpacked *inode) 33 { 34 if (d.v->d_type == DT_SUBVOL 35 ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol 36 : le64_to_cpu(d.v->d_inum) == inode->bi_inum) 37 return 0; 38 return -BCH_ERR_ENOENT_dirent_doesnt_match_inode; 39 } 40 41 static void dirent_inode_mismatch_msg(struct printbuf *out, 42 struct bch_fs *c, 43 struct bkey_s_c_dirent dirent, 44 struct bch_inode_unpacked *inode) 45 { 46 prt_str(out, "inode points to dirent that does not point back:"); 47 prt_newline(out); 48 bch2_bkey_val_to_text(out, c, dirent.s_c); 49 prt_newline(out); 50 bch2_inode_unpacked_to_text(out, inode); 51 } 52 53 static int dirent_points_to_inode(struct bch_fs *c, 54 struct bkey_s_c_dirent dirent, 55 struct bch_inode_unpacked *inode) 56 { 57 int ret = dirent_points_to_inode_nowarn(dirent, inode); 58 if (ret) { 59 struct printbuf buf = PRINTBUF; 60 dirent_inode_mismatch_msg(&buf, c, dirent, inode); 61 bch_warn(c, "%s", buf.buf); 62 printbuf_exit(&buf); 63 } 64 return ret; 65 } 66 67 /* 68 * XXX: this is handling transaction restarts without returning 69 * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore: 70 */ 71 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum, 72 u32 snapshot) 73 { 74 u64 sectors = 0; 75 76 int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_extents, 77 SPOS(inum, 0, snapshot), 78 POS(inum, U64_MAX), 79 0, k, ({ 80 if (bkey_extent_is_allocation(k.k)) 81 sectors += k.k->size; 82 0; 83 })); 84 85 return ret ?: sectors; 86 } 87 88 static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum, 89 u32 snapshot) 90 { 91 u64 subdirs = 0; 92 93 int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_dirents, 94 SPOS(inum, 0, snapshot), 95 POS(inum, U64_MAX), 96 0, k, ({ 97 if (k.k->type == KEY_TYPE_dirent && 98 bkey_s_c_to_dirent(k).v->d_type == DT_DIR) 99 subdirs++; 100 0; 101 })); 102 103 return ret ?: subdirs; 104 } 105 106 static int subvol_lookup(struct btree_trans *trans, u32 subvol, 107 u32 *snapshot, u64 *inum) 108 { 109 struct bch_subvolume s; 110 int ret = bch2_subvolume_get(trans, subvol, false, 0, &s); 111 112 *snapshot = le32_to_cpu(s.snapshot); 113 *inum = le64_to_cpu(s.inode); 114 return ret; 115 } 116 117 static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, 118 struct bch_inode_unpacked *inode) 119 { 120 struct btree_iter iter; 121 struct bkey_s_c k; 122 int ret; 123 124 for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr), 125 BTREE_ITER_all_snapshots, k, ret) { 126 if (k.k->p.offset != inode_nr) 127 break; 128 if (!bkey_is_inode(k.k)) 129 continue; 130 ret = bch2_inode_unpack(k, inode); 131 goto found; 132 } 133 ret = -BCH_ERR_ENOENT_inode; 134 found: 135 bch_err_msg(trans->c, 
ret, "fetching inode %llu", inode_nr); 136 bch2_trans_iter_exit(trans, &iter); 137 return ret; 138 } 139 140 static int lookup_inode(struct btree_trans *trans, u64 inode_nr, u32 snapshot, 141 struct bch_inode_unpacked *inode) 142 { 143 struct btree_iter iter; 144 struct bkey_s_c k; 145 int ret; 146 147 k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, 148 SPOS(0, inode_nr, snapshot), 0); 149 ret = bkey_err(k); 150 if (ret) 151 goto err; 152 153 ret = bkey_is_inode(k.k) 154 ? bch2_inode_unpack(k, inode) 155 : -BCH_ERR_ENOENT_inode; 156 err: 157 bch2_trans_iter_exit(trans, &iter); 158 return ret; 159 } 160 161 static int lookup_dirent_in_snapshot(struct btree_trans *trans, 162 struct bch_hash_info hash_info, 163 subvol_inum dir, struct qstr *name, 164 u64 *target, unsigned *type, u32 snapshot) 165 { 166 struct btree_iter iter; 167 struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc, 168 &hash_info, dir, name, 0, snapshot); 169 int ret = bkey_err(k); 170 if (ret) 171 return ret; 172 173 struct bkey_s_c_dirent d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter)); 174 *target = le64_to_cpu(d.v->d_inum); 175 *type = d.v->d_type; 176 bch2_trans_iter_exit(trans, &iter); 177 return 0; 178 } 179 180 static int __remove_dirent(struct btree_trans *trans, struct bpos pos) 181 { 182 struct bch_fs *c = trans->c; 183 struct btree_iter iter; 184 struct bch_inode_unpacked dir_inode; 185 struct bch_hash_info dir_hash_info; 186 int ret; 187 188 ret = lookup_first_inode(trans, pos.inode, &dir_inode); 189 if (ret) 190 goto err; 191 192 dir_hash_info = bch2_hash_info_init(c, &dir_inode); 193 194 bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent); 195 196 ret = bch2_btree_iter_traverse(&iter) ?: 197 bch2_hash_delete_at(trans, bch2_dirent_hash_desc, 198 &dir_hash_info, &iter, 199 BTREE_UPDATE_internal_snapshot_node); 200 bch2_trans_iter_exit(trans, &iter); 201 err: 202 bch_err_fn(c, ret); 203 return ret; 204 } 205 206 /* Get lost+found, create if it doesn't exist: */ 207 static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, 208 struct bch_inode_unpacked *lostfound, 209 u64 reattaching_inum) 210 { 211 struct bch_fs *c = trans->c; 212 struct qstr lostfound_str = QSTR("lost+found"); 213 u64 inum = 0; 214 unsigned d_type = 0; 215 int ret; 216 217 struct bch_snapshot_tree st; 218 ret = bch2_snapshot_tree_lookup(trans, 219 bch2_snapshot_tree(c, snapshot), &st); 220 if (ret) 221 return ret; 222 223 subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) }; 224 225 struct bch_subvolume subvol; 226 ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol), 227 false, 0, &subvol); 228 bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u", 229 le32_to_cpu(st.master_subvol), snapshot); 230 if (ret) 231 return ret; 232 233 if (!subvol.inode) { 234 struct btree_iter iter; 235 struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter, 236 BTREE_ID_subvolumes, POS(0, le32_to_cpu(st.master_subvol)), 237 0, subvolume); 238 ret = PTR_ERR_OR_ZERO(subvol); 239 if (ret) 240 return ret; 241 242 subvol->v.inode = cpu_to_le64(reattaching_inum); 243 bch2_trans_iter_exit(trans, &iter); 244 } 245 246 root_inum.inum = le64_to_cpu(subvol.inode); 247 248 struct bch_inode_unpacked root_inode; 249 struct bch_hash_info root_hash_info; 250 ret = lookup_inode(trans, root_inum.inum, snapshot, &root_inode); 251 bch_err_msg(c, ret, "looking up root inode %llu for subvol %u", 252 root_inum.inum, le32_to_cpu(st.master_subvol)); 253 if 
(ret)
		return ret;

	root_hash_info = bch2_hash_info_init(c, &root_inode);

	ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
					&lostfound_str, &inum, &d_type, snapshot);
	if (bch2_err_matches(ret, ENOENT))
		goto create_lostfound;

	bch_err_fn(c, ret);
	if (ret)
		return ret;

	if (d_type != DT_DIR) {
		bch_err(c, "error looking up lost+found: not a directory");
		return -BCH_ERR_ENOENT_not_directory;
	}

	/*
	 * The bch2_check_dirents pass has already run, dangling dirents
	 * shouldn't exist here:
	 */
	ret = lookup_inode(trans, inum, snapshot, lostfound);
	bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
		    inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
	return ret;

create_lostfound:
	/*
	 * we always create lost+found in the root snapshot; we don't want
	 * different branches of the snapshot tree to have different lost+found
	 */
	snapshot = le32_to_cpu(st.root_snapshot);
	/*
	 * XXX: we could have a nicer log message here if we had a nice way to
	 * walk backpointers to print a path
	 */
	bch_notice(c, "creating lost+found in subvol %llu snapshot %u",
		   root_inum.subvol, le32_to_cpu(st.root_snapshot));

	u64 now = bch2_current_time(c);
	struct btree_iter lostfound_iter = { NULL };
	u64 cpu = raw_smp_processor_id();

	bch2_inode_init_early(c, lostfound);
	bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
	lostfound->bi_dir = root_inode.bi_inum;
	lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot);

	root_inode.bi_nlink++;

	ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
	if (ret)
		goto err;

	bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot);
	ret = bch2_btree_iter_traverse(&lostfound_iter);
	if (ret)
		goto err;

	ret =   bch2_dirent_create_snapshot(trans,
				0, root_inode.bi_inum, snapshot, &root_hash_info,
				mode_to_type(lostfound->bi_mode),
				&lostfound_str,
				lostfound->bi_inum,
				&lostfound->bi_dir_offset,
				STR_HASH_must_create) ?:
		bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
				       BTREE_UPDATE_internal_snapshot_node);
err:
	bch_err_msg(c, ret, "creating lost+found");
	bch2_trans_iter_exit(trans, &lostfound_iter);
	return ret;
}

static inline bool inode_should_reattach(struct bch_inode_unpacked *inode)
{
	if (inode->bi_inum == BCACHEFS_ROOT_INO &&
	    inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)
		return false;

	return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked);
}

static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents,
					SPOS(d_pos.inode, d_pos.offset, snapshot),
					BTREE_ITER_intent|
					BTREE_ITER_with_updates);
	int ret = bkey_err(k);
	if (ret)
		return ret;

	if (bpos_eq(k.k->p, d_pos)) {
		/*
		 * delete_at() doesn't work because the update path doesn't
		 * internally use BTREE_ITER_with_updates yet
		 */
		struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
		ret = PTR_ERR_OR_ZERO(k);
		if (ret)
			goto err;

		bkey_init(&k->k);
		k->k.type = KEY_TYPE_whiteout;
		k->k.p = iter.pos;
		ret = bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node);
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
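
/*
 * Reattach an unreachable inode: create a dirent for it in lost+found (named
 * after the inode number, or "subvol-%u" for a subvolume root), then point the
 * inode's bi_dir/bi_dir_offset backpointer at the new dirent.  Inodes in child
 * snapshots are fixed up as well - they either share the new backpointer or
 * get whiteouts for the dirent created here.
 */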
static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
{
	struct bch_fs *c = trans->c;
	struct bch_inode_unpacked lostfound;
	char name_buf[20];
	int ret;

	u32 dirent_snapshot = inode->bi_snapshot;
	if (inode->bi_subvol) {
		inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;

		u64 root_inum;
		ret = subvol_lookup(trans, inode->bi_parent_subvol,
				    &dirent_snapshot, &root_inum);
		if (ret)
			return ret;

		snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
	} else {
		snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
	}

	ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
	if (ret)
		return ret;

	lostfound.bi_nlink += S_ISDIR(inode->bi_mode);

	/* ensure lost+found inode is also present in inode snapshot */
	if (!inode->bi_subvol) {
		BUG_ON(!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, lostfound.bi_snapshot));
		lostfound.bi_snapshot = inode->bi_snapshot;
	}

	ret = __bch2_fsck_write_inode(trans, &lostfound);
	if (ret)
		return ret;

	struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound);
	struct qstr name = (struct qstr) QSTR(name_buf);

	inode->bi_dir = lostfound.bi_inum;

	ret = bch2_dirent_create_snapshot(trans,
				inode->bi_parent_subvol, lostfound.bi_inum,
				dirent_snapshot,
				&dir_hash,
				inode_d_type(inode),
				&name,
				inode->bi_subvol ?: inode->bi_inum,
				&inode->bi_dir_offset,
				STR_HASH_must_create);
	if (ret) {
		bch_err_msg(c, ret, "error creating dirent");
		return ret;
	}

	ret = __bch2_fsck_write_inode(trans, inode);
	if (ret)
		return ret;

	/*
	 * Fix up inodes in child snapshots: if they should also be reattached,
	 * update the backpointer field; if they should not be, we need to emit
	 * whiteouts for the dirent we just created.
434 */ 435 if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) { 436 snapshot_id_list whiteouts_done; 437 struct btree_iter iter; 438 struct bkey_s_c k; 439 440 darray_init(&whiteouts_done); 441 442 for_each_btree_key_reverse_norestart(trans, iter, 443 BTREE_ID_inodes, SPOS(0, inode->bi_inum, inode->bi_snapshot - 1), 444 BTREE_ITER_all_snapshots|BTREE_ITER_intent, k, ret) { 445 if (k.k->p.offset != inode->bi_inum) 446 break; 447 448 if (!bkey_is_inode(k.k) || 449 !bch2_snapshot_is_ancestor(c, k.k->p.snapshot, inode->bi_snapshot) || 450 snapshot_list_has_ancestor(c, &whiteouts_done, k.k->p.snapshot)) 451 continue; 452 453 struct bch_inode_unpacked child_inode; 454 bch2_inode_unpack(k, &child_inode); 455 456 if (!inode_should_reattach(&child_inode)) { 457 ret = maybe_delete_dirent(trans, 458 SPOS(lostfound.bi_inum, inode->bi_dir_offset, 459 dirent_snapshot), 460 k.k->p.snapshot); 461 if (ret) 462 break; 463 464 ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot); 465 if (ret) 466 break; 467 } else { 468 iter.snapshot = k.k->p.snapshot; 469 child_inode.bi_dir = inode->bi_dir; 470 child_inode.bi_dir_offset = inode->bi_dir_offset; 471 472 ret = bch2_inode_write_flags(trans, &iter, &child_inode, 473 BTREE_UPDATE_internal_snapshot_node); 474 if (ret) 475 break; 476 } 477 } 478 darray_exit(&whiteouts_done); 479 bch2_trans_iter_exit(trans, &iter); 480 } 481 482 return ret; 483 } 484 485 static int remove_backpointer(struct btree_trans *trans, 486 struct bch_inode_unpacked *inode) 487 { 488 if (!inode->bi_dir) 489 return 0; 490 491 struct bch_fs *c = trans->c; 492 struct btree_iter iter; 493 struct bkey_s_c_dirent d = 494 bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents, 495 SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot), 0, 496 dirent); 497 int ret = bkey_err(d) ?: 498 dirent_points_to_inode(c, d, inode) ?: 499 __remove_dirent(trans, d.k->p); 500 bch2_trans_iter_exit(trans, &iter); 501 return ret; 502 } 503 504 static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s) 505 { 506 struct bch_fs *c = trans->c; 507 508 struct bch_inode_unpacked inode; 509 int ret = bch2_inode_find_by_inum_trans(trans, 510 (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) }, 511 &inode); 512 if (ret) 513 return ret; 514 515 ret = remove_backpointer(trans, &inode); 516 if (!bch2_err_matches(ret, ENOENT)) 517 bch_err_msg(c, ret, "removing dirent"); 518 if (ret) 519 return ret; 520 521 ret = reattach_inode(trans, &inode); 522 bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); 523 return ret; 524 } 525 526 static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum) 527 { 528 struct bch_fs *c = trans->c; 529 530 if (!bch2_snapshot_is_leaf(c, snapshotid)) { 531 bch_err(c, "need to reconstruct subvol, but have interior node snapshot"); 532 return -BCH_ERR_fsck_repair_unimplemented; 533 } 534 535 /* 536 * If inum isn't set, that means we're being called from check_dirents, 537 * not check_inodes - the root of this subvolume doesn't exist or we 538 * would have found it there: 539 */ 540 if (!inum) { 541 struct btree_iter inode_iter = {}; 542 struct bch_inode_unpacked new_inode; 543 u64 cpu = raw_smp_processor_id(); 544 545 bch2_inode_init_early(c, &new_inode); 546 bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL); 547 548 new_inode.bi_subvol = subvolid; 549 550 int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?: 551 
bch2_btree_iter_traverse(&inode_iter) ?: 552 bch2_inode_write(trans, &inode_iter, &new_inode); 553 bch2_trans_iter_exit(trans, &inode_iter); 554 if (ret) 555 return ret; 556 557 inum = new_inode.bi_inum; 558 } 559 560 bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum); 561 562 struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol)); 563 int ret = PTR_ERR_OR_ZERO(new_subvol); 564 if (ret) 565 return ret; 566 567 bkey_subvolume_init(&new_subvol->k_i); 568 new_subvol->k.p.offset = subvolid; 569 new_subvol->v.snapshot = cpu_to_le32(snapshotid); 570 new_subvol->v.inode = cpu_to_le64(inum); 571 ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0); 572 if (ret) 573 return ret; 574 575 struct btree_iter iter; 576 struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter, 577 BTREE_ID_snapshots, POS(0, snapshotid), 578 0, snapshot); 579 ret = PTR_ERR_OR_ZERO(s); 580 bch_err_msg(c, ret, "getting snapshot %u", snapshotid); 581 if (ret) 582 return ret; 583 584 u32 snapshot_tree = le32_to_cpu(s->v.tree); 585 586 s->v.subvol = cpu_to_le32(subvolid); 587 SET_BCH_SNAPSHOT_SUBVOL(&s->v, true); 588 bch2_trans_iter_exit(trans, &iter); 589 590 struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter, 591 BTREE_ID_snapshot_trees, POS(0, snapshot_tree), 592 0, snapshot_tree); 593 ret = PTR_ERR_OR_ZERO(st); 594 bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree); 595 if (ret) 596 return ret; 597 598 if (!st->v.master_subvol) 599 st->v.master_subvol = cpu_to_le32(subvolid); 600 601 bch2_trans_iter_exit(trans, &iter); 602 return 0; 603 } 604 605 static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum) 606 { 607 struct bch_fs *c = trans->c; 608 unsigned i_mode = S_IFREG; 609 u64 i_size = 0; 610 611 switch (btree) { 612 case BTREE_ID_extents: { 613 struct btree_iter iter = {}; 614 615 bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0); 616 struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter); 617 bch2_trans_iter_exit(trans, &iter); 618 int ret = bkey_err(k); 619 if (ret) 620 return ret; 621 622 i_size = k.k->p.offset << 9; 623 break; 624 } 625 case BTREE_ID_dirents: 626 i_mode = S_IFDIR; 627 break; 628 case BTREE_ID_xattrs: 629 break; 630 default: 631 BUG(); 632 } 633 634 struct bch_inode_unpacked new_inode; 635 bch2_inode_init_early(c, &new_inode); 636 bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL); 637 new_inode.bi_size = i_size; 638 new_inode.bi_inum = inum; 639 new_inode.bi_snapshot = snapshot; 640 641 return __bch2_fsck_write_inode(trans, &new_inode); 642 } 643 644 struct snapshots_seen { 645 struct bpos pos; 646 snapshot_id_list ids; 647 }; 648 649 static inline void snapshots_seen_exit(struct snapshots_seen *s) 650 { 651 darray_exit(&s->ids); 652 } 653 654 static inline void snapshots_seen_init(struct snapshots_seen *s) 655 { 656 memset(s, 0, sizeof(*s)); 657 } 658 659 static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id) 660 { 661 u32 *i; 662 __darray_for_each(s->ids, i) { 663 if (*i == id) 664 return 0; 665 if (*i > id) 666 break; 667 } 668 669 int ret = darray_insert_item(&s->ids, i - s->ids.data, id); 670 if (ret) 671 bch_err(c, "error reallocating snapshots_seen table (size %zu)", 672 s->ids.size); 673 return ret; 674 } 675 676 static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, 677 enum btree_id btree_id, struct bpos 
pos)
{
	if (!bkey_eq(s->pos, pos))
		s->ids.nr = 0;
	s->pos = pos;

	return snapshot_list_add_nodup(c, &s->ids, pos.snapshot);
}

/**
 * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
 * and @ancestor hasn't been overwritten in @seen
 *
 * @c:		filesystem handle
 * @seen:	list of snapshot ids already seen at current position
 * @id:		descendent snapshot id
 * @ancestor:	ancestor snapshot id
 *
 * Returns:	whether key in @ancestor snapshot is visible in @id snapshot
 */
static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
				    u32 id, u32 ancestor)
{
	ssize_t i;

	EBUG_ON(id > ancestor);

	/* @ancestor should be the snapshot most recently added to @seen */
	EBUG_ON(ancestor != seen->pos.snapshot);
	EBUG_ON(ancestor != darray_last(seen->ids));

	if (id == ancestor)
		return true;

	if (!bch2_snapshot_is_ancestor(c, id, ancestor))
		return false;

	/*
	 * We know that @id is a descendant of @ancestor, we're checking if
	 * we've seen a key that overwrote @ancestor - i.e. also a descendent of
	 * @ancestor and with @id as a descendent.
	 *
	 * But we already know that we're scanning IDs between @id and @ancestor
	 * numerically, since snapshot ID lists are kept sorted, so if we find
	 * an id that's an ancestor of @id we're done:
	 */

	for (i = seen->ids.nr - 2;
	     i >= 0 && seen->ids.data[i] >= id;
	     --i)
		if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]))
			return false;

	return true;
}

/**
 * ref_visible - given a key with snapshot id @src that points to a key with
 * snapshot id @dst, test whether there is some snapshot in which @dst is
 * visible.
 *
 * @c:		filesystem handle
 * @s:		list of snapshot IDs already seen at @src
 * @src:	snapshot ID of src key
 * @dst:	snapshot ID of dst key
 * Returns:	true if there is some snapshot in which @dst is visible
 *
 * Assumes we're visiting @src keys in natural key order
 */
static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
			u32 src, u32 dst)
{
	return dst <= src
		?
key_visible_in_snapshot(c, s, dst, src) 751 : bch2_snapshot_is_ancestor(c, src, dst); 752 } 753 754 static int ref_visible2(struct bch_fs *c, 755 u32 src, struct snapshots_seen *src_seen, 756 u32 dst, struct snapshots_seen *dst_seen) 757 { 758 if (dst > src) { 759 swap(dst, src); 760 swap(dst_seen, src_seen); 761 } 762 return key_visible_in_snapshot(c, src_seen, dst, src); 763 } 764 765 #define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ 766 for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \ 767 (_i)->snapshot <= (_snapshot); _i++) \ 768 if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot)) 769 770 struct inode_walker_entry { 771 struct bch_inode_unpacked inode; 772 u32 snapshot; 773 u64 count; 774 }; 775 776 struct inode_walker { 777 bool first_this_inode; 778 bool have_inodes; 779 bool recalculate_sums; 780 struct bpos last_pos; 781 782 DARRAY(struct inode_walker_entry) inodes; 783 }; 784 785 static void inode_walker_exit(struct inode_walker *w) 786 { 787 darray_exit(&w->inodes); 788 } 789 790 static struct inode_walker inode_walker_init(void) 791 { 792 return (struct inode_walker) { 0, }; 793 } 794 795 static int add_inode(struct bch_fs *c, struct inode_walker *w, 796 struct bkey_s_c inode) 797 { 798 struct bch_inode_unpacked u; 799 800 BUG_ON(bch2_inode_unpack(inode, &u)); 801 802 return darray_push(&w->inodes, ((struct inode_walker_entry) { 803 .inode = u, 804 .snapshot = inode.k->p.snapshot, 805 })); 806 } 807 808 static int get_inodes_all_snapshots(struct btree_trans *trans, 809 struct inode_walker *w, u64 inum) 810 { 811 struct bch_fs *c = trans->c; 812 struct btree_iter iter; 813 struct bkey_s_c k; 814 int ret; 815 816 /* 817 * We no longer have inodes for w->last_pos; clear this to avoid 818 * screwing up check_i_sectors/check_subdir_count if we take a 819 * transaction restart here: 820 */ 821 w->have_inodes = false; 822 w->recalculate_sums = false; 823 w->inodes.nr = 0; 824 825 for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), 826 BTREE_ITER_all_snapshots, k, ret) { 827 if (k.k->p.offset != inum) 828 break; 829 830 if (bkey_is_inode(k.k)) 831 add_inode(c, w, k); 832 } 833 bch2_trans_iter_exit(trans, &iter); 834 835 if (ret) 836 return ret; 837 838 w->first_this_inode = true; 839 w->have_inodes = true; 840 return 0; 841 } 842 843 static struct inode_walker_entry * 844 lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k) 845 { 846 bool is_whiteout = k.k->type == KEY_TYPE_whiteout; 847 848 struct inode_walker_entry *i; 849 __darray_for_each(w->inodes, i) 850 if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot)) 851 goto found; 852 853 return NULL; 854 found: 855 BUG_ON(k.k->p.snapshot > i->snapshot); 856 857 if (k.k->p.snapshot != i->snapshot && !is_whiteout) { 858 struct inode_walker_entry new = *i; 859 860 new.snapshot = k.k->p.snapshot; 861 new.count = 0; 862 863 struct printbuf buf = PRINTBUF; 864 bch2_bkey_val_to_text(&buf, c, k); 865 866 bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n" 867 "unexpected because we should always update the inode when we update a key in that inode\n" 868 "%s", 869 w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf); 870 printbuf_exit(&buf); 871 872 while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot) 873 --i; 874 875 size_t pos = i - w->inodes.data; 876 int ret = darray_insert_item(&w->inodes, pos, new); 877 if (ret) 878 return ERR_PTR(ret); 879 880 i = w->inodes.data + pos; 881 } 882 883 
return i; 884 } 885 886 static struct inode_walker_entry *walk_inode(struct btree_trans *trans, 887 struct inode_walker *w, 888 struct bkey_s_c k) 889 { 890 if (w->last_pos.inode != k.k->p.inode) { 891 int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode); 892 if (ret) 893 return ERR_PTR(ret); 894 } 895 896 w->last_pos = k.k->p; 897 898 return lookup_inode_for_snapshot(trans->c, w, k); 899 } 900 901 static int get_visible_inodes(struct btree_trans *trans, 902 struct inode_walker *w, 903 struct snapshots_seen *s, 904 u64 inum) 905 { 906 struct bch_fs *c = trans->c; 907 struct btree_iter iter; 908 struct bkey_s_c k; 909 int ret; 910 911 w->inodes.nr = 0; 912 913 for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), 914 BTREE_ITER_all_snapshots, k, ret) { 915 if (k.k->p.offset != inum) 916 break; 917 918 if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) 919 continue; 920 921 if (bkey_is_inode(k.k)) 922 add_inode(c, w, k); 923 924 if (k.k->p.snapshot >= s->pos.snapshot) 925 break; 926 } 927 bch2_trans_iter_exit(trans, &iter); 928 929 return ret; 930 } 931 932 static int dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dirent d) 933 { 934 if (d.v->d_type == DT_SUBVOL) { 935 u32 snap; 936 u64 inum; 937 int ret = subvol_lookup(trans, le32_to_cpu(d.v->d_child_subvol), &snap, &inum); 938 if (ret && !bch2_err_matches(ret, ENOENT)) 939 return ret; 940 return !ret; 941 } else { 942 struct btree_iter iter; 943 struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, 944 SPOS(0, le64_to_cpu(d.v->d_inum), d.k->p.snapshot), 0); 945 int ret = bkey_err(k); 946 if (ret) 947 return ret; 948 949 ret = bkey_is_inode(k.k); 950 bch2_trans_iter_exit(trans, &iter); 951 return ret; 952 } 953 } 954 955 /* 956 * Prefer to delete the first one, since that will be the one at the wrong 957 * offset: 958 * return value: 0 -> delete k1, 1 -> delete k2 959 */ 960 static int hash_pick_winner(struct btree_trans *trans, 961 const struct bch_hash_desc desc, 962 struct bch_hash_info *hash_info, 963 struct bkey_s_c k1, 964 struct bkey_s_c k2) 965 { 966 if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) && 967 !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k))) 968 return 0; 969 970 switch (desc.btree_id) { 971 case BTREE_ID_dirents: { 972 int ret = dirent_has_target(trans, bkey_s_c_to_dirent(k1)); 973 if (ret < 0) 974 return ret; 975 if (!ret) 976 return 0; 977 978 ret = dirent_has_target(trans, bkey_s_c_to_dirent(k2)); 979 if (ret < 0) 980 return ret; 981 if (!ret) 982 return 1; 983 return 2; 984 } 985 default: 986 return 0; 987 } 988 } 989 990 static int fsck_update_backpointers(struct btree_trans *trans, 991 struct snapshots_seen *s, 992 const struct bch_hash_desc desc, 993 struct bch_hash_info *hash_info, 994 struct bkey_i *new) 995 { 996 if (new->k.type != KEY_TYPE_dirent) 997 return 0; 998 999 struct bkey_i_dirent *d = bkey_i_to_dirent(new); 1000 struct inode_walker target = inode_walker_init(); 1001 int ret = 0; 1002 1003 if (d->v.d_type == DT_SUBVOL) { 1004 BUG(); 1005 } else { 1006 ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum)); 1007 if (ret) 1008 goto err; 1009 1010 darray_for_each(target.inodes, i) { 1011 i->inode.bi_dir_offset = d->k.p.offset; 1012 ret = __bch2_fsck_write_inode(trans, &i->inode); 1013 if (ret) 1014 goto err; 1015 } 1016 } 1017 err: 1018 inode_walker_exit(&target); 1019 return ret; 1020 } 1021 1022 static int fsck_rename_dirent(struct btree_trans *trans, 1023 struct snapshots_seen *s, 1024 const struct bch_hash_desc desc, 1025 struct 
bch_hash_info *hash_info, 1026 struct bkey_s_c_dirent old) 1027 { 1028 struct qstr old_name = bch2_dirent_get_name(old); 1029 struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32); 1030 int ret = PTR_ERR_OR_ZERO(new); 1031 if (ret) 1032 return ret; 1033 1034 bkey_dirent_init(&new->k_i); 1035 dirent_copy_target(new, old); 1036 new->k.p = old.k->p; 1037 1038 for (unsigned i = 0; i < 1000; i++) { 1039 unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u", 1040 old_name.len, old_name.name, i); 1041 unsigned u64s = BKEY_U64s + dirent_val_u64s(len); 1042 1043 if (u64s > U8_MAX) 1044 return -EINVAL; 1045 1046 new->k.u64s = u64s; 1047 1048 ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, 1049 (subvol_inum) { 0, old.k->p.inode }, 1050 old.k->p.snapshot, &new->k_i, 1051 BTREE_UPDATE_internal_snapshot_node); 1052 if (!bch2_err_matches(ret, EEXIST)) 1053 break; 1054 } 1055 1056 if (ret) 1057 return ret; 1058 1059 return fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); 1060 } 1061 1062 static int hash_check_key(struct btree_trans *trans, 1063 struct snapshots_seen *s, 1064 const struct bch_hash_desc desc, 1065 struct bch_hash_info *hash_info, 1066 struct btree_iter *k_iter, struct bkey_s_c hash_k) 1067 { 1068 struct bch_fs *c = trans->c; 1069 struct btree_iter iter = { NULL }; 1070 struct printbuf buf = PRINTBUF; 1071 struct bkey_s_c k; 1072 u64 hash; 1073 int ret = 0; 1074 1075 if (hash_k.k->type != desc.key_type) 1076 return 0; 1077 1078 hash = desc.hash_bkey(hash_info, hash_k); 1079 1080 if (likely(hash == hash_k.k->p.offset)) 1081 return 0; 1082 1083 if (hash_k.k->p.offset < hash) 1084 goto bad_hash; 1085 1086 for_each_btree_key_norestart(trans, iter, desc.btree_id, 1087 SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), 1088 BTREE_ITER_slots, k, ret) { 1089 if (bkey_eq(k.k->p, hash_k.k->p)) 1090 break; 1091 1092 if (k.k->type == desc.key_type && 1093 !desc.cmp_bkey(k, hash_k)) 1094 goto duplicate_entries; 1095 1096 if (bkey_deleted(k.k)) { 1097 bch2_trans_iter_exit(trans, &iter); 1098 goto bad_hash; 1099 } 1100 } 1101 out: 1102 bch2_trans_iter_exit(trans, &iter); 1103 printbuf_exit(&buf); 1104 return ret; 1105 bad_hash: 1106 if (fsck_err(trans, hash_table_key_wrong_offset, 1107 "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n %s", 1108 bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, 1109 (printbuf_reset(&buf), 1110 bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { 1111 struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k); 1112 if (IS_ERR(new)) 1113 return PTR_ERR(new); 1114 1115 k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, hash_info, 1116 (subvol_inum) { 0, hash_k.k->p.inode }, 1117 hash_k.k->p.snapshot, new, 1118 STR_HASH_must_create| 1119 BTREE_ITER_with_updates| 1120 BTREE_UPDATE_internal_snapshot_node); 1121 ret = bkey_err(k); 1122 if (ret) 1123 goto out; 1124 if (k.k) 1125 goto duplicate_entries; 1126 1127 ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 1128 BTREE_UPDATE_internal_snapshot_node) ?: 1129 fsck_update_backpointers(trans, s, desc, hash_info, new) ?: 1130 bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: 1131 -BCH_ERR_transaction_restart_nested; 1132 goto out; 1133 } 1134 fsck_err: 1135 goto out; 1136 duplicate_entries: 1137 ret = hash_pick_winner(trans, desc, hash_info, hash_k, k); 1138 if (ret < 0) 1139 goto out; 1140 1141 if (!fsck_err(trans, hash_table_key_duplicate, 1142 "duplicate hash 
table keys%s:\n%s", 1143 ret != 2 ? "" : ", both point to valid inodes", 1144 (printbuf_reset(&buf), 1145 bch2_bkey_val_to_text(&buf, c, hash_k), 1146 prt_newline(&buf), 1147 bch2_bkey_val_to_text(&buf, c, k), 1148 buf.buf))) 1149 goto out; 1150 1151 switch (ret) { 1152 case 0: 1153 ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0); 1154 break; 1155 case 1: 1156 ret = bch2_hash_delete_at(trans, desc, hash_info, &iter, 0); 1157 break; 1158 case 2: 1159 ret = fsck_rename_dirent(trans, s, desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?: 1160 bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0); 1161 goto out; 1162 } 1163 1164 ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: 1165 -BCH_ERR_transaction_restart_nested; 1166 goto out; 1167 } 1168 1169 static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, 1170 struct btree_iter *iter, 1171 struct bpos pos) 1172 { 1173 return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); 1174 } 1175 1176 static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans, 1177 struct btree_iter *iter, 1178 struct bch_inode_unpacked *inode, 1179 u32 *snapshot) 1180 { 1181 if (inode->bi_subvol) { 1182 u64 inum; 1183 int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum); 1184 if (ret) 1185 return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) }); 1186 } 1187 1188 return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot)); 1189 } 1190 1191 static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p) 1192 { 1193 struct btree_iter iter; 1194 struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0); 1195 int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set; 1196 bch2_trans_iter_exit(trans, &iter); 1197 return ret; 1198 } 1199 1200 static int check_inode_dirent_inode(struct btree_trans *trans, 1201 struct bch_inode_unpacked *inode, 1202 bool *write_inode) 1203 { 1204 struct bch_fs *c = trans->c; 1205 struct printbuf buf = PRINTBUF; 1206 1207 u32 inode_snapshot = inode->bi_snapshot; 1208 struct btree_iter dirent_iter = {}; 1209 struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot); 1210 int ret = bkey_err(d); 1211 if (ret && !bch2_err_matches(ret, ENOENT)) 1212 return ret; 1213 1214 if (fsck_err_on(ret, 1215 trans, inode_points_to_missing_dirent, 1216 "inode points to missing dirent\n%s", 1217 (bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) || 1218 fsck_err_on(!ret && dirent_points_to_inode_nowarn(d, inode), 1219 trans, inode_points_to_wrong_dirent, 1220 "%s", 1221 (printbuf_reset(&buf), 1222 dirent_inode_mismatch_msg(&buf, c, d, inode), 1223 buf.buf))) { 1224 /* 1225 * We just clear the backpointer fields for now. If we find a 1226 * dirent that points to this inode in check_dirents(), we'll 1227 * update it then; then when we get to check_path() if the 1228 * backpointer is still 0 we'll reattach it. 
1229 */ 1230 inode->bi_dir = 0; 1231 inode->bi_dir_offset = 0; 1232 *write_inode = true; 1233 } 1234 1235 ret = 0; 1236 fsck_err: 1237 bch2_trans_iter_exit(trans, &dirent_iter); 1238 printbuf_exit(&buf); 1239 bch_err_fn(c, ret); 1240 return ret; 1241 } 1242 1243 static int get_snapshot_root_inode(struct btree_trans *trans, 1244 struct bch_inode_unpacked *root, 1245 u64 inum) 1246 { 1247 struct btree_iter iter; 1248 struct bkey_s_c k; 1249 int ret = 0; 1250 1251 for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, 1252 SPOS(0, inum, U32_MAX), 1253 BTREE_ITER_all_snapshots, k, ret) { 1254 if (k.k->p.offset != inum) 1255 break; 1256 if (bkey_is_inode(k.k)) 1257 goto found_root; 1258 } 1259 if (ret) 1260 goto err; 1261 BUG(); 1262 found_root: 1263 BUG_ON(bch2_inode_unpack(k, root)); 1264 err: 1265 bch2_trans_iter_exit(trans, &iter); 1266 return ret; 1267 } 1268 1269 static int check_inode(struct btree_trans *trans, 1270 struct btree_iter *iter, 1271 struct bkey_s_c k, 1272 struct bch_inode_unpacked *snapshot_root, 1273 struct snapshots_seen *s) 1274 { 1275 struct bch_fs *c = trans->c; 1276 struct printbuf buf = PRINTBUF; 1277 struct bch_inode_unpacked u; 1278 bool do_update = false; 1279 int ret; 1280 1281 ret = bch2_check_key_has_snapshot(trans, iter, k); 1282 if (ret < 0) 1283 goto err; 1284 if (ret) 1285 return 0; 1286 1287 ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); 1288 if (ret) 1289 goto err; 1290 1291 if (!bkey_is_inode(k.k)) 1292 return 0; 1293 1294 BUG_ON(bch2_inode_unpack(k, &u)); 1295 1296 if (snapshot_root->bi_inum != u.bi_inum) { 1297 ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum); 1298 if (ret) 1299 goto err; 1300 } 1301 1302 if (fsck_err_on(u.bi_hash_seed != snapshot_root->bi_hash_seed || 1303 INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root), 1304 trans, inode_snapshot_mismatch, 1305 "inodes in different snapshots don't match")) { 1306 u.bi_hash_seed = snapshot_root->bi_hash_seed; 1307 SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root)); 1308 do_update = true; 1309 } 1310 1311 if (u.bi_dir || u.bi_dir_offset) { 1312 ret = check_inode_dirent_inode(trans, &u, &do_update); 1313 if (ret) 1314 goto err; 1315 } 1316 1317 if (fsck_err_on(u.bi_dir && (u.bi_flags & BCH_INODE_unlinked), 1318 trans, inode_unlinked_but_has_dirent, 1319 "inode unlinked but has dirent\n%s", 1320 (printbuf_reset(&buf), 1321 bch2_inode_unpacked_to_text(&buf, &u), 1322 buf.buf))) { 1323 u.bi_flags &= ~BCH_INODE_unlinked; 1324 do_update = true; 1325 } 1326 1327 if (S_ISDIR(u.bi_mode) && (u.bi_flags & BCH_INODE_unlinked)) { 1328 /* Check for this early so that check_unreachable_inode() will reattach it */ 1329 1330 ret = bch2_empty_dir_snapshot(trans, k.k->p.offset, 0, k.k->p.snapshot); 1331 if (ret && ret != -BCH_ERR_ENOTEMPTY_dir_not_empty) 1332 goto err; 1333 1334 fsck_err_on(ret, trans, inode_dir_unlinked_but_not_empty, 1335 "dir unlinked but not empty\n%s", 1336 (printbuf_reset(&buf), 1337 bch2_inode_unpacked_to_text(&buf, &u), 1338 buf.buf)); 1339 u.bi_flags &= ~BCH_INODE_unlinked; 1340 do_update = true; 1341 ret = 0; 1342 } 1343 1344 ret = bch2_inode_has_child_snapshots(trans, k.k->p); 1345 if (ret < 0) 1346 goto err; 1347 1348 if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot), 1349 trans, inode_has_child_snapshots_wrong, 1350 "inode has_child_snapshots flag wrong (should be %u)\n%s", 1351 ret, 1352 (printbuf_reset(&buf), 1353 bch2_inode_unpacked_to_text(&buf, &u), 1354 buf.buf))) { 1355 if (ret) 1356 u.bi_flags |= 
BCH_INODE_has_child_snapshot;
		else
			u.bi_flags &= ~BCH_INODE_has_child_snapshot;
		do_update = true;
	}
	ret = 0;

	if ((u.bi_flags & BCH_INODE_unlinked) &&
	    !(u.bi_flags & BCH_INODE_has_child_snapshot)) {
		if (!test_bit(BCH_FS_started, &c->flags)) {
			/*
			 * If we're not in online fsck, don't delete unlinked
			 * inodes, just make sure they're on the deleted list.
			 *
			 * They might be referred to by a logged operation -
			 * i.e. we might have crashed in the middle of a
			 * truncate on an unlinked but open file - so we want to
			 * let the delete_dead_inodes kill it after resuming
			 * logged ops.
			 */
			ret = check_inode_deleted_list(trans, k.k->p);
			if (ret < 0)
				goto err_noprint;

			fsck_err_on(!ret,
				    trans, unlinked_inode_not_on_deleted_list,
				    "inode %llu:%u unlinked, but not on deleted list",
				    u.bi_inum, k.k->p.snapshot);

			ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, k.k->p, 1);
			if (ret)
				goto err;
		} else {
			ret = bch2_inode_or_descendents_is_open(trans, k.k->p);
			if (ret < 0)
				goto err;

			if (fsck_err_on(!ret,
					trans, inode_unlinked_and_not_open,
					"inode %llu:%u unlinked and not open",
					u.bi_inum, u.bi_snapshot)) {
				ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
				bch_err_msg(c, ret, "in fsck deleting inode");
				goto err_noprint;
			}
			ret = 0;
		}
	}

	if (fsck_err_on(u.bi_parent_subvol &&
			(u.bi_subvol == 0 ||
			 u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
			trans, inode_bi_parent_nonzero,
			"inode %llu:%u has subvol %u but nonzero parent subvol %u",
			u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
		u.bi_parent_subvol = 0;
		do_update = true;
	}

	if (u.bi_subvol) {
		struct bch_subvolume s;

		ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s);
		if (ret && !bch2_err_matches(ret, ENOENT))
			goto err;

		if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
			ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum);
			goto do_update;
		}

		if (fsck_err_on(ret,
				trans, inode_bi_subvol_missing,
				"inode %llu:%u bi_subvol points to missing subvolume %u",
				u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
		    fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
				!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
							   k.k->p.snapshot),
				trans, inode_bi_subvol_wrong,
				"inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
				u.bi_inum, k.k->p.snapshot, u.bi_subvol,
				le64_to_cpu(s.inode),
				le32_to_cpu(s.snapshot))) {
			u.bi_subvol = 0;
			u.bi_parent_subvol = 0;
			do_update = true;
		}
	}
do_update:
	if (do_update) {
		ret = __bch2_fsck_write_inode(trans, &u);
		bch_err_msg(c, ret, "in fsck updating inode");
		if (ret)
			goto err_noprint;
	}
err:
fsck_err:
	bch_err_fn(c, ret);
err_noprint:
	printbuf_exit(&buf);
	return ret;
}

int bch2_check_inodes(struct bch_fs *c)
{
	struct bch_inode_unpacked snapshot_root = {};
	struct snapshots_seen s;

	snapshots_seen_init(&s);

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
				POS_MIN,
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_inode(trans, &iter, k, &snapshot_root, &s)));

	snapshots_seen_exit(&s);
	bch_err_fn(c, ret);
	return ret;
}
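
/*
 * @inode is both input and output: it starts as the version of the inode that
 * triggered the reattach, and is overwritten with the oldest ancestor version
 * that also needs reattaching, so that the reattach is done at the oldest
 * version rather than separately in each leaf snapshot.
 */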
static int find_oldest_inode_needs_reattach(struct btree_trans *trans,
					    struct bch_inode_unpacked *inode)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	/*
	 * We look for inodes to reattach in natural key order, leaves first,
	 * but we should do the reattach at the oldest version that needs to be
	 * reattached:
	 */
	for_each_btree_key_norestart(trans, iter,
				     BTREE_ID_inodes,
				     SPOS(0, inode->bi_inum, inode->bi_snapshot + 1),
				     BTREE_ITER_all_snapshots, k, ret) {
		if (k.k->p.offset != inode->bi_inum)
			break;

		if (!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, k.k->p.snapshot))
			continue;

		if (!bkey_is_inode(k.k))
			break;

		struct bch_inode_unpacked parent_inode;
		bch2_inode_unpack(k, &parent_inode);

		if (!inode_should_reattach(&parent_inode))
			break;

		*inode = parent_inode;
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}

static int check_unreachable_inode(struct btree_trans *trans,
				   struct btree_iter *iter,
				   struct bkey_s_c k)
{
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (!bkey_is_inode(k.k))
		return 0;

	struct bch_inode_unpacked inode;
	BUG_ON(bch2_inode_unpack(k, &inode));

	if (!inode_should_reattach(&inode))
		return 0;

	ret = find_oldest_inode_needs_reattach(trans, &inode);
	if (ret)
		return ret;

	if (fsck_err(trans, inode_unreachable,
		     "unreachable inode:\n%s",
		     (bch2_inode_unpacked_to_text(&buf, &inode),
		      buf.buf)))
		ret = reattach_inode(trans, &inode);
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

/*
 * Reattach unreachable (but not unlinked) inodes
 *
 * Run after check_inodes() and check_dirents(), so we know that inode
 * backpointer fields point to valid dirents, and every inode that has a dirent
 * that points to it has its backpointer field set - so we're just looking for
 * non-unlinked inodes without backpointers:
 *
 * XXX: this is racy w.r.t. hardlink removal in online fsck
 */
int bch2_check_unreachable_inodes(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
				POS_MIN,
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_unreachable_inode(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}

static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode)
{
	switch (btree) {
	case BTREE_ID_extents:
		return S_ISREG(mode) || S_ISLNK(mode);
	case BTREE_ID_dirents:
		return S_ISDIR(mode);
	case BTREE_ID_xattrs:
		return true;
	default:
		BUG();
	}
}
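
/*
 * Verify that a key in the extents/dirents/xattrs btrees belongs to an inode
 * that exists and has a compatible mode: if the inodes btree was lost the
 * missing inode is reconstructed, otherwise stray keys are deleted.
 */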
static int check_key_has_inode(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct inode_walker *inode,
			       struct inode_walker_entry *i,
			       struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = PTR_ERR_OR_ZERO(i);
	if (ret)
		return ret;

	if (k.k->type == KEY_TYPE_whiteout)
		goto out;

	if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
		ret =   reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?:
			bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
		if (ret)
			goto err;

		inode->last_pos.inode--;
		ret = -BCH_ERR_transaction_restart_nested;
		goto err;
	}

	if (fsck_err_on(!i,
			trans, key_in_missing_inode,
			"key in missing inode:\n %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
		goto delete;

	if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode),
			trans, key_in_wrong_inode_type,
			"key for wrong inode mode %o:\n %s",
			i->inode.bi_mode,
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
		goto delete;
out:
err:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
delete:
	ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
	goto out;
}

static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
{
	struct bch_fs *c = trans->c;
	int ret = 0;
	s64 count2;

	darray_for_each(w->inodes, i) {
		if (i->inode.bi_sectors == i->count)
			continue;

		count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);

		if (w->recalculate_sums)
			i->count = count2;

		if (i->count != count2) {
			bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
					    w->last_pos.inode, i->snapshot, i->count, count2);
			return -BCH_ERR_internal_fsck_err;
		}

		if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
				trans, inode_i_sectors_wrong,
				"inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
				w->last_pos.inode, i->snapshot,
				i->inode.bi_sectors, i->count)) {
			i->inode.bi_sectors = i->count;
			ret = bch2_fsck_write_inode(trans, &i->inode);
			if (ret)
				break;
		}
	}
fsck_err:
	bch_err_fn(c, ret);
	return ret;
}

static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
{
	u32 restart_count = trans->restart_count;
	return check_i_sectors_notnested(trans, w) ?:
		trans_was_restarted(trans, restart_count);
}
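
/*
 * Bookkeeping for overlapping extent checks: for each snapshot we remember
 * where the last extent we saw ended, along with the snapshots seen at that
 * point, so check_overlapping_extents() can compare each new extent's start
 * position against these recorded ends.
 */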
struct extent_end {
	u32 snapshot;
	u64 offset;
	struct snapshots_seen seen;
};

struct extent_ends {
	struct bpos last_pos;
	DARRAY(struct extent_end) e;
};

static void extent_ends_reset(struct extent_ends *extent_ends)
{
	darray_for_each(extent_ends->e, i)
		snapshots_seen_exit(&i->seen);
	extent_ends->e.nr = 0;
}

static void extent_ends_exit(struct extent_ends *extent_ends)
{
	extent_ends_reset(extent_ends);
	darray_exit(&extent_ends->e);
}

static void extent_ends_init(struct extent_ends *extent_ends)
{
	memset(extent_ends, 0, sizeof(*extent_ends));
}

static int extent_ends_at(struct bch_fs *c,
			  struct extent_ends *extent_ends,
			  struct snapshots_seen *seen,
			  struct bkey_s_c k)
{
	struct extent_end *i, n = (struct extent_end) {
		.offset		= k.k->p.offset,
		.snapshot	= k.k->p.snapshot,
		.seen		= *seen,
	};

	n.seen.ids.data = kmemdup(seen->ids.data,
			      sizeof(seen->ids.data[0]) * seen->ids.size,
			      GFP_KERNEL);
	if (!n.seen.ids.data)
		return -BCH_ERR_ENOMEM_fsck_extent_ends_at;

	__darray_for_each(extent_ends->e, i) {
		if (i->snapshot == k.k->p.snapshot) {
			snapshots_seen_exit(&i->seen);
			*i = n;
			return 0;
		}

		if (i->snapshot >= k.k->p.snapshot)
			break;
	}

	return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
}

static int overlapping_extents_found(struct btree_trans *trans,
				     enum btree_id btree,
				     struct bpos pos1, struct snapshots_seen *pos1_seen,
				     struct bkey pos2,
				     bool *fixed,
				     struct extent_end *extent_end)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	struct btree_iter iter1, iter2 = { NULL };
	struct bkey_s_c k1, k2;
	int ret;

	BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));

	bch2_trans_iter_init(trans, &iter1, btree, pos1,
			     BTREE_ITER_all_snapshots|
			     BTREE_ITER_not_extents);
	k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX));
	ret = bkey_err(k1);
	if (ret)
		goto err;

	prt_str(&buf, "\n ");
	bch2_bkey_val_to_text(&buf, c, k1);

	if (!bpos_eq(pos1, k1.k->p)) {
		prt_str(&buf, "\n wanted\n ");
		bch2_bpos_to_text(&buf, pos1);
		prt_str(&buf, "\n ");
		bch2_bkey_to_text(&buf, &pos2);

		bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
			__func__, buf.buf);
		ret = -BCH_ERR_internal_fsck_err;
		goto err;
	}

	bch2_trans_copy_iter(&iter2, &iter1);

	while (1) {
		bch2_btree_iter_advance(&iter2);

		k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX));
		ret = bkey_err(k2);
		if (ret)
			goto err;

		if (bpos_ge(k2.k->p, pos2.p))
			break;
	}

	prt_str(&buf, "\n ");
	bch2_bkey_val_to_text(&buf, c, k2);

	if (bpos_gt(k2.k->p, pos2.p) ||
	    pos2.size != k2.k->size) {
		bch_err(c, "%s: error finding second overlapping extent when repairing%s",
			__func__, buf.buf);
		ret = -BCH_ERR_internal_fsck_err;
		goto err;
	}

	prt_printf(&buf, "\n overwriting %s extent",
		   pos1.snapshot >= pos2.p.snapshot ?
"first" : "second"); 1803 1804 if (fsck_err(trans, extent_overlapping, 1805 "overlapping extents%s", buf.buf)) { 1806 struct btree_iter *old_iter = &iter1; 1807 struct disk_reservation res = { 0 }; 1808 1809 if (pos1.snapshot < pos2.p.snapshot) { 1810 old_iter = &iter2; 1811 swap(k1, k2); 1812 } 1813 1814 trans->extra_disk_res += bch2_bkey_sectors_compressed(k2); 1815 1816 ret = bch2_trans_update_extent_overwrite(trans, old_iter, 1817 BTREE_UPDATE_internal_snapshot_node, 1818 k1, k2) ?: 1819 bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc); 1820 bch2_disk_reservation_put(c, &res); 1821 1822 if (ret) 1823 goto err; 1824 1825 *fixed = true; 1826 1827 if (pos1.snapshot == pos2.p.snapshot) { 1828 /* 1829 * We overwrote the first extent, and did the overwrite 1830 * in the same snapshot: 1831 */ 1832 extent_end->offset = bkey_start_offset(&pos2); 1833 } else if (pos1.snapshot > pos2.p.snapshot) { 1834 /* 1835 * We overwrote the first extent in pos2's snapshot: 1836 */ 1837 ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot); 1838 } else { 1839 /* 1840 * We overwrote the second extent - restart 1841 * check_extent() from the top: 1842 */ 1843 ret = -BCH_ERR_transaction_restart_nested; 1844 } 1845 } 1846 fsck_err: 1847 err: 1848 bch2_trans_iter_exit(trans, &iter2); 1849 bch2_trans_iter_exit(trans, &iter1); 1850 printbuf_exit(&buf); 1851 return ret; 1852 } 1853 1854 static int check_overlapping_extents(struct btree_trans *trans, 1855 struct snapshots_seen *seen, 1856 struct extent_ends *extent_ends, 1857 struct bkey_s_c k, 1858 struct btree_iter *iter, 1859 bool *fixed) 1860 { 1861 struct bch_fs *c = trans->c; 1862 int ret = 0; 1863 1864 /* transaction restart, running again */ 1865 if (bpos_eq(extent_ends->last_pos, k.k->p)) 1866 return 0; 1867 1868 if (extent_ends->last_pos.inode != k.k->p.inode) 1869 extent_ends_reset(extent_ends); 1870 1871 darray_for_each(extent_ends->e, i) { 1872 if (i->offset <= bkey_start_offset(k.k)) 1873 continue; 1874 1875 if (!ref_visible2(c, 1876 k.k->p.snapshot, seen, 1877 i->snapshot, &i->seen)) 1878 continue; 1879 1880 ret = overlapping_extents_found(trans, iter->btree_id, 1881 SPOS(iter->pos.inode, 1882 i->offset, 1883 i->snapshot), 1884 &i->seen, 1885 *k.k, fixed, i); 1886 if (ret) 1887 goto err; 1888 } 1889 1890 extent_ends->last_pos = k.k->p; 1891 err: 1892 return ret; 1893 } 1894 1895 static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter, 1896 struct bkey_s_c k) 1897 { 1898 struct bch_fs *c = trans->c; 1899 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 1900 struct bch_extent_crc_unpacked crc; 1901 const union bch_extent_entry *i; 1902 unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9; 1903 1904 bkey_for_each_crc(k.k, ptrs, crc, i) 1905 if (crc_is_encoded(crc) && 1906 crc.uncompressed_size > encoded_extent_max_sectors) { 1907 struct printbuf buf = PRINTBUF; 1908 1909 bch2_bkey_val_to_text(&buf, c, k); 1910 bch_err(c, "overbig encoded extent, please report this:\n %s", buf.buf); 1911 printbuf_exit(&buf); 1912 } 1913 1914 return 0; 1915 } 1916 1917 static int check_extent(struct btree_trans *trans, struct btree_iter *iter, 1918 struct bkey_s_c k, 1919 struct inode_walker *inode, 1920 struct snapshots_seen *s, 1921 struct extent_ends *extent_ends, 1922 struct disk_reservation *res) 1923 { 1924 struct bch_fs *c = trans->c; 1925 struct printbuf buf = PRINTBUF; 1926 int ret = 0; 1927 1928 ret = bch2_check_key_has_snapshot(trans, iter, k); 1929 if (ret) { 1930 ret = ret < 0 ? 
ret : 0;
		goto out;
	}

	if (inode->last_pos.inode != k.k->p.inode && inode->have_inodes) {
		ret = check_i_sectors(trans, inode);
		if (ret)
			goto err;
	}

	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
	if (ret)
		goto err;

	struct inode_walker_entry *extent_i = walk_inode(trans, inode, k);
	ret = PTR_ERR_OR_ZERO(extent_i);
	if (ret)
		goto err;

	ret = check_key_has_inode(trans, iter, inode, extent_i, k);
	if (ret)
		goto err;

	if (k.k->type != KEY_TYPE_whiteout) {
		ret = check_overlapping_extents(trans, s, extent_ends, k, iter,
						&inode->recalculate_sums);
		if (ret)
			goto err;

		/*
		 * Check inodes in reverse order, from oldest snapshots to
		 * newest, starting from the inode that matches this extent's
		 * snapshot. If we didn't have one, iterate over all inodes:
		 */
		for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
		     inode->inodes.data && i >= inode->inodes.data;
		     --i) {
			if (i->snapshot > k.k->p.snapshot ||
			    !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
				continue;

			if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
					!bkey_extent_is_reservation(k),
					trans, extent_past_end_of_inode,
					"extent type past end of inode %llu:%u, i_size %llu\n %s",
					i->inode.bi_inum, i->snapshot, i->inode.bi_size,
					(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
				struct btree_iter iter2;

				bch2_trans_copy_iter(&iter2, iter);
				bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
				ret =   bch2_btree_iter_traverse(&iter2) ?:
					bch2_btree_delete_at(trans, &iter2,
						BTREE_UPDATE_internal_snapshot_node);
				bch2_trans_iter_exit(trans, &iter2);
				if (ret)
					goto err;

				iter->k.type = KEY_TYPE_whiteout;
				break;
			}
		}
	}

	ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc);
	if (ret)
		goto err;

	if (bkey_extent_is_allocation(k.k)) {
		for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
		     inode->inodes.data && i >= inode->inodes.data;
		     --i) {
			if (i->snapshot > k.k->p.snapshot ||
			    !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
				continue;

			i->count += k.k->size;
		}
	}

	if (k.k->type != KEY_TYPE_whiteout) {
		ret = extent_ends_at(c, extent_ends, s, k);
		if (ret)
			goto err;
	}
out:
err:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Walk extents: verify that extents have a corresponding S_ISREG inode, and
 * that i_size and i_sectors are consistent
 */
int bch2_check_extents(struct bch_fs *c)
{
	struct inode_walker w = inode_walker_init();
	struct snapshots_seen s;
	struct extent_ends extent_ends;
	struct disk_reservation res = { 0 };

	snapshots_seen_init(&s);
	extent_ends_init(&extent_ends);

	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_extents,
				POS(BCACHEFS_ROOT_INO, 0),
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
			bch2_disk_reservation_put(c, &res);
			check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?:
			check_extent_overbig(trans, &iter, k);
		})) ?:
		check_i_sectors_notnested(trans, &w));

	bch2_disk_reservation_put(c, &res);
extent_ends_exit(&extent_ends); 2049 inode_walker_exit(&w); 2050 snapshots_seen_exit(&s); 2051 2052 bch_err_fn(c, ret); 2053 return ret; 2054 } 2055 2056 int bch2_check_indirect_extents(struct bch_fs *c) 2057 { 2058 struct disk_reservation res = { 0 }; 2059 2060 int ret = bch2_trans_run(c, 2061 for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, 2062 POS_MIN, 2063 BTREE_ITER_prefetch, k, 2064 &res, NULL, 2065 BCH_TRANS_COMMIT_no_enospc, ({ 2066 bch2_disk_reservation_put(c, &res); 2067 check_extent_overbig(trans, &iter, k); 2068 }))); 2069 2070 bch2_disk_reservation_put(c, &res); 2071 bch_err_fn(c, ret); 2072 return ret; 2073 } 2074 2075 static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w) 2076 { 2077 struct bch_fs *c = trans->c; 2078 int ret = 0; 2079 s64 count2; 2080 2081 darray_for_each(w->inodes, i) { 2082 if (i->inode.bi_nlink == i->count) 2083 continue; 2084 2085 count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot); 2086 if (count2 < 0) 2087 return count2; 2088 2089 if (i->count != count2) { 2090 bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu", 2091 w->last_pos.inode, i->snapshot, i->count, count2); 2092 i->count = count2; 2093 if (i->inode.bi_nlink == i->count) 2094 continue; 2095 } 2096 2097 if (fsck_err_on(i->inode.bi_nlink != i->count, 2098 trans, inode_dir_wrong_nlink, 2099 "directory %llu:%u with wrong i_nlink: got %u, should be %llu", 2100 w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) { 2101 i->inode.bi_nlink = i->count; 2102 ret = bch2_fsck_write_inode(trans, &i->inode); 2103 if (ret) 2104 break; 2105 } 2106 } 2107 fsck_err: 2108 bch_err_fn(c, ret); 2109 return ret; 2110 } 2111 2112 static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) 2113 { 2114 u32 restart_count = trans->restart_count; 2115 return check_subdir_count_notnested(trans, w) ?: 2116 trans_was_restarted(trans, restart_count); 2117 } 2118 2119 noinline_for_stack 2120 static int check_dirent_inode_dirent(struct btree_trans *trans, 2121 struct btree_iter *iter, 2122 struct bkey_s_c_dirent d, 2123 struct bch_inode_unpacked *target) 2124 { 2125 struct bch_fs *c = trans->c; 2126 struct printbuf buf = PRINTBUF; 2127 struct btree_iter bp_iter = { NULL }; 2128 int ret = 0; 2129 2130 if (inode_points_to_dirent(target, d)) 2131 return 0; 2132 2133 if (!target->bi_dir && 2134 !target->bi_dir_offset) { 2135 fsck_err_on(S_ISDIR(target->bi_mode), 2136 trans, inode_dir_missing_backpointer, 2137 "directory with missing backpointer\n%s", 2138 (printbuf_reset(&buf), 2139 bch2_bkey_val_to_text(&buf, c, d.s_c), 2140 prt_printf(&buf, "\n"), 2141 bch2_inode_unpacked_to_text(&buf, target), 2142 buf.buf)); 2143 2144 fsck_err_on(target->bi_flags & BCH_INODE_unlinked, 2145 trans, inode_unlinked_but_has_dirent, 2146 "inode unlinked but has dirent\n%s", 2147 (printbuf_reset(&buf), 2148 bch2_bkey_val_to_text(&buf, c, d.s_c), 2149 prt_printf(&buf, "\n"), 2150 bch2_inode_unpacked_to_text(&buf, target), 2151 buf.buf)); 2152 2153 target->bi_flags &= ~BCH_INODE_unlinked; 2154 target->bi_dir = d.k->p.inode; 2155 target->bi_dir_offset = d.k->p.offset; 2156 return __bch2_fsck_write_inode(trans, target); 2157 } 2158 2159 if (bch2_inode_should_have_bp(target) && 2160 !fsck_err(trans, inode_wrong_backpointer, 2161 "dirent points to inode that does not point back:\n %s", 2162 (bch2_bkey_val_to_text(&buf, c, d.s_c), 2163 prt_printf(&buf, "\n "), 2164 bch2_inode_unpacked_to_text(&buf, target), 2165 
buf.buf))) 2166 goto err; 2167 2168 struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter, 2169 SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot)); 2170 ret = bkey_err(bp_dirent); 2171 if (ret && !bch2_err_matches(ret, ENOENT)) 2172 goto err; 2173 2174 bool backpointer_exists = !ret; 2175 ret = 0; 2176 2177 if (fsck_err_on(!backpointer_exists, 2178 trans, inode_wrong_backpointer, 2179 "inode %llu:%u has wrong backpointer:\n" 2180 "got %llu:%llu\n" 2181 "should be %llu:%llu", 2182 target->bi_inum, target->bi_snapshot, 2183 target->bi_dir, 2184 target->bi_dir_offset, 2185 d.k->p.inode, 2186 d.k->p.offset)) { 2187 target->bi_dir = d.k->p.inode; 2188 target->bi_dir_offset = d.k->p.offset; 2189 ret = __bch2_fsck_write_inode(trans, target); 2190 goto out; 2191 } 2192 2193 bch2_bkey_val_to_text(&buf, c, d.s_c); 2194 prt_newline(&buf); 2195 if (backpointer_exists) 2196 bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); 2197 2198 if (fsck_err_on(backpointer_exists && 2199 (S_ISDIR(target->bi_mode) || 2200 target->bi_subvol), 2201 trans, inode_dir_multiple_links, 2202 "%s %llu:%u with multiple links\n%s", 2203 S_ISDIR(target->bi_mode) ? "directory" : "subvolume", 2204 target->bi_inum, target->bi_snapshot, buf.buf)) { 2205 ret = __remove_dirent(trans, d.k->p); 2206 goto out; 2207 } 2208 2209 /* 2210 * hardlinked file with nlink 0: 2211 * We're just adjusting nlink here so check_nlinks() will pick 2212 * it up, it ignores inodes with nlink 0 2213 */ 2214 if (fsck_err_on(backpointer_exists && !target->bi_nlink, 2215 trans, inode_multiple_links_but_nlink_0, 2216 "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", 2217 target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { 2218 target->bi_nlink++; 2219 target->bi_flags &= ~BCH_INODE_unlinked; 2220 ret = __bch2_fsck_write_inode(trans, target); 2221 if (ret) 2222 goto err; 2223 } 2224 out: 2225 err: 2226 fsck_err: 2227 bch2_trans_iter_exit(trans, &bp_iter); 2228 printbuf_exit(&buf); 2229 bch_err_fn(c, ret); 2230 return ret; 2231 } 2232 2233 noinline_for_stack 2234 static int check_dirent_target(struct btree_trans *trans, 2235 struct btree_iter *iter, 2236 struct bkey_s_c_dirent d, 2237 struct bch_inode_unpacked *target) 2238 { 2239 struct bch_fs *c = trans->c; 2240 struct bkey_i_dirent *n; 2241 struct printbuf buf = PRINTBUF; 2242 int ret = 0; 2243 2244 ret = check_dirent_inode_dirent(trans, iter, d, target); 2245 if (ret) 2246 goto err; 2247 2248 if (fsck_err_on(d.v->d_type != inode_d_type(target), 2249 trans, dirent_d_type_wrong, 2250 "incorrect d_type: got %s, should be %s:\n%s", 2251 bch2_d_type_str(d.v->d_type), 2252 bch2_d_type_str(inode_d_type(target)), 2253 (printbuf_reset(&buf), 2254 bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { 2255 n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); 2256 ret = PTR_ERR_OR_ZERO(n); 2257 if (ret) 2258 goto err; 2259 2260 bkey_reassemble(&n->k_i, d.s_c); 2261 n->v.d_type = inode_d_type(target); 2262 if (n->v.d_type == DT_SUBVOL) { 2263 n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); 2264 n->v.d_child_subvol = cpu_to_le32(target->bi_subvol); 2265 } else { 2266 n->v.d_inum = cpu_to_le64(target->bi_inum); 2267 } 2268 2269 ret = bch2_trans_update(trans, iter, &n->k_i, 0); 2270 if (ret) 2271 goto err; 2272 2273 d = dirent_i_to_s_c(n); 2274 } 2275 err: 2276 fsck_err: 2277 printbuf_exit(&buf); 2278 bch_err_fn(c, ret); 2279 return ret; 2280 } 2281 2282 /* find a subvolume that's a descendent of @snapshot: */ 2283 static int find_snapshot_subvol(struct 
btree_trans *trans, u32 snapshot, u32 *subvolid) 2284 { 2285 struct btree_iter iter; 2286 struct bkey_s_c k; 2287 int ret; 2288 2289 for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) { 2290 if (k.k->type != KEY_TYPE_subvolume) 2291 continue; 2292 2293 struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); 2294 if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) { 2295 bch2_trans_iter_exit(trans, &iter); 2296 *subvolid = k.k->p.offset; 2297 goto found; 2298 } 2299 } 2300 if (!ret) 2301 ret = -ENOENT; 2302 found: 2303 bch2_trans_iter_exit(trans, &iter); 2304 return ret; 2305 } 2306 2307 noinline_for_stack 2308 static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter, 2309 struct bkey_s_c_dirent d) 2310 { 2311 struct bch_fs *c = trans->c; 2312 struct btree_iter subvol_iter = {}; 2313 struct bch_inode_unpacked subvol_root; 2314 u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol); 2315 u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); 2316 u32 parent_snapshot; 2317 u32 new_parent_subvol = 0; 2318 u64 parent_inum; 2319 struct printbuf buf = PRINTBUF; 2320 int ret = 0; 2321 2322 ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum); 2323 if (ret && !bch2_err_matches(ret, ENOENT)) 2324 return ret; 2325 2326 if (ret || 2327 (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) { 2328 int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol); 2329 if (ret2 && !bch2_err_matches(ret2, ENOENT)) 2330 return ret2; 2331 } 2332 2333 if (ret && 2334 !new_parent_subvol && 2335 (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) { 2336 /* 2337 * Couldn't find a subvol for dirent's snapshot - but we lost 2338 * subvols, so we need to reconstruct: 2339 */ 2340 ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0); 2341 if (ret) 2342 return ret; 2343 2344 parent_snapshot = d.k->p.snapshot; 2345 } 2346 2347 if (fsck_err_on(ret, 2348 trans, dirent_to_missing_parent_subvol, 2349 "dirent parent_subvol points to missing subvolume\n%s", 2350 (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) || 2351 fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot), 2352 trans, dirent_not_visible_in_parent_subvol, 2353 "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s", 2354 parent_snapshot, 2355 (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { 2356 if (!new_parent_subvol) { 2357 bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot); 2358 return -BCH_ERR_fsck_repair_unimplemented; 2359 } 2360 2361 struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent); 2362 ret = PTR_ERR_OR_ZERO(new_dirent); 2363 if (ret) 2364 goto err; 2365 2366 new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol); 2367 } 2368 2369 struct bkey_s_c_subvolume s = 2370 bch2_bkey_get_iter_typed(trans, &subvol_iter, 2371 BTREE_ID_subvolumes, POS(0, target_subvol), 2372 0, subvolume); 2373 ret = bkey_err(s.s_c); 2374 if (ret && !bch2_err_matches(ret, ENOENT)) 2375 return ret; 2376 2377 if (ret) { 2378 if (fsck_err(trans, dirent_to_missing_subvol, 2379 "dirent points to missing subvolume\n%s", 2380 (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) 2381 return __remove_dirent(trans, d.k->p); 2382 ret = 0; 2383 goto out; 2384 } 2385 2386 if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol, 2387 trans, subvol_fs_path_parent_wrong, 2388 "subvol with wrong fs_path_parent, should be %u\n%s", 2389 parent_subvol, 2390 (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { 2391 struct bkey_i_subvolume *n = 2392 bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume); 2393 ret = PTR_ERR_OR_ZERO(n); 2394 if (ret) 2395 goto err; 2396 2397 n->v.fs_path_parent = cpu_to_le32(parent_subvol); 2398 } 2399 2400 u64 target_inum = le64_to_cpu(s.v->inode); 2401 u32 target_snapshot = le32_to_cpu(s.v->snapshot); 2402 2403 ret = lookup_inode(trans, target_inum, target_snapshot, &subvol_root); 2404 if (ret && !bch2_err_matches(ret, ENOENT)) 2405 goto err; 2406 2407 if (ret) { 2408 bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum); 2409 ret = -BCH_ERR_fsck_repair_unimplemented; 2410 goto err; 2411 } 2412 2413 if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol, 2414 trans, inode_bi_parent_wrong, 2415 "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u", 2416 target_inum, 2417 subvol_root.bi_parent_subvol, parent_subvol)) { 2418 subvol_root.bi_parent_subvol = parent_subvol; 2419 subvol_root.bi_snapshot = le32_to_cpu(s.v->snapshot); 2420 ret = __bch2_fsck_write_inode(trans, &subvol_root); 2421 if (ret) 2422 goto err; 2423 } 2424 2425 ret = check_dirent_target(trans, iter, d, &subvol_root); 2426 if (ret) 2427 goto err; 2428 out: 2429 err: 2430 fsck_err: 2431 bch2_trans_iter_exit(trans, &subvol_iter); 2432 printbuf_exit(&buf); 2433 return ret; 2434 } 2435 2436 static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, 2437 struct bkey_s_c k, 2438 struct bch_hash_info *hash_info, 2439 struct inode_walker *dir, 2440 struct inode_walker *target, 2441 struct snapshots_seen *s) 2442 { 2443 struct bch_fs *c = trans->c; 2444 struct inode_walker_entry *i; 2445 struct printbuf buf = PRINTBUF; 2446 int ret = 0; 2447 2448 ret = bch2_check_key_has_snapshot(trans, iter, k); 2449 if (ret) { 2450 ret = ret < 0 ?
ret : 0; 2451 goto out; 2452 } 2453 2454 ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); 2455 if (ret) 2456 goto err; 2457 2458 if (k.k->type == KEY_TYPE_whiteout) 2459 goto out; 2460 2461 if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) { 2462 ret = check_subdir_count(trans, dir); 2463 if (ret) 2464 goto err; 2465 } 2466 2467 i = walk_inode(trans, dir, k); 2468 ret = PTR_ERR_OR_ZERO(i); 2469 if (ret < 0) 2470 goto err; 2471 2472 ret = check_key_has_inode(trans, iter, dir, i, k); 2473 if (ret) 2474 goto err; 2475 2476 if (!i) 2477 goto out; 2478 2479 if (dir->first_this_inode) 2480 *hash_info = bch2_hash_info_init(c, &i->inode); 2481 dir->first_this_inode = false; 2482 2483 ret = hash_check_key(trans, s, bch2_dirent_hash_desc, hash_info, iter, k); 2484 if (ret < 0) 2485 goto err; 2486 if (ret) { 2487 /* dirent has been deleted */ 2488 ret = 0; 2489 goto out; 2490 } 2491 2492 if (k.k->type != KEY_TYPE_dirent) 2493 goto out; 2494 2495 struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); 2496 2497 if (d.v->d_type == DT_SUBVOL) { 2498 ret = check_dirent_to_subvol(trans, iter, d); 2499 if (ret) 2500 goto err; 2501 } else { 2502 ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum)); 2503 if (ret) 2504 goto err; 2505 2506 if (fsck_err_on(!target->inodes.nr, 2507 trans, dirent_to_missing_inode, 2508 "dirent points to missing inode:\n%s", 2509 (printbuf_reset(&buf), 2510 bch2_bkey_val_to_text(&buf, c, k), 2511 buf.buf))) { 2512 ret = __remove_dirent(trans, d.k->p); 2513 if (ret) 2514 goto err; 2515 } 2516 2517 darray_for_each(target->inodes, i) { 2518 ret = check_dirent_target(trans, iter, d, &i->inode); 2519 if (ret) 2520 goto err; 2521 } 2522 } 2523 2524 ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); 2525 if (ret) 2526 goto err; 2527 2528 if (d.v->d_type == DT_DIR) 2529 for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) 2530 i->count++; 2531 out: 2532 err: 2533 fsck_err: 2534 printbuf_exit(&buf); 2535 bch_err_fn(c, ret); 2536 return ret; 2537 } 2538 2539 /* 2540 * Walk dirents: verify that they all have a corresponding S_ISDIR inode, 2541 * validate d_type 2542 */ 2543 int bch2_check_dirents(struct bch_fs *c) 2544 { 2545 struct inode_walker dir = inode_walker_init(); 2546 struct inode_walker target = inode_walker_init(); 2547 struct snapshots_seen s; 2548 struct bch_hash_info hash_info; 2549 2550 snapshots_seen_init(&s); 2551 2552 int ret = bch2_trans_run(c, 2553 for_each_btree_key(trans, iter, BTREE_ID_dirents, 2554 POS(BCACHEFS_ROOT_INO, 0), 2555 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, 2556 check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?: 2557 check_subdir_count_notnested(trans, &dir)); 2558 2559 snapshots_seen_exit(&s); 2560 inode_walker_exit(&dir); 2561 inode_walker_exit(&target); 2562 bch_err_fn(c, ret); 2563 return ret; 2564 } 2565 2566 static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, 2567 struct bkey_s_c k, 2568 struct bch_hash_info *hash_info, 2569 struct inode_walker *inode) 2570 { 2571 struct bch_fs *c = trans->c; 2572 struct inode_walker_entry *i; 2573 int ret; 2574 2575 ret = bch2_check_key_has_snapshot(trans, iter, k); 2576 if (ret < 0) 2577 return ret; 2578 if (ret) 2579 return 0; 2580 2581 i = walk_inode(trans, inode, k); 2582 ret = PTR_ERR_OR_ZERO(i); 2583 if (ret) 2584 return ret; 2585 2586 ret = check_key_has_inode(trans, iter, inode, i, k); 2587 if (ret) 2588 return ret; 2589 2590 if (!i) 2591 return 0; 2592 2593 if (inode->first_this_inode) 2594 *hash_info = 
bch2_hash_info_init(c, &i->inode); 2595 inode->first_this_inode = false; 2596 2597 ret = hash_check_key(trans, NULL, bch2_xattr_hash_desc, hash_info, iter, k); 2598 bch_err_fn(c, ret); 2599 return ret; 2600 } 2601 2602 /* 2603 * Walk xattrs: verify that they all have a corresponding inode 2604 */ 2605 int bch2_check_xattrs(struct bch_fs *c) 2606 { 2607 struct inode_walker inode = inode_walker_init(); 2608 struct bch_hash_info hash_info; 2609 int ret = 0; 2610 2611 ret = bch2_trans_run(c, 2612 for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, 2613 POS(BCACHEFS_ROOT_INO, 0), 2614 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, 2615 k, 2616 NULL, NULL, 2617 BCH_TRANS_COMMIT_no_enospc, 2618 check_xattr(trans, &iter, k, &hash_info, &inode))); 2619 2620 inode_walker_exit(&inode); 2621 bch_err_fn(c, ret); 2622 return ret; 2623 } 2624 2625 static int check_root_trans(struct btree_trans *trans) 2626 { 2627 struct bch_fs *c = trans->c; 2628 struct bch_inode_unpacked root_inode; 2629 u32 snapshot; 2630 u64 inum; 2631 int ret; 2632 2633 ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum); 2634 if (ret && !bch2_err_matches(ret, ENOENT)) 2635 return ret; 2636 2637 if (mustfix_fsck_err_on(ret, trans, root_subvol_missing, 2638 "root subvol missing")) { 2639 struct bkey_i_subvolume *root_subvol = 2640 bch2_trans_kmalloc(trans, sizeof(*root_subvol)); 2641 ret = PTR_ERR_OR_ZERO(root_subvol); 2642 if (ret) 2643 goto err; 2644 2645 snapshot = U32_MAX; 2646 inum = BCACHEFS_ROOT_INO; 2647 2648 bkey_subvolume_init(&root_subvol->k_i); 2649 root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL; 2650 root_subvol->v.flags = 0; 2651 root_subvol->v.snapshot = cpu_to_le32(snapshot); 2652 root_subvol->v.inode = cpu_to_le64(inum); 2653 ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0); 2654 bch_err_msg(c, ret, "writing root subvol"); 2655 if (ret) 2656 goto err; 2657 } 2658 2659 ret = lookup_inode(trans, BCACHEFS_ROOT_INO, snapshot, &root_inode); 2660 if (ret && !bch2_err_matches(ret, ENOENT)) 2661 return ret; 2662 2663 if (mustfix_fsck_err_on(ret, 2664 trans, root_dir_missing, 2665 "root directory missing") || 2666 mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), 2667 trans, root_inode_not_dir, 2668 "root inode not a directory")) { 2669 bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 2670 0, NULL); 2671 root_inode.bi_inum = inum; 2672 root_inode.bi_snapshot = snapshot; 2673 2674 ret = __bch2_fsck_write_inode(trans, &root_inode); 2675 bch_err_msg(c, ret, "writing root inode"); 2676 } 2677 err: 2678 fsck_err: 2679 return ret; 2680 } 2681 2682 /* Get root directory, create if it doesn't exist: */ 2683 int bch2_check_root(struct bch_fs *c) 2684 { 2685 int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 2686 check_root_trans(trans)); 2687 bch_err_fn(c, ret); 2688 return ret; 2689 } 2690 2691 typedef DARRAY(u32) darray_u32; 2692 2693 static bool darray_u32_has(darray_u32 *d, u32 v) 2694 { 2695 darray_for_each(*d, i) 2696 if (*i == v) 2697 return true; 2698 return false; 2699 } 2700 2701 static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) 2702 { 2703 struct bch_fs *c = trans->c; 2704 struct btree_iter parent_iter = {}; 2705 darray_u32 subvol_path = {}; 2706 struct printbuf buf = PRINTBUF; 2707 int ret = 0; 2708 2709 if (k.k->type != KEY_TYPE_subvolume) 2710 return 0; 2711 2712 while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) { 2713 ret = darray_push(&subvol_path, k.k->p.offset); 2714 if (ret) 2715 goto err; 2716 
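/* Walk towards the root via fs_path_parent: each subvolume we visit was pushed onto subvol_path above, so finding a parent that is already on the path means the subvolume tree contains a loop. */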
2717 struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); 2718 2719 struct bch_inode_unpacked subvol_root; 2720 ret = bch2_inode_find_by_inum_trans(trans, 2721 (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) }, 2722 &subvol_root); 2723 if (ret) 2724 break; 2725 2726 u32 parent = le32_to_cpu(s.v->fs_path_parent); 2727 2728 if (darray_u32_has(&subvol_path, parent)) { 2729 if (fsck_err(c, subvol_loop, "subvolume loop")) 2730 ret = reattach_subvol(trans, s); 2731 break; 2732 } 2733 2734 bch2_trans_iter_exit(trans, &parent_iter); 2735 bch2_trans_iter_init(trans, &parent_iter, 2736 BTREE_ID_subvolumes, POS(0, parent), 0); 2737 k = bch2_btree_iter_peek_slot(&parent_iter); 2738 ret = bkey_err(k); 2739 if (ret) 2740 goto err; 2741 2742 if (fsck_err_on(k.k->type != KEY_TYPE_subvolume, 2743 trans, subvol_unreachable, 2744 "unreachable subvolume %s", 2745 (bch2_bkey_val_to_text(&buf, c, s.s_c), 2746 buf.buf))) { 2747 ret = reattach_subvol(trans, s); 2748 break; 2749 } 2750 } 2751 fsck_err: 2752 err: 2753 printbuf_exit(&buf); 2754 darray_exit(&subvol_path); 2755 bch2_trans_iter_exit(trans, &parent_iter); 2756 return ret; 2757 } 2758 2759 int bch2_check_subvolume_structure(struct bch_fs *c) 2760 { 2761 int ret = bch2_trans_run(c, 2762 for_each_btree_key_commit(trans, iter, 2763 BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, 2764 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 2765 check_subvol_path(trans, &iter, k))); 2766 bch_err_fn(c, ret); 2767 return ret; 2768 } 2769 2770 struct pathbuf_entry { 2771 u64 inum; 2772 u32 snapshot; 2773 }; 2774 2775 typedef DARRAY(struct pathbuf_entry) pathbuf; 2776 2777 static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) 2778 { 2779 darray_for_each(*p, i) 2780 if (i->inum == inum && 2781 i->snapshot == snapshot) 2782 return true; 2783 return false; 2784 } 2785 2786 static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k) 2787 { 2788 struct bch_fs *c = trans->c; 2789 struct btree_iter inode_iter = {}; 2790 struct bch_inode_unpacked inode; 2791 struct printbuf buf = PRINTBUF; 2792 u32 snapshot = inode_k.k->p.snapshot; 2793 int ret = 0; 2794 2795 p->nr = 0; 2796 2797 BUG_ON(bch2_inode_unpack(inode_k, &inode)); 2798 2799 if (!S_ISDIR(inode.bi_mode)) 2800 return 0; 2801 2802 while (!inode.bi_subvol) { 2803 struct btree_iter dirent_iter; 2804 struct bkey_s_c_dirent d; 2805 u32 parent_snapshot = snapshot; 2806 2807 d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot); 2808 ret = bkey_err(d.s_c); 2809 if (ret && !bch2_err_matches(ret, ENOENT)) 2810 break; 2811 2812 if (!ret && (ret = dirent_points_to_inode(c, d, &inode))) 2813 bch2_trans_iter_exit(trans, &dirent_iter); 2814 2815 if (bch2_err_matches(ret, ENOENT)) { 2816 printbuf_reset(&buf); 2817 bch2_bkey_val_to_text(&buf, c, inode_k); 2818 bch_err(c, "unreachable inode in check_directory_structure: %s\n%s", 2819 bch2_err_str(ret), buf.buf); 2820 goto out; 2821 } 2822 2823 bch2_trans_iter_exit(trans, &dirent_iter); 2824 2825 ret = darray_push(p, ((struct pathbuf_entry) { 2826 .inum = inode.bi_inum, 2827 .snapshot = snapshot, 2828 })); 2829 if (ret) 2830 return ret; 2831 2832 snapshot = parent_snapshot; 2833 2834 bch2_trans_iter_exit(trans, &inode_iter); 2835 inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, 2836 SPOS(0, inode.bi_dir, snapshot), 0); 2837 ret = bkey_err(inode_k) ?: 2838 !bkey_is_inode(inode_k.k) ? 
-BCH_ERR_ENOENT_inode 2839 : bch2_inode_unpack(inode_k, &inode); 2840 if (ret) { 2841 /* Should have been caught in dirents pass */ 2842 bch_err_msg(c, ret, "error looking up parent directory"); 2843 break; 2844 } 2845 2846 snapshot = inode_k.k->p.snapshot; 2847 2848 if (path_is_dup(p, inode.bi_inum, snapshot)) { 2849 /* XXX print path */ 2850 bch_err(c, "directory structure loop"); 2851 2852 darray_for_each(*p, i) 2853 pr_err("%llu:%u", i->inum, i->snapshot); 2854 pr_err("%llu:%u", inode.bi_inum, snapshot); 2855 2856 if (fsck_err(trans, dir_loop, "directory structure loop")) { 2857 ret = remove_backpointer(trans, &inode); 2858 bch_err_msg(c, ret, "removing dirent"); 2859 if (ret) 2860 break; 2861 2862 ret = reattach_inode(trans, &inode); 2863 bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); 2864 } 2865 break; 2866 } 2867 } 2868 out: 2869 fsck_err: 2870 bch2_trans_iter_exit(trans, &inode_iter); 2871 printbuf_exit(&buf); 2872 bch_err_fn(c, ret); 2873 return ret; 2874 } 2875 2876 /* 2877 * Check for loops in the directory structure: all other connectivity issues 2878 * have been fixed by prior passes 2879 */ 2880 int bch2_check_directory_structure(struct bch_fs *c) 2881 { 2882 pathbuf path = { 0, }; 2883 int ret; 2884 2885 ret = bch2_trans_run(c, 2886 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN, 2887 BTREE_ITER_intent| 2888 BTREE_ITER_prefetch| 2889 BTREE_ITER_all_snapshots, k, 2890 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ 2891 if (!bkey_is_inode(k.k)) 2892 continue; 2893 2894 if (bch2_inode_flags(k) & BCH_INODE_unlinked) 2895 continue; 2896 2897 check_path(trans, &path, k); 2898 }))); 2899 darray_exit(&path); 2900 2901 bch_err_fn(c, ret); 2902 return ret; 2903 } 2904 2905 struct nlink_table { 2906 size_t nr; 2907 size_t size; 2908 2909 struct nlink { 2910 u64 inum; 2911 u32 snapshot; 2912 u32 count; 2913 } *d; 2914 }; 2915 2916 static int add_nlink(struct bch_fs *c, struct nlink_table *t, 2917 u64 inum, u32 snapshot) 2918 { 2919 if (t->nr == t->size) { 2920 size_t new_size = max_t(size_t, 128UL, t->size * 2); 2921 void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL); 2922 2923 if (!d) { 2924 bch_err(c, "fsck: error allocating memory for nlink_table, size %zu", 2925 new_size); 2926 return -BCH_ERR_ENOMEM_fsck_add_nlink; 2927 } 2928 2929 if (t->d) 2930 memcpy(d, t->d, t->size * sizeof(t->d[0])); 2931 kvfree(t->d); 2932 2933 t->d = d; 2934 t->size = new_size; 2935 } 2936 2937 2938 t->d[t->nr++] = (struct nlink) { 2939 .inum = inum, 2940 .snapshot = snapshot, 2941 }; 2942 2943 return 0; 2944 } 2945 2946 static int nlink_cmp(const void *_l, const void *_r) 2947 { 2948 const struct nlink *l = _l; 2949 const struct nlink *r = _r; 2950 2951 return cmp_int(l->inum, r->inum); 2952 } 2953 2954 static void inc_link(struct bch_fs *c, struct snapshots_seen *s, 2955 struct nlink_table *links, 2956 u64 range_start, u64 range_end, u64 inum, u32 snapshot) 2957 { 2958 struct nlink *link, key = { 2959 .inum = inum, .snapshot = U32_MAX, 2960 }; 2961 2962 if (inum < range_start || inum >= range_end) 2963 return; 2964 2965 link = __inline_bsearch(&key, links->d, links->nr, 2966 sizeof(links->d[0]), nlink_cmp); 2967 if (!link) 2968 return; 2969 2970 while (link > links->d && link[0].inum == link[-1].inum) 2971 --link; 2972 2973 for (; link < links->d + links->nr && link->inum == inum; link++) 2974 if (ref_visible(c, s, snapshot, link->snapshot)) { 2975 link->count++; 2976 if (link->snapshot >= snapshot) 2977 break; 2978 } 2979 } 2980 2981 noinline_for_stack 2982 static 
int check_nlinks_find_hardlinks(struct bch_fs *c, 2983 struct nlink_table *t, 2984 u64 start, u64 *end) 2985 { 2986 int ret = bch2_trans_run(c, 2987 for_each_btree_key(trans, iter, BTREE_ID_inodes, 2988 POS(0, start), 2989 BTREE_ITER_intent| 2990 BTREE_ITER_prefetch| 2991 BTREE_ITER_all_snapshots, k, ({ 2992 if (!bkey_is_inode(k.k)) 2993 continue; 2994 2995 /* Should never fail, checked by bch2_inode_invalid: */ 2996 struct bch_inode_unpacked u; 2997 BUG_ON(bch2_inode_unpack(k, &u)); 2998 2999 /* 3000 * Backpointer and directory structure checks are sufficient for 3001 * directories, since they can't have hardlinks: 3002 */ 3003 if (S_ISDIR(u.bi_mode)) 3004 continue; 3005 3006 /* 3007 * Previous passes ensured that bi_nlink is nonzero if 3008 * it had multiple hardlinks: 3009 */ 3010 if (!u.bi_nlink) 3011 continue; 3012 3013 ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot); 3014 if (ret) { 3015 *end = k.k->p.offset; 3016 ret = 0; 3017 break; 3018 } 3019 0; 3020 }))); 3021 3022 bch_err_fn(c, ret); 3023 return ret; 3024 } 3025 3026 noinline_for_stack 3027 static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links, 3028 u64 range_start, u64 range_end) 3029 { 3030 struct snapshots_seen s; 3031 3032 snapshots_seen_init(&s); 3033 3034 int ret = bch2_trans_run(c, 3035 for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN, 3036 BTREE_ITER_intent| 3037 BTREE_ITER_prefetch| 3038 BTREE_ITER_all_snapshots, k, ({ 3039 ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p); 3040 if (ret) 3041 break; 3042 3043 if (k.k->type == KEY_TYPE_dirent) { 3044 struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); 3045 3046 if (d.v->d_type != DT_DIR && 3047 d.v->d_type != DT_SUBVOL) 3048 inc_link(c, &s, links, range_start, range_end, 3049 le64_to_cpu(d.v->d_inum), d.k->p.snapshot); 3050 } 3051 0; 3052 }))); 3053 3054 snapshots_seen_exit(&s); 3055 3056 bch_err_fn(c, ret); 3057 return ret; 3058 } 3059 3060 static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter, 3061 struct bkey_s_c k, 3062 struct nlink_table *links, 3063 size_t *idx, u64 range_end) 3064 { 3065 struct bch_inode_unpacked u; 3066 struct nlink *link = &links->d[*idx]; 3067 int ret = 0; 3068 3069 if (k.k->p.offset >= range_end) 3070 return 1; 3071 3072 if (!bkey_is_inode(k.k)) 3073 return 0; 3074 3075 BUG_ON(bch2_inode_unpack(k, &u)); 3076 3077 if (S_ISDIR(u.bi_mode)) 3078 return 0; 3079 3080 if (!u.bi_nlink) 3081 return 0; 3082 3083 while ((cmp_int(link->inum, k.k->p.offset) ?: 3084 cmp_int(link->snapshot, k.k->p.snapshot)) < 0) { 3085 BUG_ON(*idx == links->nr); 3086 link = &links->d[++*idx]; 3087 } 3088 3089 if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, 3090 trans, inode_wrong_nlink, 3091 "inode %llu type %s has wrong i_nlink (%u, should be %u)", 3092 u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], 3093 bch2_inode_nlink_get(&u), link->count)) { 3094 bch2_inode_nlink_set(&u, link->count); 3095 ret = __bch2_fsck_write_inode(trans, &u); 3096 } 3097 fsck_err: 3098 return ret; 3099 } 3100 3101 noinline_for_stack 3102 static int check_nlinks_update_hardlinks(struct bch_fs *c, 3103 struct nlink_table *links, 3104 u64 range_start, u64 range_end) 3105 { 3106 size_t idx = 0; 3107 3108 int ret = bch2_trans_run(c, 3109 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, 3110 POS(0, range_start), 3111 BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, 3112 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 3113 check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end))); 
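/* check_nlinks_update_inode() returns 1 once the walk passes range_end, which just terminates this iteration batch; only a negative return is a real error. */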
3114 if (ret < 0) { 3115 bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret)); 3116 return ret; 3117 } 3118 3119 return 0; 3120 } 3121 3122 int bch2_check_nlinks(struct bch_fs *c) 3123 { 3124 struct nlink_table links = { 0 }; 3125 u64 this_iter_range_start, next_iter_range_start = 0; 3126 int ret = 0; 3127 3128 do { 3129 this_iter_range_start = next_iter_range_start; 3130 next_iter_range_start = U64_MAX; 3131 3132 ret = check_nlinks_find_hardlinks(c, &links, 3133 this_iter_range_start, 3134 &next_iter_range_start); 3135 3136 ret = check_nlinks_walk_dirents(c, &links, 3137 this_iter_range_start, 3138 next_iter_range_start); 3139 if (ret) 3140 break; 3141 3142 ret = check_nlinks_update_hardlinks(c, &links, 3143 this_iter_range_start, 3144 next_iter_range_start); 3145 if (ret) 3146 break; 3147 3148 links.nr = 0; 3149 } while (next_iter_range_start != U64_MAX); 3150 3151 kvfree(links.d); 3152 bch_err_fn(c, ret); 3153 return ret; 3154 } 3155 3156 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, 3157 struct bkey_s_c k) 3158 { 3159 struct bkey_s_c_reflink_p p; 3160 struct bkey_i_reflink_p *u; 3161 3162 if (k.k->type != KEY_TYPE_reflink_p) 3163 return 0; 3164 3165 p = bkey_s_c_to_reflink_p(k); 3166 3167 if (!p.v->front_pad && !p.v->back_pad) 3168 return 0; 3169 3170 u = bch2_trans_kmalloc(trans, sizeof(*u)); 3171 int ret = PTR_ERR_OR_ZERO(u); 3172 if (ret) 3173 return ret; 3174 3175 bkey_reassemble(&u->k_i, k); 3176 u->v.front_pad = 0; 3177 u->v.back_pad = 0; 3178 3179 return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun); 3180 } 3181 3182 int bch2_fix_reflink_p(struct bch_fs *c) 3183 { 3184 if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) 3185 return 0; 3186 3187 int ret = bch2_trans_run(c, 3188 for_each_btree_key_commit(trans, iter, 3189 BTREE_ID_extents, POS_MIN, 3190 BTREE_ITER_intent|BTREE_ITER_prefetch| 3191 BTREE_ITER_all_snapshots, k, 3192 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 3193 fix_reflink_p_key(trans, &iter, k))); 3194 bch_err_fn(c, ret); 3195 return ret; 3196 } 3197