1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "bcachefs.h" 4 #include "bcachefs_ioctl.h" 5 #include "bkey_buf.h" 6 #include "btree_cache.h" 7 #include "btree_update.h" 8 #include "buckets.h" 9 #include "darray.h" 10 #include "dirent.h" 11 #include "error.h" 12 #include "fs.h" 13 #include "fs-common.h" 14 #include "fsck.h" 15 #include "inode.h" 16 #include "keylist.h" 17 #include "recovery_passes.h" 18 #include "snapshot.h" 19 #include "super.h" 20 #include "thread_with_file.h" 21 #include "xattr.h" 22 23 #include <linux/bsearch.h> 24 #include <linux/dcache.h> /* struct qstr */ 25 26 static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, 27 struct bkey_s_c_dirent d) 28 { 29 return inode->bi_dir == d.k->p.inode && 30 inode->bi_dir_offset == d.k->p.offset; 31 } 32 33 static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d, 34 struct bch_inode_unpacked *inode) 35 { 36 if (d.v->d_type == DT_SUBVOL 37 ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol 38 : le64_to_cpu(d.v->d_inum) == inode->bi_inum) 39 return 0; 40 return -BCH_ERR_ENOENT_dirent_doesnt_match_inode; 41 } 42 43 static void dirent_inode_mismatch_msg(struct printbuf *out, 44 struct bch_fs *c, 45 struct bkey_s_c_dirent dirent, 46 struct bch_inode_unpacked *inode) 47 { 48 prt_str(out, "inode points to dirent that does not point back:"); 49 prt_newline(out); 50 bch2_bkey_val_to_text(out, c, dirent.s_c); 51 prt_newline(out); 52 bch2_inode_unpacked_to_text(out, inode); 53 } 54 55 static int dirent_points_to_inode(struct bch_fs *c, 56 struct bkey_s_c_dirent dirent, 57 struct bch_inode_unpacked *inode) 58 { 59 int ret = dirent_points_to_inode_nowarn(dirent, inode); 60 if (ret) { 61 struct printbuf buf = PRINTBUF; 62 dirent_inode_mismatch_msg(&buf, c, dirent, inode); 63 bch_warn(c, "%s", buf.buf); 64 printbuf_exit(&buf); 65 } 66 return ret; 67 } 68 69 /* 70 * XXX: this is handling transaction restarts without returning 71 * -BCH_ERR_transaction_restart_nested, this is not how we do 
things anymore: 72 */ 73 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum, 74 u32 snapshot) 75 { 76 u64 sectors = 0; 77 78 int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents, 79 SPOS(inum, 0, snapshot), 80 POS(inum, U64_MAX), 81 0, k, ({ 82 if (bkey_extent_is_allocation(k.k)) 83 sectors += k.k->size; 84 0; 85 })); 86 87 return ret ?: sectors; 88 } 89 90 static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum, 91 u32 snapshot) 92 { 93 u64 subdirs = 0; 94 95 int ret = for_each_btree_key_max(trans, iter, BTREE_ID_dirents, 96 SPOS(inum, 0, snapshot), 97 POS(inum, U64_MAX), 98 0, k, ({ 99 if (k.k->type == KEY_TYPE_dirent && 100 bkey_s_c_to_dirent(k).v->d_type == DT_DIR) 101 subdirs++; 102 0; 103 })); 104 105 return ret ?: subdirs; 106 } 107 108 static int subvol_lookup(struct btree_trans *trans, u32 subvol, 109 u32 *snapshot, u64 *inum) 110 { 111 struct bch_subvolume s; 112 int ret = bch2_subvolume_get(trans, subvol, false, &s); 113 114 *snapshot = le32_to_cpu(s.snapshot); 115 *inum = le64_to_cpu(s.inode); 116 return ret; 117 } 118 119 static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, 120 struct bch_inode_unpacked *inode) 121 { 122 struct btree_iter iter; 123 struct bkey_s_c k; 124 int ret; 125 126 for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr), 127 BTREE_ITER_all_snapshots, k, ret) { 128 if (k.k->p.offset != inode_nr) 129 break; 130 if (!bkey_is_inode(k.k)) 131 continue; 132 ret = bch2_inode_unpack(k, inode); 133 goto found; 134 } 135 ret = -BCH_ERR_ENOENT_inode; 136 found: 137 bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); 138 bch2_trans_iter_exit(trans, &iter); 139 return ret; 140 } 141 142 static int lookup_inode(struct btree_trans *trans, u64 inode_nr, u32 snapshot, 143 struct bch_inode_unpacked *inode) 144 { 145 struct btree_iter iter; 146 struct bkey_s_c k; 147 int ret; 148 149 k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, 150 SPOS(0, inode_nr, 
snapshot), 0); 151 ret = bkey_err(k); 152 if (ret) 153 goto err; 154 155 ret = bkey_is_inode(k.k) 156 ? bch2_inode_unpack(k, inode) 157 : -BCH_ERR_ENOENT_inode; 158 err: 159 bch2_trans_iter_exit(trans, &iter); 160 return ret; 161 } 162 163 static int lookup_dirent_in_snapshot(struct btree_trans *trans, 164 struct bch_hash_info hash_info, 165 subvol_inum dir, struct qstr *name, 166 u64 *target, unsigned *type, u32 snapshot) 167 { 168 struct btree_iter iter; 169 struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc, 170 &hash_info, dir, name, 0, snapshot); 171 int ret = bkey_err(k); 172 if (ret) 173 return ret; 174 175 struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); 176 *target = le64_to_cpu(d.v->d_inum); 177 *type = d.v->d_type; 178 bch2_trans_iter_exit(trans, &iter); 179 return 0; 180 } 181 182 static int __remove_dirent(struct btree_trans *trans, struct bpos pos) 183 { 184 struct bch_fs *c = trans->c; 185 struct btree_iter iter; 186 struct bch_inode_unpacked dir_inode; 187 struct bch_hash_info dir_hash_info; 188 int ret; 189 190 ret = lookup_first_inode(trans, pos.inode, &dir_inode); 191 if (ret) 192 goto err; 193 194 dir_hash_info = bch2_hash_info_init(c, &dir_inode); 195 196 bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent); 197 198 ret = bch2_btree_iter_traverse(&iter) ?: 199 bch2_hash_delete_at(trans, bch2_dirent_hash_desc, 200 &dir_hash_info, &iter, 201 BTREE_UPDATE_internal_snapshot_node); 202 bch2_trans_iter_exit(trans, &iter); 203 err: 204 bch_err_fn(c, ret); 205 return ret; 206 } 207 208 /* 209 * Find any subvolume associated with a tree of snapshots 210 * We can't rely on master_subvol - it might have been deleted. 
211 */ 212 static int find_snapshot_tree_subvol(struct btree_trans *trans, 213 u32 tree_id, u32 *subvol) 214 { 215 struct btree_iter iter; 216 struct bkey_s_c k; 217 int ret; 218 219 for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) { 220 if (k.k->type != KEY_TYPE_snapshot) 221 continue; 222 223 struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); 224 if (le32_to_cpu(s.v->tree) != tree_id) 225 continue; 226 227 if (s.v->subvol) { 228 *subvol = le32_to_cpu(s.v->subvol); 229 goto found; 230 } 231 } 232 ret = -BCH_ERR_ENOENT_no_snapshot_tree_subvol; 233 found: 234 bch2_trans_iter_exit(trans, &iter); 235 return ret; 236 } 237 238 /* Get lost+found, create if it doesn't exist: */ 239 static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, 240 struct bch_inode_unpacked *lostfound, 241 u64 reattaching_inum) 242 { 243 struct bch_fs *c = trans->c; 244 struct qstr lostfound_str = QSTR("lost+found"); 245 struct btree_iter lostfound_iter = { NULL }; 246 u64 inum = 0; 247 unsigned d_type = 0; 248 int ret; 249 250 struct bch_snapshot_tree st; 251 ret = bch2_snapshot_tree_lookup(trans, 252 bch2_snapshot_tree(c, snapshot), &st); 253 if (ret) 254 return ret; 255 256 u32 subvolid; 257 ret = find_snapshot_tree_subvol(trans, 258 bch2_snapshot_tree(c, snapshot), &subvolid); 259 bch_err_msg(c, ret, "finding subvol associated with snapshot tree %u", 260 bch2_snapshot_tree(c, snapshot)); 261 if (ret) 262 return ret; 263 264 struct bch_subvolume subvol; 265 ret = bch2_subvolume_get(trans, subvolid, false, &subvol); 266 bch_err_msg(c, ret, "looking up subvol %u for snapshot %u", subvolid, snapshot); 267 if (ret) 268 return ret; 269 270 if (!subvol.inode) { 271 struct btree_iter iter; 272 struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter, 273 BTREE_ID_subvolumes, POS(0, subvolid), 274 0, subvolume); 275 ret = PTR_ERR_OR_ZERO(subvol); 276 if (ret) 277 return ret; 278 279 subvol->v.inode = cpu_to_le64(reattaching_inum); 280 
bch2_trans_iter_exit(trans, &iter); 281 } 282 283 subvol_inum root_inum = { 284 .subvol = subvolid, 285 .inum = le64_to_cpu(subvol.inode) 286 }; 287 288 struct bch_inode_unpacked root_inode; 289 struct bch_hash_info root_hash_info; 290 ret = lookup_inode(trans, root_inum.inum, snapshot, &root_inode); 291 bch_err_msg(c, ret, "looking up root inode %llu for subvol %u", 292 root_inum.inum, subvolid); 293 if (ret) 294 return ret; 295 296 root_hash_info = bch2_hash_info_init(c, &root_inode); 297 298 ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum, 299 &lostfound_str, &inum, &d_type, snapshot); 300 if (bch2_err_matches(ret, ENOENT)) 301 goto create_lostfound; 302 303 bch_err_fn(c, ret); 304 if (ret) 305 return ret; 306 307 if (d_type != DT_DIR) { 308 bch_err(c, "error looking up lost+found: not a directory"); 309 return -BCH_ERR_ENOENT_not_directory; 310 } 311 312 /* 313 * The bch2_check_dirents pass has already run, dangling dirents 314 * shouldn't exist here: 315 */ 316 ret = lookup_inode(trans, inum, snapshot, lostfound); 317 bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)", 318 inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot)); 319 return ret; 320 321 create_lostfound: 322 /* 323 * we always create lost+found in the root snapshot; we don't want 324 * different branches of the snapshot tree to have different lost+found 325 */ 326 snapshot = le32_to_cpu(st.root_snapshot); 327 /* 328 * XXX: we could have a nicer log message here if we had a nice way to 329 * walk backpointers to print a path 330 */ 331 struct printbuf path = PRINTBUF; 332 ret = bch2_inum_to_path(trans, root_inum, &path); 333 if (ret) 334 goto err; 335 336 bch_notice(c, "creating %s/lost+found in subvol %llu snapshot %u", 337 path.buf, root_inum.subvol, snapshot); 338 printbuf_exit(&path); 339 340 u64 now = bch2_current_time(c); 341 u64 cpu = raw_smp_processor_id(); 342 343 bch2_inode_init_early(c, lostfound); 344 
	bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
	lostfound->bi_dir = root_inode.bi_inum;
	lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot);

	/* the new lost+found adds a subdirectory link to the root inode */
	root_inode.bi_nlink++;

	ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
	if (ret)
		goto err;

	bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot);
	ret = bch2_btree_iter_traverse(&lostfound_iter);
	if (ret)
		goto err;

	/* create the root -> lost+found dirent, then write the new inode */
	ret = bch2_dirent_create_snapshot(trans,
			0, root_inode.bi_inum, snapshot, &root_hash_info,
			mode_to_type(lostfound->bi_mode),
			&lostfound_str,
			lostfound->bi_inum,
			&lostfound->bi_dir_offset,
			STR_HASH_must_create) ?:
		bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
				       BTREE_UPDATE_internal_snapshot_node);
err:
	bch_err_msg(c, ret, "creating lost+found");
	bch2_trans_iter_exit(trans, &lostfound_iter);
	return ret;
}

/*
 * Should this inode be hooked back up to the directory tree?  The root
 * inode of the root subvolume is exempt; otherwise any inode with no
 * backpointer that isn't marked unlinked is a candidate.
 */
static inline bool inode_should_reattach(struct bch_inode_unpacked *inode)
{
	if (inode->bi_inum == BCACHEFS_ROOT_INO &&
	    inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)
		return false;

	return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked);
}

/*
 * Whiteout the dirent at @d_pos in snapshot @snapshot, but only if a key
 * actually exists at exactly @d_pos (i.e. the dirent is visible there).
 */
static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents,
			SPOS(d_pos.inode, d_pos.offset, snapshot),
			BTREE_ITER_intent|
			BTREE_ITER_with_updates);
	int ret = bkey_err(k);
	if (ret)
		return ret;

	if (bpos_eq(k.k->p, d_pos)) {
		/*
		 * bch2_btree_delete_at() doesn't work because the update path
		 * doesn't internally use BTREE_ITER_with_updates yet
		 */
		struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
		ret = PTR_ERR_OR_ZERO(k);
		if (ret)
			goto err;

		bkey_init(&k->k);
		k->k.type = KEY_TYPE_whiteout;
		k->k.p = iter.pos;
		ret = bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node);
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Reattach an orphaned inode into lost+found: create a dirent for it,
 * set its backpointer, and fix up copies of the inode in child snapshots.
 */
static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
{
	struct bch_fs *c = trans->c;
	struct bch_inode_unpacked lostfound;
	char name_buf[20];
	int ret;

	u32 dirent_snapshot = inode->bi_snapshot;
	if (inode->bi_subvol) {
		/* subvolume roots are reattached as "subvol-N" under the root subvol */
		inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;

		u64 root_inum;
		ret = subvol_lookup(trans, inode->bi_parent_subvol,
				    &dirent_snapshot, &root_inum);
		if (ret)
			return ret;

		snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
	} else {
		snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
	}

	ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
	if (ret)
		return ret;

	/* a reattached directory adds a ".." link to lost+found */
	lostfound.bi_nlink += S_ISDIR(inode->bi_mode);

	/* ensure lost+found inode is also present in inode snapshot */
	if (!inode->bi_subvol) {
		BUG_ON(!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, lostfound.bi_snapshot));
		lostfound.bi_snapshot = inode->bi_snapshot;
	}

	ret = __bch2_fsck_write_inode(trans, &lostfound);
	if (ret)
		return ret;

	struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound);
	struct qstr name = (struct qstr) QSTR(name_buf);

	inode->bi_dir = lostfound.bi_inum;

	ret = bch2_dirent_create_snapshot(trans,
			inode->bi_parent_subvol, lostfound.bi_inum,
			dirent_snapshot,
			&dir_hash,
			inode_d_type(inode),
			&name,
			inode->bi_subvol ?: inode->bi_inum,
			&inode->bi_dir_offset,
			STR_HASH_must_create);
	if (ret) {
		bch_err_msg(c, ret, "error creating dirent");
		return ret;
	}

	ret = __bch2_fsck_write_inode(trans, inode);
	if (ret)
		return ret;

	/*
	 * Fix up inodes in child snapshots: if they should also be reattached
	 * update the backpointer field, if they should not be we need to emit
	 * whiteouts for the dirent we just created.
	 */
	if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) {
		snapshot_id_list whiteouts_done;
		struct btree_iter iter;
		struct bkey_s_c k;

		darray_init(&whiteouts_done);

		/* walk descendant snapshots of this inode, newest-id first */
		for_each_btree_key_reverse_norestart(trans, iter,
				BTREE_ID_inodes, SPOS(0, inode->bi_inum, inode->bi_snapshot - 1),
				BTREE_ITER_all_snapshots|BTREE_ITER_intent, k, ret) {
			if (k.k->p.offset != inode->bi_inum)
				break;

			if (!bkey_is_inode(k.k) ||
			    !bch2_snapshot_is_ancestor(c, k.k->p.snapshot, inode->bi_snapshot) ||
			    snapshot_list_has_ancestor(c, &whiteouts_done, k.k->p.snapshot))
				continue;

			struct bch_inode_unpacked child_inode;
			ret = bch2_inode_unpack(k, &child_inode);
			if (ret)
				break;

			if (!inode_should_reattach(&child_inode)) {
				/* hide the new dirent from this snapshot */
				ret = maybe_delete_dirent(trans,
							  SPOS(lostfound.bi_inum, inode->bi_dir_offset,
							       dirent_snapshot),
							  k.k->p.snapshot);
				if (ret)
					break;

				ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot);
				if (ret)
					break;
			} else {
				/* propagate the new backpointer to the child copy */
				iter.snapshot = k.k->p.snapshot;
				child_inode.bi_dir = inode->bi_dir;
				child_inode.bi_dir_offset = inode->bi_dir_offset;

				ret = bch2_inode_write_flags(trans, &iter, &child_inode,
							     BTREE_UPDATE_internal_snapshot_node);
				if (ret)
					break;
			}
		}
		darray_exit(&whiteouts_done);
		bch2_trans_iter_exit(trans, &iter);
	}

	return ret;
}

/* Fetch the dirent key at @pos (typed accessor around bch2_bkey_get_iter). */
static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
						struct btree_iter *iter,
						struct bpos pos)
{
	return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
}

/*
 * Remove the dirent that @inode's backpointer points at, after verifying
 * the dirent really points back at the inode.  No-op if there is no
 * backpointer.
 */
static int remove_backpointer(struct btree_trans *trans,
			      struct bch_inode_unpacked *inode)
{
	if (!inode->bi_dir)
		return 0;

	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c_dirent d =
		dirent_get_by_pos(trans, &iter,
				  SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot));
	int ret = bkey_err(d) ?:
		dirent_points_to_inode(c, d, inode) ?:
		__remove_dirent(trans, d.k->p);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Reattach a subvolume: remove the old dirent pointing at its root inode
 * (if any), then reattach the root inode into lost+found.
 */
static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s)
{
	struct bch_fs *c = trans->c;

	struct bch_inode_unpacked inode;
	int ret = bch2_inode_find_by_inum_trans(trans,
				(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
				&inode);
	if (ret)
		return ret;

	ret = remove_backpointer(trans, &inode);
	if (!bch2_err_matches(ret, ENOENT))
		bch_err_msg(c, ret, "removing dirent");
	if (ret)
		return ret;

	ret = reattach_inode(trans, &inode);
	bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
	return ret;
}

/*
 * Recreate a missing subvolume key for @subvolid in snapshot @snapshotid,
 * rooted at @inum (a new root directory inode is created when @inum is 0).
 * Also marks the snapshot as a subvolume snapshot and fills in the snapshot
 * tree's master_subvol if unset.
 */
static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
{
	struct bch_fs *c = trans->c;

	if (!bch2_snapshot_is_leaf(c, snapshotid)) {
		bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
		return -BCH_ERR_fsck_repair_unimplemented;
	}

	/*
	 * If inum isn't set, that means we're being called from check_dirents,
	 * not check_inodes - the root of this subvolume doesn't exist or we
	 * would have found it there:
	 */
	if (!inum) {
		struct btree_iter inode_iter = {};
		struct bch_inode_unpacked new_inode;
		u64 cpu = raw_smp_processor_id();

		bch2_inode_init_early(c, &new_inode);
		bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);

		new_inode.bi_subvol = subvolid;

		int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
			  bch2_btree_iter_traverse(&inode_iter) ?:
			  bch2_inode_write(trans, &inode_iter, &new_inode);
		bch2_trans_iter_exit(trans, &inode_iter);
		if (ret)
			return ret;

		inum = new_inode.bi_inum;
	}

	bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);

	struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
	int ret = PTR_ERR_OR_ZERO(new_subvol);
	if (ret)
		return ret;

	bkey_subvolume_init(&new_subvol->k_i);
	new_subvol->k.p.offset = subvolid;
	new_subvol->v.snapshot = cpu_to_le32(snapshotid);
	new_subvol->v.inode = cpu_to_le64(inum);
	ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
	if (ret)
		return ret;

	/* point the snapshot back at the new subvolume */
	struct btree_iter iter;
	struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_snapshots, POS(0, snapshotid),
			0, snapshot);
	ret = PTR_ERR_OR_ZERO(s);
	bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
	if (ret)
		return ret;

	u32 snapshot_tree = le32_to_cpu(s->v.tree);

	s->v.subvol = cpu_to_le32(subvolid);
	SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
	bch2_trans_iter_exit(trans, &iter);

	struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
			0, snapshot_tree);
	ret = PTR_ERR_OR_ZERO(st);
	bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
	if (ret)
		return ret;

	if (!st->v.master_subvol)
		st->v.master_subvol = cpu_to_le32(subvolid);

	bch2_trans_iter_exit(trans, &iter);
	return 0;
}

/*
 * Recreate a missing inode for keys found in @btree: a regular file for
 * extents (sized to cover the last extent), a directory for dirents, and
 * a regular file for xattrs.
 */
static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum)
{
	struct bch_fs *c = trans->c;
	unsigned i_mode = S_IFREG;
	u64 i_size = 0;

	switch (btree) {
	case BTREE_ID_extents: {
		struct btree_iter iter = {};

		bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
		struct bkey_s_c k = bch2_btree_iter_peek_prev_min(&iter, POS(inum, 0));
		bch2_trans_iter_exit(trans, &iter);
		int ret = bkey_err(k);
		if (ret)
			return ret;

		/* extent positions are in 512-byte sectors */
		i_size = k.k->p.offset << 9;
		break;
	}
	case BTREE_ID_dirents:
		i_mode = S_IFDIR;
		break;
	case BTREE_ID_xattrs:
		break;
	default:
		BUG();
	}

	struct bch_inode_unpacked new_inode;
	bch2_inode_init_early(c, &new_inode);
	bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL);
	new_inode.bi_size = i_size;
	new_inode.bi_inum = inum;
	new_inode.bi_snapshot = snapshot;

	return __bch2_fsck_write_inode(trans, &new_inode);
}

/* Sorted list of snapshot IDs seen at the current btree position. */
struct snapshots_seen {
	struct bpos pos;
	snapshot_id_list ids;
};

static inline void snapshots_seen_exit(struct snapshots_seen *s)
{
	darray_exit(&s->ids);
}

static inline void snapshots_seen_init(struct snapshots_seen *s)
{
	memset(s, 0, sizeof(*s));
}

/* Insert @id into the sorted list, keeping order; no-op if already present. */
static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
{
	u32 *i;
	__darray_for_each(s->ids, i) {
		if (*i == id)
			return 0;
		if (*i > id)
			break;
	}

	int ret = darray_insert_item(&s->ids, i - s->ids.data, id);
	if (ret)
		bch_err(c, "error reallocating snapshots_seen table (size %zu)",
			s->ids.size);
	return ret;
}

/* Record that we saw a key at @pos; resets the list when the position changes. */
static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
				 enum btree_id btree_id, struct bpos pos)
{
	if (!bkey_eq(s->pos, pos))
		s->ids.nr = 0;
	s->pos = pos;

	return snapshot_list_add_nodup(c, &s->ids, pos.snapshot);
}

/**
 * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
 * and @ancestor hasn't been overwritten in @seen
 *
 * @c:		filesystem handle
 * @seen:	list of snapshot ids already seen at current position
 * @id:		descendent snapshot id
 * @ancestor:	ancestor snapshot id
 *
 * Returns:	whether key in @ancestor snapshot is visible in @id snapshot
 */
static bool
key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
			u32 id, u32 ancestor)
{
	ssize_t i;

	EBUG_ON(id > ancestor);

	/* @ancestor should be the snapshot most recently added to @seen */
	EBUG_ON(ancestor != seen->pos.snapshot);
	EBUG_ON(ancestor != darray_last(seen->ids));

	if (id == ancestor)
		return true;

	if (!bch2_snapshot_is_ancestor(c, id, ancestor))
		return false;

	/*
	 * We know that @id is a descendant of @ancestor, we're checking if
	 * we've seen a key that overwrote @ancestor - i.e. also a descendent of
	 * @ascestor and with @id as a descendent.
	 *
	 * But we already know that we're scanning IDs between @id and @ancestor
	 * numerically, since snapshot ID lists are kept sorted, so if we find
	 * an id that's an ancestor of @id we're done:
	 */

	for (i = seen->ids.nr - 2;
	     i >= 0 && seen->ids.data[i] >= id;
	     --i)
		if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]))
			return false;

	return true;
}

/**
 * ref_visible - given a key with snapshot id @src that points to a key with
 * snapshot id @dst, test whether there is some snapshot in which @dst is
 * visible.
 *
 * @c:		filesystem handle
 * @s:		list of snapshot IDs already seen at @src
 * @src:	snapshot ID of src key
 * @dst:	snapshot ID of dst key
 * Returns:	true if there is some snapshot in which @dst is visible
 *
 * Assumes we're visiting @src keys in natural key order
 */
static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
			u32 src, u32 dst)
{
	return dst <= src
		? key_visible_in_snapshot(c, s, dst, src)
		: bch2_snapshot_is_ancestor(c, src, dst);
}

/* As ref_visible(), but with seen-lists available for both snapshots. */
static int ref_visible2(struct bch_fs *c,
			u32 src, struct snapshots_seen *src_seen,
			u32 dst, struct snapshots_seen *dst_seen)
{
	if (dst > src) {
		swap(dst, src);
		swap(dst_seen, src_seen);
	}
	return key_visible_in_snapshot(c, src_seen, dst, src);
}

/* Iterate inode-walker entries visible in @_snapshot (entries sorted by snapshot). */
#define for_each_visible_inode(_c, _s, _w, _snapshot, _i)				\
	for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr &&	\
	     (_i)->snapshot <= (_snapshot); _i++)					\
		if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))

struct inode_walker_entry {
	struct bch_inode_unpacked inode;
	u32			snapshot;	/* snapshot this inode version lives in */
	u64			count;		/* accumulator (e.g. sectors/subdirs) */
};

/* Caches all snapshot versions of one inode number while scanning its keys. */
struct inode_walker {
	bool				first_this_inode;
	bool				have_inodes;
	bool				recalculate_sums;
	struct bpos			last_pos;

	DARRAY(struct inode_walker_entry) inodes;
	snapshot_id_list		deletes;
};

static void inode_walker_exit(struct inode_walker *w)
{
	darray_exit(&w->inodes);
	darray_exit(&w->deletes);
}

static struct inode_walker inode_walker_init(void)
{
	return (struct inode_walker) { 0, };
}

/* Unpack @inode and append it to the walker's entry list. */
static int add_inode(struct bch_fs *c, struct inode_walker *w,
		     struct bkey_s_c inode)
{
	struct bch_inode_unpacked u;

	return bch2_inode_unpack(inode, &u) ?:
		darray_push(&w->inodes, ((struct inode_walker_entry) {
			.inode		= u,
			.snapshot	= inode.k->p.snapshot,
		}));
}

/* Refill the walker with every snapshot version of inode @inum. */
static int get_inodes_all_snapshots(struct btree_trans *trans,
				    struct inode_walker *w, u64 inum)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	/*
	 * We no longer have inodes for w->last_pos; clear this to avoid
	 * screwing up check_i_sectors/check_subdir_count if we take a
	 * transaction restart here:
	 */
	w->have_inodes = false;
	w->recalculate_sums = false;
	w->inodes.nr = 0;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
				     BTREE_ITER_all_snapshots, k, ret) {
		if (k.k->p.offset != inum)
			break;

		/* NOTE(review): add_inode()'s return value (alloc/unpack error) is discarded — confirm intentional */
		if (bkey_is_inode(k.k))
			add_inode(c, w, k);
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		return ret;

	w->first_this_inode = true;
	w->have_inodes = true;
	return 0;
}

/*
 * Find the walker entry whose snapshot is the closest ancestor of @k's
 * snapshot.  If @k is in a strictly newer snapshot than the entry (and isn't
 * a whiteout), synthesize a new entry for @k's snapshot, keeping the list
 * sorted by snapshot id.
 */
static struct inode_walker_entry *
lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k)
{
	bool is_whiteout = k.k->type == KEY_TYPE_whiteout;

	struct inode_walker_entry *i;
	__darray_for_each(w->inodes, i)
		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot))
			goto found;

	return NULL;
found:
	BUG_ON(k.k->p.snapshot > i->snapshot);

	if (k.k->p.snapshot != i->snapshot && !is_whiteout) {
		struct inode_walker_entry new = *i;

		new.snapshot = k.k->p.snapshot;
		new.count = 0;

		struct printbuf buf = PRINTBUF;
		bch2_bkey_val_to_text(&buf, c, k);

		bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
			 "unexpected because we should always update the inode when we update a key in that inode\n"
			 "%s",
			 w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf);
		printbuf_exit(&buf);

		/* insertion point: keep entries sorted by snapshot id */
		while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot)
			--i;

		size_t pos = i - w->inodes.data;
		int ret = darray_insert_item(&w->inodes, pos, new);
		if (ret)
			return ERR_PTR(ret);

		i = w->inodes.data + pos;
	}

	return i;
}

/*
 * Main walker entry point: refills the cache when we move to a new inode
 * number, then resolves the entry for @k's snapshot.
 */
static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
					     struct inode_walker *w,
					     struct bkey_s_c k)
{
	if (w->last_pos.inode != k.k->p.inode) {
		int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
		if (ret)
			return ERR_PTR(ret);
	}

	w->last_pos = k.k->p;

	return lookup_inode_for_snapshot(trans->c, w, k);
}

/*
 * Collect all versions of inode @inum that are visible from snapshot
 * s->pos.snapshot; snapshots where the inode was deleted (non-inode keys)
 * are recorded in w->deletes.
 */
static int get_visible_inodes(struct btree_trans *trans,
			      struct inode_walker *w,
			      struct snapshots_seen *s,
			      u64 inum)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	w->inodes.nr = 0;
	w->deletes.nr = 0;

	for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot),
					     BTREE_ITER_all_snapshots, k, ret) {
		if (k.k->p.offset != inum)
			break;

		if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot))
			continue;

		if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot))
			continue;

		ret = bkey_is_inode(k.k)
			? add_inode(c, w, k)
			: snapshot_list_add(c, &w->deletes, k.k->p.snapshot);
		if (ret)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}

/*
 * Prefer to delete the first one, since that will be the one at the wrong
 * offset:
 * return value: 0 -> delete k1, 1 -> delete k2
 */
int bch2_fsck_update_backpointers(struct btree_trans *trans,
				 struct snapshots_seen *s,
				 const struct bch_hash_desc desc,
				 struct bch_hash_info *hash_info,
				 struct bkey_i *new)
{
	if (new->k.type != KEY_TYPE_dirent)
		return 0;

	struct bkey_i_dirent *d = bkey_i_to_dirent(new);
	struct inode_walker target = inode_walker_init();
	int ret = 0;

	if (d->v.d_type == DT_SUBVOL) {
		BUG();
	} else {
		ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum));
		if (ret)
			goto err;

		/* point every visible target inode's backpointer at the new dirent */
		darray_for_each(target.inodes, i) {
			i->inode.bi_dir_offset = d->k.p.offset;
			ret = __bch2_fsck_write_inode(trans, &i->inode);
			if (ret)
				goto err;
		}
	}
err:
	inode_walker_exit(&target);
	return ret;
}

/*
 * Fetch the dirent that @inode's backpointer points at; for subvolume roots
 * the snapshot is first resolved through the parent subvolume.
 */
static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
					       struct btree_iter *iter,
					       struct bch_inode_unpacked *inode,
					       u32 *snapshot)
{
	if (inode->bi_subvol) {
		u64 inum;
		int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
		if (ret)
			return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) });
	}

	return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
}

/* Returns 1 if @p is on the deleted-inodes list, 0 if not, negative on error. */
static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
	int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Verify that the dirent @inode's backpointer points at actually points back
 * at the inode; clear the backpointer (setting *write_inode) if missing or
 * mismatched.
 */
static int check_inode_dirent_inode(struct btree_trans *trans,
				    struct bch_inode_unpacked *inode,
				    bool *write_inode)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;

	u32 inode_snapshot = inode->bi_snapshot;
	struct btree_iter dirent_iter = {};
	struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
	int ret = bkey_err(d);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (fsck_err_on(ret,
			trans, inode_points_to_missing_dirent,
			"inode points to missing dirent\n%s",
			(bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) ||
	    fsck_err_on(!ret && dirent_points_to_inode_nowarn(d, inode),
			trans, inode_points_to_wrong_dirent,
			"%s",
			(printbuf_reset(&buf),
			 dirent_inode_mismatch_msg(&buf, c, d, inode),
			 buf.buf))) {
		/*
		 * We just clear the backpointer fields for now. If we find a
		 * dirent that points to this inode in check_dirents(), we'll
		 * update it then; then when we get to check_path() if the
		 * backpointer is still 0 we'll reattach it.
1079 */ 1080 inode->bi_dir = 0; 1081 inode->bi_dir_offset = 0; 1082 *write_inode = true; 1083 } 1084 1085 ret = 0; 1086 fsck_err: 1087 bch2_trans_iter_exit(trans, &dirent_iter); 1088 printbuf_exit(&buf); 1089 bch_err_fn(c, ret); 1090 return ret; 1091 } 1092 1093 static int get_snapshot_root_inode(struct btree_trans *trans, 1094 struct bch_inode_unpacked *root, 1095 u64 inum) 1096 { 1097 struct btree_iter iter; 1098 struct bkey_s_c k; 1099 int ret = 0; 1100 1101 for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, 1102 SPOS(0, inum, U32_MAX), 1103 BTREE_ITER_all_snapshots, k, ret) { 1104 if (k.k->p.offset != inum) 1105 break; 1106 if (bkey_is_inode(k.k)) 1107 goto found_root; 1108 } 1109 if (ret) 1110 goto err; 1111 BUG(); 1112 found_root: 1113 ret = bch2_inode_unpack(k, root); 1114 err: 1115 bch2_trans_iter_exit(trans, &iter); 1116 return ret; 1117 } 1118 1119 static int check_directory_size(struct btree_trans *trans, 1120 struct bch_inode_unpacked *inode_u, 1121 struct bkey_s_c inode_k, bool *write_inode) 1122 { 1123 struct btree_iter iter; 1124 struct bkey_s_c k; 1125 u64 new_size = 0; 1126 int ret; 1127 1128 for_each_btree_key_max_norestart(trans, iter, BTREE_ID_dirents, 1129 SPOS(inode_k.k->p.offset, 0, inode_k.k->p.snapshot), 1130 POS(inode_k.k->p.offset, U64_MAX), 1131 0, k, ret) { 1132 if (k.k->type != KEY_TYPE_dirent) 1133 continue; 1134 1135 struct bkey_s_c_dirent dirent = bkey_s_c_to_dirent(k); 1136 struct qstr name = bch2_dirent_get_name(dirent); 1137 1138 new_size += dirent_occupied_size(&name); 1139 } 1140 bch2_trans_iter_exit(trans, &iter); 1141 1142 if (!ret && inode_u->bi_size != new_size) { 1143 inode_u->bi_size = new_size; 1144 *write_inode = true; 1145 } 1146 1147 return ret; 1148 } 1149 1150 static int check_inode(struct btree_trans *trans, 1151 struct btree_iter *iter, 1152 struct bkey_s_c k, 1153 struct bch_inode_unpacked *snapshot_root, 1154 struct snapshots_seen *s) 1155 { 1156 struct bch_fs *c = trans->c; 1157 struct printbuf 
buf = PRINTBUF; 1158 struct bch_inode_unpacked u; 1159 bool do_update = false; 1160 int ret; 1161 1162 ret = bch2_check_key_has_snapshot(trans, iter, k); 1163 if (ret < 0) 1164 goto err; 1165 if (ret) 1166 return 0; 1167 1168 ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); 1169 if (ret) 1170 goto err; 1171 1172 if (!bkey_is_inode(k.k)) 1173 return 0; 1174 1175 ret = bch2_inode_unpack(k, &u); 1176 if (ret) 1177 goto err; 1178 1179 if (snapshot_root->bi_inum != u.bi_inum) { 1180 ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum); 1181 if (ret) 1182 goto err; 1183 } 1184 1185 if (fsck_err_on(u.bi_hash_seed != snapshot_root->bi_hash_seed || 1186 INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root), 1187 trans, inode_snapshot_mismatch, 1188 "inode hash info in different snapshots don't match")) { 1189 u.bi_hash_seed = snapshot_root->bi_hash_seed; 1190 SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root)); 1191 do_update = true; 1192 } 1193 1194 if (u.bi_dir || u.bi_dir_offset) { 1195 ret = check_inode_dirent_inode(trans, &u, &do_update); 1196 if (ret) 1197 goto err; 1198 } 1199 1200 if (fsck_err_on(u.bi_dir && (u.bi_flags & BCH_INODE_unlinked), 1201 trans, inode_unlinked_but_has_dirent, 1202 "inode unlinked but has dirent\n%s", 1203 (printbuf_reset(&buf), 1204 bch2_inode_unpacked_to_text(&buf, &u), 1205 buf.buf))) { 1206 u.bi_flags &= ~BCH_INODE_unlinked; 1207 do_update = true; 1208 } 1209 1210 if (S_ISDIR(u.bi_mode) && (u.bi_flags & BCH_INODE_unlinked)) { 1211 /* Check for this early so that check_unreachable_inode() will reattach it */ 1212 1213 ret = bch2_empty_dir_snapshot(trans, k.k->p.offset, 0, k.k->p.snapshot); 1214 if (ret && ret != -BCH_ERR_ENOTEMPTY_dir_not_empty) 1215 goto err; 1216 1217 fsck_err_on(ret, trans, inode_dir_unlinked_but_not_empty, 1218 "dir unlinked but not empty\n%s", 1219 (printbuf_reset(&buf), 1220 bch2_inode_unpacked_to_text(&buf, &u), 1221 buf.buf)); 1222 u.bi_flags &= ~BCH_INODE_unlinked; 1223 do_update = true; 1224 
ret = 0; 1225 } 1226 1227 ret = bch2_inode_has_child_snapshots(trans, k.k->p); 1228 if (ret < 0) 1229 goto err; 1230 1231 if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot), 1232 trans, inode_has_child_snapshots_wrong, 1233 "inode has_child_snapshots flag wrong (should be %u)\n%s", 1234 ret, 1235 (printbuf_reset(&buf), 1236 bch2_inode_unpacked_to_text(&buf, &u), 1237 buf.buf))) { 1238 if (ret) 1239 u.bi_flags |= BCH_INODE_has_child_snapshot; 1240 else 1241 u.bi_flags &= ~BCH_INODE_has_child_snapshot; 1242 do_update = true; 1243 } 1244 ret = 0; 1245 1246 if ((u.bi_flags & BCH_INODE_unlinked) && 1247 !(u.bi_flags & BCH_INODE_has_child_snapshot)) { 1248 if (!test_bit(BCH_FS_started, &c->flags)) { 1249 /* 1250 * If we're not in online fsck, don't delete unlinked 1251 * inodes, just make sure they're on the deleted list. 1252 * 1253 * They might be referred to by a logged operation - 1254 * i.e. we might have crashed in the middle of a 1255 * truncate on an unlinked but open file - so we want to 1256 * let the delete_dead_inodes kill it after resuming 1257 * logged ops. 
1258 */ 1259 ret = check_inode_deleted_list(trans, k.k->p); 1260 if (ret < 0) 1261 goto err_noprint; 1262 1263 fsck_err_on(!ret, 1264 trans, unlinked_inode_not_on_deleted_list, 1265 "inode %llu:%u unlinked, but not on deleted list", 1266 u.bi_inum, k.k->p.snapshot); 1267 1268 ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, k.k->p, 1); 1269 if (ret) 1270 goto err; 1271 } else { 1272 ret = bch2_inode_or_descendents_is_open(trans, k.k->p); 1273 if (ret < 0) 1274 goto err; 1275 1276 if (fsck_err_on(!ret, 1277 trans, inode_unlinked_and_not_open, 1278 "inode %llu:%u unlinked and not open", 1279 u.bi_inum, u.bi_snapshot)) { 1280 ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); 1281 bch_err_msg(c, ret, "in fsck deleting inode"); 1282 goto err_noprint; 1283 } 1284 ret = 0; 1285 } 1286 } 1287 1288 if (fsck_err_on(u.bi_parent_subvol && 1289 (u.bi_subvol == 0 || 1290 u.bi_subvol == BCACHEFS_ROOT_SUBVOL), 1291 trans, inode_bi_parent_nonzero, 1292 "inode %llu:%u has subvol %u but nonzero parent subvol %u", 1293 u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) { 1294 u.bi_parent_subvol = 0; 1295 do_update = true; 1296 } 1297 1298 if (u.bi_subvol) { 1299 struct bch_subvolume s; 1300 1301 ret = bch2_subvolume_get(trans, u.bi_subvol, false, &s); 1302 if (ret && !bch2_err_matches(ret, ENOENT)) 1303 goto err; 1304 1305 if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) { 1306 ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum); 1307 goto do_update; 1308 } 1309 1310 if (fsck_err_on(ret, 1311 trans, inode_bi_subvol_missing, 1312 "inode %llu:%u bi_subvol points to missing subvolume %u", 1313 u.bi_inum, k.k->p.snapshot, u.bi_subvol) || 1314 fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum || 1315 !bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot), 1316 k.k->p.snapshot), 1317 trans, inode_bi_subvol_wrong, 1318 "inode %llu:%u points to subvol %u, but subvol points to %llu:%u", 1319 u.bi_inum, 
k.k->p.snapshot, u.bi_subvol, 1320 le64_to_cpu(s.inode), 1321 le32_to_cpu(s.snapshot))) { 1322 u.bi_subvol = 0; 1323 u.bi_parent_subvol = 0; 1324 do_update = true; 1325 } 1326 } 1327 1328 if (fsck_err_on(u.bi_journal_seq > journal_cur_seq(&c->journal), 1329 trans, inode_journal_seq_in_future, 1330 "inode journal seq in future (currently at %llu)\n%s", 1331 journal_cur_seq(&c->journal), 1332 (printbuf_reset(&buf), 1333 bch2_inode_unpacked_to_text(&buf, &u), 1334 buf.buf))) { 1335 u.bi_journal_seq = journal_cur_seq(&c->journal); 1336 do_update = true; 1337 } 1338 1339 if (S_ISDIR(u.bi_mode)) { 1340 ret = check_directory_size(trans, &u, k, &do_update); 1341 1342 fsck_err_on(ret, 1343 trans, directory_size_mismatch, 1344 "directory inode %llu:%u with the mismatch directory size", 1345 u.bi_inum, k.k->p.snapshot); 1346 ret = 0; 1347 } 1348 do_update: 1349 if (do_update) { 1350 ret = __bch2_fsck_write_inode(trans, &u); 1351 bch_err_msg(c, ret, "in fsck updating inode"); 1352 if (ret) 1353 goto err_noprint; 1354 } 1355 err: 1356 fsck_err: 1357 bch_err_fn(c, ret); 1358 err_noprint: 1359 printbuf_exit(&buf); 1360 return ret; 1361 } 1362 1363 int bch2_check_inodes(struct bch_fs *c) 1364 { 1365 struct bch_inode_unpacked snapshot_root = {}; 1366 struct snapshots_seen s; 1367 1368 snapshots_seen_init(&s); 1369 1370 int ret = bch2_trans_run(c, 1371 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, 1372 POS_MIN, 1373 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, 1374 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 1375 check_inode(trans, &iter, k, &snapshot_root, &s))); 1376 1377 snapshots_seen_exit(&s); 1378 bch_err_fn(c, ret); 1379 return ret; 1380 } 1381 1382 static int find_oldest_inode_needs_reattach(struct btree_trans *trans, 1383 struct bch_inode_unpacked *inode) 1384 { 1385 struct bch_fs *c = trans->c; 1386 struct btree_iter iter; 1387 struct bkey_s_c k; 1388 int ret = 0; 1389 1390 /* 1391 * We look for inodes to reattach in natural key order, leaves first, 1392 * but we 
should do the reattach at the oldest version that needs to be 1393 * reattached: 1394 */ 1395 for_each_btree_key_norestart(trans, iter, 1396 BTREE_ID_inodes, 1397 SPOS(0, inode->bi_inum, inode->bi_snapshot + 1), 1398 BTREE_ITER_all_snapshots, k, ret) { 1399 if (k.k->p.offset != inode->bi_inum) 1400 break; 1401 1402 if (!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, k.k->p.snapshot)) 1403 continue; 1404 1405 if (!bkey_is_inode(k.k)) 1406 break; 1407 1408 struct bch_inode_unpacked parent_inode; 1409 ret = bch2_inode_unpack(k, &parent_inode); 1410 if (ret) 1411 break; 1412 1413 if (!inode_should_reattach(&parent_inode)) 1414 break; 1415 1416 *inode = parent_inode; 1417 } 1418 bch2_trans_iter_exit(trans, &iter); 1419 1420 return ret; 1421 } 1422 1423 static int check_unreachable_inode(struct btree_trans *trans, 1424 struct btree_iter *iter, 1425 struct bkey_s_c k) 1426 { 1427 struct printbuf buf = PRINTBUF; 1428 int ret = 0; 1429 1430 if (!bkey_is_inode(k.k)) 1431 return 0; 1432 1433 struct bch_inode_unpacked inode; 1434 ret = bch2_inode_unpack(k, &inode); 1435 if (ret) 1436 return ret; 1437 1438 if (!inode_should_reattach(&inode)) 1439 return 0; 1440 1441 ret = find_oldest_inode_needs_reattach(trans, &inode); 1442 if (ret) 1443 return ret; 1444 1445 if (fsck_err(trans, inode_unreachable, 1446 "unreachable inode:\n%s", 1447 (bch2_inode_unpacked_to_text(&buf, &inode), 1448 buf.buf))) 1449 ret = reattach_inode(trans, &inode); 1450 fsck_err: 1451 printbuf_exit(&buf); 1452 return ret; 1453 } 1454 1455 /* 1456 * Reattach unreachable (but not unlinked) inodes 1457 * 1458 * Run after check_inodes() and check_dirents(), so we node that inode 1459 * backpointer fields point to valid dirents, and every inode that has a dirent 1460 * that points to it has its backpointer field set - so we're just looking for 1461 * non-unlinked inodes without backpointers: 1462 * 1463 * XXX: this is racy w.r.t. 
hardlink removal in online fsck
 */
int bch2_check_unreachable_inodes(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
				POS_MIN,
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_unreachable_inode(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Which btree may hold keys for an inode with this mode? Extents belong to
 * regular files and symlinks, dirents to directories; xattrs are valid for
 * every file type.
 */
static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode)
{
	switch (btree) {
	case BTREE_ID_extents:
		return S_ISREG(mode) || S_ISLNK(mode);
	case BTREE_ID_dirents:
		return S_ISDIR(mode);
	case BTREE_ID_xattrs:
		return true;
	default:
		BUG();
	}
}

/*
 * Verify that key @k (extent/dirent/xattr) belongs to an existing inode of
 * the right type: reconstruct the inode if the inodes btree was lost,
 * otherwise delete keys whose inode is missing or has the wrong mode.
 *
 * @i is the inode-walker entry for @k's snapshot, or NULL/ERR_PTR if none.
 */
static int check_key_has_inode(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct inode_walker *inode,
			       struct inode_walker_entry *i,
			       struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = PTR_ERR_OR_ZERO(i);
	if (ret)
		return ret;

	if (k.k->type == KEY_TYPE_whiteout)
		goto out;

	if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
		ret =   reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?:
			bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
		if (ret)
			goto err;

		/* back up so the walker picks up the inode we just created */
		inode->last_pos.inode--;
		ret = -BCH_ERR_transaction_restart_nested;
		goto err;
	}

	if (fsck_err_on(!i,
			trans, key_in_missing_inode,
			"key in missing inode:\n %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
		goto delete;

	if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode),
			trans, key_in_wrong_inode_type,
			"key for wrong inode mode %o:\n %s",
			i->inode.bi_mode,
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
		goto delete;
out:
err:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
delete:
	ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
	goto out;
}

/*
 * For each inode version the walker collected, recount i_sectors from the
 * extents btree and repair bi_sectors if it doesn't match.
 */
static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
{
	struct bch_fs *c = trans->c;
	int ret = 0;
	s64 count2;

	darray_for_each(w->inodes, i) {
		if (i->inode.bi_sectors == i->count)
			continue;

		count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);

		if (w->recalculate_sums)
			i->count = count2;

		if (i->count != count2) {
			bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
					    w->last_pos.inode, i->snapshot, i->count, count2);
			i->count = count2;
		}

		if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
				trans, inode_i_sectors_wrong,
				"inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
				w->last_pos.inode, i->snapshot,
				i->inode.bi_sectors, i->count)) {
			i->inode.bi_sectors = i->count;
			ret = bch2_fsck_write_inode(trans, &i->inode);
			if (ret)
				break;
		}
	}
fsck_err:
	bch_err_fn(c, ret);
	return ret;
}

/* As above, but report a nested transaction restart to the caller. */
static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
{
	u32 restart_count = trans->restart_count;
	return check_i_sectors_notnested(trans, w) ?:
		trans_was_restarted(trans, restart_count);
}

/* End position of one extent, per snapshot - used for overlap detection */
struct extent_end {
	u32 snapshot;
	u64 offset;
	struct snapshots_seen seen;
};

struct extent_ends {
	/* last extent position processed; guards re-running after a restart */
	struct bpos last_pos;
	DARRAY(struct extent_end) e;
};

/* Free each entry's snapshots_seen list and empty the darray. */
static void extent_ends_reset(struct extent_ends *extent_ends)
{
	darray_for_each(extent_ends->e, i)
		snapshots_seen_exit(&i->seen);
	extent_ends->e.nr = 0;
}

static void extent_ends_exit(struct extent_ends
*extent_ends) 1605 { 1606 extent_ends_reset(extent_ends); 1607 darray_exit(&extent_ends->e); 1608 } 1609 1610 static void extent_ends_init(struct extent_ends *extent_ends) 1611 { 1612 memset(extent_ends, 0, sizeof(*extent_ends)); 1613 } 1614 1615 static int extent_ends_at(struct bch_fs *c, 1616 struct extent_ends *extent_ends, 1617 struct snapshots_seen *seen, 1618 struct bkey_s_c k) 1619 { 1620 struct extent_end *i, n = (struct extent_end) { 1621 .offset = k.k->p.offset, 1622 .snapshot = k.k->p.snapshot, 1623 .seen = *seen, 1624 }; 1625 1626 n.seen.ids.data = kmemdup(seen->ids.data, 1627 sizeof(seen->ids.data[0]) * seen->ids.size, 1628 GFP_KERNEL); 1629 if (!n.seen.ids.data) 1630 return -BCH_ERR_ENOMEM_fsck_extent_ends_at; 1631 1632 __darray_for_each(extent_ends->e, i) { 1633 if (i->snapshot == k.k->p.snapshot) { 1634 snapshots_seen_exit(&i->seen); 1635 *i = n; 1636 return 0; 1637 } 1638 1639 if (i->snapshot >= k.k->p.snapshot) 1640 break; 1641 } 1642 1643 return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n); 1644 } 1645 1646 static int overlapping_extents_found(struct btree_trans *trans, 1647 enum btree_id btree, 1648 struct bpos pos1, struct snapshots_seen *pos1_seen, 1649 struct bkey pos2, 1650 bool *fixed, 1651 struct extent_end *extent_end) 1652 { 1653 struct bch_fs *c = trans->c; 1654 struct printbuf buf = PRINTBUF; 1655 struct btree_iter iter1, iter2 = { NULL }; 1656 struct bkey_s_c k1, k2; 1657 int ret; 1658 1659 BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2))); 1660 1661 bch2_trans_iter_init(trans, &iter1, btree, pos1, 1662 BTREE_ITER_all_snapshots| 1663 BTREE_ITER_not_extents); 1664 k1 = bch2_btree_iter_peek_max(&iter1, POS(pos1.inode, U64_MAX)); 1665 ret = bkey_err(k1); 1666 if (ret) 1667 goto err; 1668 1669 prt_str(&buf, "\n "); 1670 bch2_bkey_val_to_text(&buf, c, k1); 1671 1672 if (!bpos_eq(pos1, k1.k->p)) { 1673 prt_str(&buf, "\n wanted\n "); 1674 bch2_bpos_to_text(&buf, pos1); 1675 prt_str(&buf, "\n "); 1676 bch2_bkey_to_text(&buf, 
&pos2); 1677 1678 bch_err(c, "%s: error finding first overlapping extent when repairing, got%s", 1679 __func__, buf.buf); 1680 ret = -BCH_ERR_internal_fsck_err; 1681 goto err; 1682 } 1683 1684 bch2_trans_copy_iter(&iter2, &iter1); 1685 1686 while (1) { 1687 bch2_btree_iter_advance(&iter2); 1688 1689 k2 = bch2_btree_iter_peek_max(&iter2, POS(pos1.inode, U64_MAX)); 1690 ret = bkey_err(k2); 1691 if (ret) 1692 goto err; 1693 1694 if (bpos_ge(k2.k->p, pos2.p)) 1695 break; 1696 } 1697 1698 prt_str(&buf, "\n "); 1699 bch2_bkey_val_to_text(&buf, c, k2); 1700 1701 if (bpos_gt(k2.k->p, pos2.p) || 1702 pos2.size != k2.k->size) { 1703 bch_err(c, "%s: error finding seconding overlapping extent when repairing%s", 1704 __func__, buf.buf); 1705 ret = -BCH_ERR_internal_fsck_err; 1706 goto err; 1707 } 1708 1709 prt_printf(&buf, "\n overwriting %s extent", 1710 pos1.snapshot >= pos2.p.snapshot ? "first" : "second"); 1711 1712 if (fsck_err(trans, extent_overlapping, 1713 "overlapping extents%s", buf.buf)) { 1714 struct btree_iter *old_iter = &iter1; 1715 struct disk_reservation res = { 0 }; 1716 1717 if (pos1.snapshot < pos2.p.snapshot) { 1718 old_iter = &iter2; 1719 swap(k1, k2); 1720 } 1721 1722 trans->extra_disk_res += bch2_bkey_sectors_compressed(k2); 1723 1724 ret = bch2_trans_update_extent_overwrite(trans, old_iter, 1725 BTREE_UPDATE_internal_snapshot_node, 1726 k1, k2) ?: 1727 bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc); 1728 bch2_disk_reservation_put(c, &res); 1729 1730 if (ret) 1731 goto err; 1732 1733 *fixed = true; 1734 1735 if (pos1.snapshot == pos2.p.snapshot) { 1736 /* 1737 * We overwrote the first extent, and did the overwrite 1738 * in the same snapshot: 1739 */ 1740 extent_end->offset = bkey_start_offset(&pos2); 1741 } else if (pos1.snapshot > pos2.p.snapshot) { 1742 /* 1743 * We overwrote the first extent in pos2's snapshot: 1744 */ 1745 ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot); 1746 } else { 1747 /* 1748 * We overwrote the 
second extent - restart 1749 * check_extent() from the top: 1750 */ 1751 ret = -BCH_ERR_transaction_restart_nested; 1752 } 1753 } 1754 fsck_err: 1755 err: 1756 bch2_trans_iter_exit(trans, &iter2); 1757 bch2_trans_iter_exit(trans, &iter1); 1758 printbuf_exit(&buf); 1759 return ret; 1760 } 1761 1762 static int check_overlapping_extents(struct btree_trans *trans, 1763 struct snapshots_seen *seen, 1764 struct extent_ends *extent_ends, 1765 struct bkey_s_c k, 1766 struct btree_iter *iter, 1767 bool *fixed) 1768 { 1769 struct bch_fs *c = trans->c; 1770 int ret = 0; 1771 1772 /* transaction restart, running again */ 1773 if (bpos_eq(extent_ends->last_pos, k.k->p)) 1774 return 0; 1775 1776 if (extent_ends->last_pos.inode != k.k->p.inode) 1777 extent_ends_reset(extent_ends); 1778 1779 darray_for_each(extent_ends->e, i) { 1780 if (i->offset <= bkey_start_offset(k.k)) 1781 continue; 1782 1783 if (!ref_visible2(c, 1784 k.k->p.snapshot, seen, 1785 i->snapshot, &i->seen)) 1786 continue; 1787 1788 ret = overlapping_extents_found(trans, iter->btree_id, 1789 SPOS(iter->pos.inode, 1790 i->offset, 1791 i->snapshot), 1792 &i->seen, 1793 *k.k, fixed, i); 1794 if (ret) 1795 goto err; 1796 } 1797 1798 extent_ends->last_pos = k.k->p; 1799 err: 1800 return ret; 1801 } 1802 1803 static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter, 1804 struct bkey_s_c k) 1805 { 1806 struct bch_fs *c = trans->c; 1807 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 1808 struct bch_extent_crc_unpacked crc; 1809 const union bch_extent_entry *i; 1810 unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9; 1811 1812 bkey_for_each_crc(k.k, ptrs, crc, i) 1813 if (crc_is_encoded(crc) && 1814 crc.uncompressed_size > encoded_extent_max_sectors) { 1815 struct printbuf buf = PRINTBUF; 1816 1817 bch2_bkey_val_to_text(&buf, c, k); 1818 bch_err(c, "overbig encoded extent, please report this:\n %s", buf.buf); 1819 printbuf_exit(&buf); 1820 } 1821 1822 return 0; 1823 } 1824 1825 
static int check_extent(struct btree_trans *trans, struct btree_iter *iter, 1826 struct bkey_s_c k, 1827 struct inode_walker *inode, 1828 struct snapshots_seen *s, 1829 struct extent_ends *extent_ends, 1830 struct disk_reservation *res) 1831 { 1832 struct bch_fs *c = trans->c; 1833 struct printbuf buf = PRINTBUF; 1834 int ret = 0; 1835 1836 ret = bch2_check_key_has_snapshot(trans, iter, k); 1837 if (ret) { 1838 ret = ret < 0 ? ret : 0; 1839 goto out; 1840 } 1841 1842 if (inode->last_pos.inode != k.k->p.inode && inode->have_inodes) { 1843 ret = check_i_sectors(trans, inode); 1844 if (ret) 1845 goto err; 1846 } 1847 1848 ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); 1849 if (ret) 1850 goto err; 1851 1852 struct inode_walker_entry *extent_i = walk_inode(trans, inode, k); 1853 ret = PTR_ERR_OR_ZERO(extent_i); 1854 if (ret) 1855 goto err; 1856 1857 ret = check_key_has_inode(trans, iter, inode, extent_i, k); 1858 if (ret) 1859 goto err; 1860 1861 if (k.k->type != KEY_TYPE_whiteout) { 1862 ret = check_overlapping_extents(trans, s, extent_ends, k, iter, 1863 &inode->recalculate_sums); 1864 if (ret) 1865 goto err; 1866 1867 /* 1868 * Check inodes in reverse order, from oldest snapshots to 1869 * newest, starting from the inode that matches this extent's 1870 * snapshot. 
If we didn't have one, iterate over all inodes: 1871 */ 1872 for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); 1873 inode->inodes.data && i >= inode->inodes.data; 1874 --i) { 1875 if (i->snapshot > k.k->p.snapshot || 1876 !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) 1877 continue; 1878 1879 if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && 1880 !bkey_extent_is_reservation(k), 1881 trans, extent_past_end_of_inode, 1882 "extent type past end of inode %llu:%u, i_size %llu\n %s", 1883 i->inode.bi_inum, i->snapshot, i->inode.bi_size, 1884 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { 1885 struct btree_iter iter2; 1886 1887 bch2_trans_copy_iter(&iter2, iter); 1888 bch2_btree_iter_set_snapshot(&iter2, i->snapshot); 1889 ret = bch2_btree_iter_traverse(&iter2) ?: 1890 bch2_btree_delete_at(trans, &iter2, 1891 BTREE_UPDATE_internal_snapshot_node); 1892 bch2_trans_iter_exit(trans, &iter2); 1893 if (ret) 1894 goto err; 1895 1896 iter->k.type = KEY_TYPE_whiteout; 1897 break; 1898 } 1899 } 1900 } 1901 1902 ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc); 1903 if (ret) 1904 goto err; 1905 1906 if (bkey_extent_is_allocation(k.k)) { 1907 for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); 1908 inode->inodes.data && i >= inode->inodes.data; 1909 --i) { 1910 if (i->snapshot > k.k->p.snapshot || 1911 !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) 1912 continue; 1913 1914 i->count += k.k->size; 1915 } 1916 } 1917 1918 if (k.k->type != KEY_TYPE_whiteout) { 1919 ret = extent_ends_at(c, extent_ends, s, k); 1920 if (ret) 1921 goto err; 1922 } 1923 out: 1924 err: 1925 fsck_err: 1926 printbuf_exit(&buf); 1927 bch_err_fn(c, ret); 1928 return ret; 1929 } 1930 1931 /* 1932 * Walk extents: verify that extents have a corresponding S_ISREG inode, and 1933 * that i_size an i_sectors are consistent 1934 */ 1935 int bch2_check_extents(struct bch_fs *c) 1936 { 
1937 struct inode_walker w = inode_walker_init(); 1938 struct snapshots_seen s; 1939 struct extent_ends extent_ends; 1940 struct disk_reservation res = { 0 }; 1941 1942 snapshots_seen_init(&s); 1943 extent_ends_init(&extent_ends); 1944 1945 int ret = bch2_trans_run(c, 1946 for_each_btree_key(trans, iter, BTREE_ID_extents, 1947 POS(BCACHEFS_ROOT_INO, 0), 1948 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ 1949 bch2_disk_reservation_put(c, &res); 1950 check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?: 1951 check_extent_overbig(trans, &iter, k); 1952 })) ?: 1953 check_i_sectors_notnested(trans, &w)); 1954 1955 bch2_disk_reservation_put(c, &res); 1956 extent_ends_exit(&extent_ends); 1957 inode_walker_exit(&w); 1958 snapshots_seen_exit(&s); 1959 1960 bch_err_fn(c, ret); 1961 return ret; 1962 } 1963 1964 int bch2_check_indirect_extents(struct bch_fs *c) 1965 { 1966 struct disk_reservation res = { 0 }; 1967 1968 int ret = bch2_trans_run(c, 1969 for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, 1970 POS_MIN, 1971 BTREE_ITER_prefetch, k, 1972 &res, NULL, 1973 BCH_TRANS_COMMIT_no_enospc, ({ 1974 bch2_disk_reservation_put(c, &res); 1975 check_extent_overbig(trans, &iter, k); 1976 }))); 1977 1978 bch2_disk_reservation_put(c, &res); 1979 bch_err_fn(c, ret); 1980 return ret; 1981 } 1982 1983 static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w) 1984 { 1985 struct bch_fs *c = trans->c; 1986 int ret = 0; 1987 s64 count2; 1988 1989 darray_for_each(w->inodes, i) { 1990 if (i->inode.bi_nlink == i->count) 1991 continue; 1992 1993 count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot); 1994 if (count2 < 0) 1995 return count2; 1996 1997 if (i->count != count2) { 1998 bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu", 1999 w->last_pos.inode, i->snapshot, i->count, count2); 2000 i->count = count2; 2001 if (i->inode.bi_nlink == i->count) 2002 continue; 2003 } 2004 2005 
if (fsck_err_on(i->inode.bi_nlink != i->count, 2006 trans, inode_dir_wrong_nlink, 2007 "directory %llu:%u with wrong i_nlink: got %u, should be %llu", 2008 w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) { 2009 i->inode.bi_nlink = i->count; 2010 ret = bch2_fsck_write_inode(trans, &i->inode); 2011 if (ret) 2012 break; 2013 } 2014 } 2015 fsck_err: 2016 bch_err_fn(c, ret); 2017 return ret; 2018 } 2019 2020 static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) 2021 { 2022 u32 restart_count = trans->restart_count; 2023 return check_subdir_count_notnested(trans, w) ?: 2024 trans_was_restarted(trans, restart_count); 2025 } 2026 2027 noinline_for_stack 2028 static int check_dirent_inode_dirent(struct btree_trans *trans, 2029 struct btree_iter *iter, 2030 struct bkey_s_c_dirent d, 2031 struct bch_inode_unpacked *target) 2032 { 2033 struct bch_fs *c = trans->c; 2034 struct printbuf buf = PRINTBUF; 2035 struct btree_iter bp_iter = { NULL }; 2036 int ret = 0; 2037 2038 if (inode_points_to_dirent(target, d)) 2039 return 0; 2040 2041 if (!target->bi_dir && 2042 !target->bi_dir_offset) { 2043 fsck_err_on(S_ISDIR(target->bi_mode), 2044 trans, inode_dir_missing_backpointer, 2045 "directory with missing backpointer\n%s", 2046 (printbuf_reset(&buf), 2047 bch2_bkey_val_to_text(&buf, c, d.s_c), 2048 prt_printf(&buf, "\n"), 2049 bch2_inode_unpacked_to_text(&buf, target), 2050 buf.buf)); 2051 2052 fsck_err_on(target->bi_flags & BCH_INODE_unlinked, 2053 trans, inode_unlinked_but_has_dirent, 2054 "inode unlinked but has dirent\n%s", 2055 (printbuf_reset(&buf), 2056 bch2_bkey_val_to_text(&buf, c, d.s_c), 2057 prt_printf(&buf, "\n"), 2058 bch2_inode_unpacked_to_text(&buf, target), 2059 buf.buf)); 2060 2061 target->bi_flags &= ~BCH_INODE_unlinked; 2062 target->bi_dir = d.k->p.inode; 2063 target->bi_dir_offset = d.k->p.offset; 2064 return __bch2_fsck_write_inode(trans, target); 2065 } 2066 2067 if (bch2_inode_should_have_single_bp(target) && 2068 
!fsck_err(trans, inode_wrong_backpointer, 2069 "dirent points to inode that does not point back:\n %s", 2070 (bch2_bkey_val_to_text(&buf, c, d.s_c), 2071 prt_printf(&buf, "\n "), 2072 bch2_inode_unpacked_to_text(&buf, target), 2073 buf.buf))) 2074 goto err; 2075 2076 struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter, 2077 SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot)); 2078 ret = bkey_err(bp_dirent); 2079 if (ret && !bch2_err_matches(ret, ENOENT)) 2080 goto err; 2081 2082 bool backpointer_exists = !ret; 2083 ret = 0; 2084 2085 if (fsck_err_on(!backpointer_exists, 2086 trans, inode_wrong_backpointer, 2087 "inode %llu:%u has wrong backpointer:\n" 2088 "got %llu:%llu\n" 2089 "should be %llu:%llu", 2090 target->bi_inum, target->bi_snapshot, 2091 target->bi_dir, 2092 target->bi_dir_offset, 2093 d.k->p.inode, 2094 d.k->p.offset)) { 2095 target->bi_dir = d.k->p.inode; 2096 target->bi_dir_offset = d.k->p.offset; 2097 ret = __bch2_fsck_write_inode(trans, target); 2098 goto out; 2099 } 2100 2101 bch2_bkey_val_to_text(&buf, c, d.s_c); 2102 prt_newline(&buf); 2103 if (backpointer_exists) 2104 bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); 2105 2106 if (fsck_err_on(backpointer_exists && 2107 (S_ISDIR(target->bi_mode) || 2108 target->bi_subvol), 2109 trans, inode_dir_multiple_links, 2110 "%s %llu:%u with multiple links\n%s", 2111 S_ISDIR(target->bi_mode) ? 
"directory" : "subvolume", 2112 target->bi_inum, target->bi_snapshot, buf.buf)) { 2113 ret = __remove_dirent(trans, d.k->p); 2114 goto out; 2115 } 2116 2117 /* 2118 * hardlinked file with nlink 0: 2119 * We're just adjusting nlink here so check_nlinks() will pick 2120 * it up, it ignores inodes with nlink 0 2121 */ 2122 if (fsck_err_on(backpointer_exists && !target->bi_nlink, 2123 trans, inode_multiple_links_but_nlink_0, 2124 "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", 2125 target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { 2126 target->bi_nlink++; 2127 target->bi_flags &= ~BCH_INODE_unlinked; 2128 ret = __bch2_fsck_write_inode(trans, target); 2129 if (ret) 2130 goto err; 2131 } 2132 out: 2133 err: 2134 fsck_err: 2135 bch2_trans_iter_exit(trans, &bp_iter); 2136 printbuf_exit(&buf); 2137 bch_err_fn(c, ret); 2138 return ret; 2139 } 2140 2141 noinline_for_stack 2142 static int check_dirent_target(struct btree_trans *trans, 2143 struct btree_iter *iter, 2144 struct bkey_s_c_dirent d, 2145 struct bch_inode_unpacked *target) 2146 { 2147 struct bch_fs *c = trans->c; 2148 struct bkey_i_dirent *n; 2149 struct printbuf buf = PRINTBUF; 2150 int ret = 0; 2151 2152 ret = check_dirent_inode_dirent(trans, iter, d, target); 2153 if (ret) 2154 goto err; 2155 2156 if (fsck_err_on(d.v->d_type != inode_d_type(target), 2157 trans, dirent_d_type_wrong, 2158 "incorrect d_type: got %s, should be %s:\n%s", 2159 bch2_d_type_str(d.v->d_type), 2160 bch2_d_type_str(inode_d_type(target)), 2161 (printbuf_reset(&buf), 2162 bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { 2163 n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); 2164 ret = PTR_ERR_OR_ZERO(n); 2165 if (ret) 2166 goto err; 2167 2168 bkey_reassemble(&n->k_i, d.s_c); 2169 n->v.d_type = inode_d_type(target); 2170 if (n->v.d_type == DT_SUBVOL) { 2171 n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); 2172 n->v.d_child_subvol = cpu_to_le32(target->bi_subvol); 2173 } else { 2174 
n->v.d_inum = cpu_to_le64(target->bi_inum); 2175 } 2176 2177 ret = bch2_trans_update(trans, iter, &n->k_i, 0); 2178 if (ret) 2179 goto err; 2180 2181 d = dirent_i_to_s_c(n); 2182 } 2183 err: 2184 fsck_err: 2185 printbuf_exit(&buf); 2186 bch_err_fn(c, ret); 2187 return ret; 2188 } 2189 2190 /* find a subvolume that's a descendent of @snapshot: */ 2191 static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid) 2192 { 2193 struct btree_iter iter; 2194 struct bkey_s_c k; 2195 int ret; 2196 2197 for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) { 2198 if (k.k->type != KEY_TYPE_subvolume) 2199 continue; 2200 2201 struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); 2202 if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) { 2203 bch2_trans_iter_exit(trans, &iter); 2204 *subvolid = k.k->p.offset; 2205 goto found; 2206 } 2207 } 2208 if (!ret) 2209 ret = -ENOENT; 2210 found: 2211 bch2_trans_iter_exit(trans, &iter); 2212 return ret; 2213 } 2214 2215 noinline_for_stack 2216 static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter, 2217 struct bkey_s_c_dirent d) 2218 { 2219 struct bch_fs *c = trans->c; 2220 struct btree_iter subvol_iter = {}; 2221 struct bch_inode_unpacked subvol_root; 2222 u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol); 2223 u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); 2224 u32 parent_snapshot; 2225 u32 new_parent_subvol = 0; 2226 u64 parent_inum; 2227 struct printbuf buf = PRINTBUF; 2228 int ret = 0; 2229 2230 ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum); 2231 if (ret && !bch2_err_matches(ret, ENOENT)) 2232 return ret; 2233 2234 if (ret || 2235 (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) { 2236 int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol); 2237 if (ret2 && !bch2_err_matches(ret, ENOENT)) 2238 return ret2; 2239 } 2240 2241 if (ret && 
2242 !new_parent_subvol && 2243 (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) { 2244 /* 2245 * Couldn't find a subvol for dirent's snapshot - but we lost 2246 * subvols, so we need to reconstruct: 2247 */ 2248 ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0); 2249 if (ret) 2250 return ret; 2251 2252 parent_snapshot = d.k->p.snapshot; 2253 } 2254 2255 if (fsck_err_on(ret, 2256 trans, dirent_to_missing_parent_subvol, 2257 "dirent parent_subvol points to missing subvolume\n%s", 2258 (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) || 2259 fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot), 2260 trans, dirent_not_visible_in_parent_subvol, 2261 "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s", 2262 parent_snapshot, 2263 (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { 2264 if (!new_parent_subvol) { 2265 bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot); 2266 return -BCH_ERR_fsck_repair_unimplemented; 2267 } 2268 2269 struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent); 2270 ret = PTR_ERR_OR_ZERO(new_dirent); 2271 if (ret) 2272 goto err; 2273 2274 new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol); 2275 } 2276 2277 struct bkey_s_c_subvolume s = 2278 bch2_bkey_get_iter_typed(trans, &subvol_iter, 2279 BTREE_ID_subvolumes, POS(0, target_subvol), 2280 0, subvolume); 2281 ret = bkey_err(s.s_c); 2282 if (ret && !bch2_err_matches(ret, ENOENT)) 2283 return ret; 2284 2285 if (ret) { 2286 if (fsck_err(trans, dirent_to_missing_subvol, 2287 "dirent points to missing subvolume\n%s", 2288 (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) 2289 return __remove_dirent(trans, d.k->p); 2290 ret = 0; 2291 goto out; 2292 } 2293 2294 if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol, 2295 trans, subvol_fs_path_parent_wrong, 2296 "subvol with wrong fs_path_parent, should be be %u\n%s", 2297 parent_subvol, 2298 
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { 2299 struct bkey_i_subvolume *n = 2300 bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume); 2301 ret = PTR_ERR_OR_ZERO(n); 2302 if (ret) 2303 goto err; 2304 2305 n->v.fs_path_parent = cpu_to_le32(parent_subvol); 2306 } 2307 2308 u64 target_inum = le64_to_cpu(s.v->inode); 2309 u32 target_snapshot = le32_to_cpu(s.v->snapshot); 2310 2311 ret = lookup_inode(trans, target_inum, target_snapshot, &subvol_root); 2312 if (ret && !bch2_err_matches(ret, ENOENT)) 2313 goto err; 2314 2315 if (ret) { 2316 bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum); 2317 ret = -BCH_ERR_fsck_repair_unimplemented; 2318 goto err; 2319 } 2320 2321 if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol, 2322 trans, inode_bi_parent_wrong, 2323 "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u", 2324 target_inum, 2325 subvol_root.bi_parent_subvol, parent_subvol)) { 2326 subvol_root.bi_parent_subvol = parent_subvol; 2327 subvol_root.bi_snapshot = le32_to_cpu(s.v->snapshot); 2328 ret = __bch2_fsck_write_inode(trans, &subvol_root); 2329 if (ret) 2330 goto err; 2331 } 2332 2333 ret = check_dirent_target(trans, iter, d, &subvol_root); 2334 if (ret) 2335 goto err; 2336 out: 2337 err: 2338 fsck_err: 2339 bch2_trans_iter_exit(trans, &subvol_iter); 2340 printbuf_exit(&buf); 2341 return ret; 2342 } 2343 2344 static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, 2345 struct bkey_s_c k, 2346 struct bch_hash_info *hash_info, 2347 struct inode_walker *dir, 2348 struct inode_walker *target, 2349 struct snapshots_seen *s) 2350 { 2351 struct bch_fs *c = trans->c; 2352 struct inode_walker_entry *i; 2353 struct printbuf buf = PRINTBUF; 2354 int ret = 0; 2355 2356 ret = bch2_check_key_has_snapshot(trans, iter, k); 2357 if (ret) { 2358 ret = ret < 0 ? 
ret : 0; 2359 goto out; 2360 } 2361 2362 ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); 2363 if (ret) 2364 goto err; 2365 2366 if (k.k->type == KEY_TYPE_whiteout) 2367 goto out; 2368 2369 if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) { 2370 ret = check_subdir_count(trans, dir); 2371 if (ret) 2372 goto err; 2373 } 2374 2375 i = walk_inode(trans, dir, k); 2376 ret = PTR_ERR_OR_ZERO(i); 2377 if (ret < 0) 2378 goto err; 2379 2380 ret = check_key_has_inode(trans, iter, dir, i, k); 2381 if (ret) 2382 goto err; 2383 2384 if (!i) 2385 goto out; 2386 2387 if (dir->first_this_inode) 2388 *hash_info = bch2_hash_info_init(c, &i->inode); 2389 dir->first_this_inode = false; 2390 2391 ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, iter, k); 2392 if (ret < 0) 2393 goto err; 2394 if (ret) { 2395 /* dirent has been deleted */ 2396 ret = 0; 2397 goto out; 2398 } 2399 2400 if (k.k->type != KEY_TYPE_dirent) 2401 goto out; 2402 2403 struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); 2404 2405 if (d.v->d_type == DT_SUBVOL) { 2406 ret = check_dirent_to_subvol(trans, iter, d); 2407 if (ret) 2408 goto err; 2409 } else { 2410 ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum)); 2411 if (ret) 2412 goto err; 2413 2414 if (fsck_err_on(!target->inodes.nr, 2415 trans, dirent_to_missing_inode, 2416 "dirent points to missing inode:\n%s", 2417 (printbuf_reset(&buf), 2418 bch2_bkey_val_to_text(&buf, c, k), 2419 buf.buf))) { 2420 ret = __remove_dirent(trans, d.k->p); 2421 if (ret) 2422 goto err; 2423 } 2424 2425 darray_for_each(target->inodes, i) { 2426 ret = check_dirent_target(trans, iter, d, &i->inode); 2427 if (ret) 2428 goto err; 2429 } 2430 2431 darray_for_each(target->deletes, i) 2432 if (fsck_err_on(!snapshot_list_has_id(&s->ids, *i), 2433 trans, dirent_to_overwritten_inode, 2434 "dirent points to inode overwritten in snapshot %u:\n%s", 2435 *i, 2436 (printbuf_reset(&buf), 2437 bch2_bkey_val_to_text(&buf, c, k), 2438 
buf.buf))) { 2439 struct btree_iter delete_iter; 2440 bch2_trans_iter_init(trans, &delete_iter, 2441 BTREE_ID_dirents, 2442 SPOS(k.k->p.inode, k.k->p.offset, *i), 2443 BTREE_ITER_intent); 2444 ret = bch2_btree_iter_traverse(&delete_iter) ?: 2445 bch2_hash_delete_at(trans, bch2_dirent_hash_desc, 2446 hash_info, 2447 &delete_iter, 2448 BTREE_UPDATE_internal_snapshot_node); 2449 bch2_trans_iter_exit(trans, &delete_iter); 2450 if (ret) 2451 goto err; 2452 2453 } 2454 } 2455 2456 ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); 2457 if (ret) 2458 goto err; 2459 2460 if (d.v->d_type == DT_DIR) 2461 for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) 2462 i->count++; 2463 out: 2464 err: 2465 fsck_err: 2466 printbuf_exit(&buf); 2467 bch_err_fn(c, ret); 2468 return ret; 2469 } 2470 2471 /* 2472 * Walk dirents: verify that they all have a corresponding S_ISDIR inode, 2473 * validate d_type 2474 */ 2475 int bch2_check_dirents(struct bch_fs *c) 2476 { 2477 struct inode_walker dir = inode_walker_init(); 2478 struct inode_walker target = inode_walker_init(); 2479 struct snapshots_seen s; 2480 struct bch_hash_info hash_info; 2481 2482 snapshots_seen_init(&s); 2483 2484 int ret = bch2_trans_run(c, 2485 for_each_btree_key(trans, iter, BTREE_ID_dirents, 2486 POS(BCACHEFS_ROOT_INO, 0), 2487 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, 2488 check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?: 2489 check_subdir_count_notnested(trans, &dir)); 2490 2491 snapshots_seen_exit(&s); 2492 inode_walker_exit(&dir); 2493 inode_walker_exit(&target); 2494 bch_err_fn(c, ret); 2495 return ret; 2496 } 2497 2498 static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, 2499 struct bkey_s_c k, 2500 struct bch_hash_info *hash_info, 2501 struct inode_walker *inode) 2502 { 2503 struct bch_fs *c = trans->c; 2504 struct inode_walker_entry *i; 2505 int ret; 2506 2507 ret = bch2_check_key_has_snapshot(trans, iter, k); 2508 if (ret < 0) 2509 return ret; 
2510 if (ret) 2511 return 0; 2512 2513 i = walk_inode(trans, inode, k); 2514 ret = PTR_ERR_OR_ZERO(i); 2515 if (ret) 2516 return ret; 2517 2518 ret = check_key_has_inode(trans, iter, inode, i, k); 2519 if (ret) 2520 return ret; 2521 2522 if (!i) 2523 return 0; 2524 2525 if (inode->first_this_inode) 2526 *hash_info = bch2_hash_info_init(c, &i->inode); 2527 inode->first_this_inode = false; 2528 2529 ret = bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, iter, k); 2530 bch_err_fn(c, ret); 2531 return ret; 2532 } 2533 2534 /* 2535 * Walk xattrs: verify that they all have a corresponding inode 2536 */ 2537 int bch2_check_xattrs(struct bch_fs *c) 2538 { 2539 struct inode_walker inode = inode_walker_init(); 2540 struct bch_hash_info hash_info; 2541 int ret = 0; 2542 2543 ret = bch2_trans_run(c, 2544 for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, 2545 POS(BCACHEFS_ROOT_INO, 0), 2546 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, 2547 k, 2548 NULL, NULL, 2549 BCH_TRANS_COMMIT_no_enospc, 2550 check_xattr(trans, &iter, k, &hash_info, &inode))); 2551 2552 inode_walker_exit(&inode); 2553 bch_err_fn(c, ret); 2554 return ret; 2555 } 2556 2557 static int check_root_trans(struct btree_trans *trans) 2558 { 2559 struct bch_fs *c = trans->c; 2560 struct bch_inode_unpacked root_inode; 2561 u32 snapshot; 2562 u64 inum; 2563 int ret; 2564 2565 ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum); 2566 if (ret && !bch2_err_matches(ret, ENOENT)) 2567 return ret; 2568 2569 if (mustfix_fsck_err_on(ret, trans, root_subvol_missing, 2570 "root subvol missing")) { 2571 struct bkey_i_subvolume *root_subvol = 2572 bch2_trans_kmalloc(trans, sizeof(*root_subvol)); 2573 ret = PTR_ERR_OR_ZERO(root_subvol); 2574 if (ret) 2575 goto err; 2576 2577 snapshot = U32_MAX; 2578 inum = BCACHEFS_ROOT_INO; 2579 2580 bkey_subvolume_init(&root_subvol->k_i); 2581 root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL; 2582 root_subvol->v.flags = 0; 2583 root_subvol->v.snapshot = 
cpu_to_le32(snapshot); 2584 root_subvol->v.inode = cpu_to_le64(inum); 2585 ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0); 2586 bch_err_msg(c, ret, "writing root subvol"); 2587 if (ret) 2588 goto err; 2589 } 2590 2591 ret = lookup_inode(trans, BCACHEFS_ROOT_INO, snapshot, &root_inode); 2592 if (ret && !bch2_err_matches(ret, ENOENT)) 2593 return ret; 2594 2595 if (mustfix_fsck_err_on(ret, 2596 trans, root_dir_missing, 2597 "root directory missing") || 2598 mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), 2599 trans, root_inode_not_dir, 2600 "root inode not a directory")) { 2601 bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 2602 0, NULL); 2603 root_inode.bi_inum = inum; 2604 root_inode.bi_snapshot = snapshot; 2605 2606 ret = __bch2_fsck_write_inode(trans, &root_inode); 2607 bch_err_msg(c, ret, "writing root inode"); 2608 } 2609 err: 2610 fsck_err: 2611 return ret; 2612 } 2613 2614 /* Get root directory, create if it doesn't exist: */ 2615 int bch2_check_root(struct bch_fs *c) 2616 { 2617 int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 2618 check_root_trans(trans)); 2619 bch_err_fn(c, ret); 2620 return ret; 2621 } 2622 2623 typedef DARRAY(u32) darray_u32; 2624 2625 static bool darray_u32_has(darray_u32 *d, u32 v) 2626 { 2627 darray_for_each(*d, i) 2628 if (*i == v) 2629 return true; 2630 return false; 2631 } 2632 2633 static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) 2634 { 2635 struct bch_fs *c = trans->c; 2636 struct btree_iter parent_iter = {}; 2637 darray_u32 subvol_path = {}; 2638 struct printbuf buf = PRINTBUF; 2639 int ret = 0; 2640 2641 if (k.k->type != KEY_TYPE_subvolume) 2642 return 0; 2643 2644 while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) { 2645 ret = darray_push(&subvol_path, k.k->p.offset); 2646 if (ret) 2647 goto err; 2648 2649 struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); 2650 2651 struct bch_inode_unpacked subvol_root; 2652 ret 
= bch2_inode_find_by_inum_trans(trans, 2653 (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) }, 2654 &subvol_root); 2655 if (ret) 2656 break; 2657 2658 u32 parent = le32_to_cpu(s.v->fs_path_parent); 2659 2660 if (darray_u32_has(&subvol_path, parent)) { 2661 if (fsck_err(c, subvol_loop, "subvolume loop")) 2662 ret = reattach_subvol(trans, s); 2663 break; 2664 } 2665 2666 bch2_trans_iter_exit(trans, &parent_iter); 2667 bch2_trans_iter_init(trans, &parent_iter, 2668 BTREE_ID_subvolumes, POS(0, parent), 0); 2669 k = bch2_btree_iter_peek_slot(&parent_iter); 2670 ret = bkey_err(k); 2671 if (ret) 2672 goto err; 2673 2674 if (fsck_err_on(k.k->type != KEY_TYPE_subvolume, 2675 trans, subvol_unreachable, 2676 "unreachable subvolume %s", 2677 (bch2_bkey_val_to_text(&buf, c, s.s_c), 2678 buf.buf))) { 2679 ret = reattach_subvol(trans, s); 2680 break; 2681 } 2682 } 2683 fsck_err: 2684 err: 2685 printbuf_exit(&buf); 2686 darray_exit(&subvol_path); 2687 bch2_trans_iter_exit(trans, &parent_iter); 2688 return ret; 2689 } 2690 2691 int bch2_check_subvolume_structure(struct bch_fs *c) 2692 { 2693 int ret = bch2_trans_run(c, 2694 for_each_btree_key_commit(trans, iter, 2695 BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, 2696 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 2697 check_subvol_path(trans, &iter, k))); 2698 bch_err_fn(c, ret); 2699 return ret; 2700 } 2701 2702 struct pathbuf_entry { 2703 u64 inum; 2704 u32 snapshot; 2705 }; 2706 2707 typedef DARRAY(struct pathbuf_entry) pathbuf; 2708 2709 static int bch2_bi_depth_renumber_one(struct btree_trans *trans, struct pathbuf_entry *p, 2710 u32 new_depth) 2711 { 2712 struct btree_iter iter; 2713 struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, 2714 SPOS(0, p->inum, p->snapshot), 0); 2715 2716 struct bch_inode_unpacked inode; 2717 int ret = bkey_err(k) ?: 2718 !bkey_is_inode(k.k) ? 
					    -BCH_ERR_ENOENT_inode
		: bch2_inode_unpack(k, &inode);
	if (ret)
		goto err;

	/* Only write if the depth actually changed */
	if (inode.bi_depth != new_depth) {
		inode.bi_depth = new_depth;
		ret = __bch2_fsck_write_inode(trans, &inode) ?:
			bch2_trans_commit(trans, NULL, NULL, 0);
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Renumber bi_depth for every inode on @path, deepest entry getting
 * @new_bi_depth and each ancestor one more.
 */
static int bch2_bi_depth_renumber(struct btree_trans *trans, pathbuf *path, u32 new_bi_depth)
{
	u32 restart_count = trans->restart_count;
	int ret = 0;

	darray_for_each_reverse(*path, i) {
		ret = nested_lockrestart_do(trans,
				bch2_bi_depth_renumber_one(trans, i, new_bi_depth));
		bch_err_fn(trans->c, ret);
		if (ret)
			break;

		new_bi_depth++;
	}

	return ret ?: trans_was_restarted(trans, restart_count);
}

/* Has (inum, snapshot) already been visited on this walk? */
static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
{
	darray_for_each(*p, i)
		if (i->inum == inum &&
		    i->snapshot == snapshot)
			return true;
	return false;
}

/*
 * Walk from @inode_k up through bi_dir backpointers to a subvolume root,
 * detecting directory loops (repaired by reattaching) and fixing up
 * inconsistent bi_depth values along the way.
 */
static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k)
{
	struct bch_fs *c = trans->c;
	struct btree_iter inode_iter = {};
	pathbuf path = {};
	struct printbuf buf = PRINTBUF;
	u32 snapshot = inode_k.k->p.snapshot;
	bool redo_bi_depth = false;
	u32 min_bi_depth = U32_MAX;
	int ret = 0;

	struct bch_inode_unpacked inode;
	ret = bch2_inode_unpack(inode_k, &inode);
	if (ret)
		return ret;

	/* bi_subvol nonzero => this inode is a subvolume root: stop there */
	while (!inode.bi_subvol) {
		struct btree_iter dirent_iter;
		struct bkey_s_c_dirent d;
		u32 parent_snapshot = snapshot;

		d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot);
		ret = bkey_err(d.s_c);
		if (ret && !bch2_err_matches(ret, ENOENT))
			goto out;

		if (!ret && (ret = dirent_points_to_inode(c, d, &inode)))
			bch2_trans_iter_exit(trans, &dirent_iter);

		if (bch2_err_matches(ret, ENOENT)) {
			/* no (matching) dirent: inode is unreachable */
			printbuf_reset(&buf);
			bch2_bkey_val_to_text(&buf, c, inode_k);
			bch_err(c, "unreachable inode in check_directory_structure: %s\n%s",
				bch2_err_str(ret), buf.buf);
			goto out;
		}

		bch2_trans_iter_exit(trans, &dirent_iter);

		ret = darray_push(&path, ((struct pathbuf_entry) {
			.inum		= inode.bi_inum,
			.snapshot	= snapshot,
		}));
		if (ret)
			return ret;

		snapshot = parent_snapshot;

		bch2_trans_iter_exit(trans, &inode_iter);
		inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
					     SPOS(0, inode.bi_dir, snapshot), 0);

		struct bch_inode_unpacked parent_inode;
		ret = bkey_err(inode_k) ?:
			!bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
			: bch2_inode_unpack(inode_k, &parent_inode);
		if (ret) {
			/* Should have been caught in dirents pass */
			bch_err_msg(c, ret, "error looking up parent directory");
			goto out;
		}

		min_bi_depth = parent_inode.bi_depth;

		/* Depths already consistent (and sane) => nothing left to fix */
		if (parent_inode.bi_depth < inode.bi_depth &&
		    min_bi_depth < U16_MAX)
			break;

		inode = parent_inode;
		snapshot = inode_k.k->p.snapshot;
		redo_bi_depth = true;

		if (path_is_dup(&path, inode.bi_inum, snapshot)) {
			/* XXX print path */
			bch_err(c, "directory structure loop");

			darray_for_each(path, i)
				pr_err("%llu:%u", i->inum, i->snapshot);
			pr_err("%llu:%u", inode.bi_inum, snapshot);

			if (fsck_err(trans, dir_loop, "directory structure loop")) {
				/* Break the loop: detach, then reattach to lost+found */
				ret = remove_backpointer(trans, &inode);
				bch_err_msg(c, ret, "removing dirent");
				if (ret)
					break;

				ret = reattach_inode(trans, &inode);
				bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
			}

			goto out;
		}
	}

	if (inode.bi_subvol)
		min_bi_depth = 0;

	if (redo_bi_depth)
		ret = bch2_bi_depth_renumber(trans, &path, min_bi_depth);
out:
fsck_err:
	bch2_trans_iter_exit(trans, &inode_iter);
	darray_exit(&path);
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Check for loops in the directory structure: all other connectivity issues
 * have been fixed by prior passes
 */
int bch2_check_directory_structure(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
					  BTREE_ITER_intent|
					  BTREE_ITER_prefetch|
					  BTREE_ITER_all_snapshots, k,
					  NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
			if (!S_ISDIR(bkey_inode_mode(k)))
				continue;

			if (bch2_inode_flags(k) & BCH_INODE_unlinked)
				continue;

			check_path_loop(trans, k);
		})));

	bch_err_fn(c, ret);
	return ret;
}

/* Growable, sorted-by-caller table of (inum, snapshot) -> link count */
struct nlink_table {
	size_t		nr;
	size_t		size;

	struct nlink {
		u64	inum;
		u32	snapshot;
		u32	count;
	}		*d;
};

/*
 * Append an entry to @t, doubling the allocation as needed (count starts at
 * zero via the designated initializer).
 */
static int add_nlink(struct bch_fs *c, struct nlink_table *t,
		     u64 inum, u32 snapshot)
{
	if (t->nr == t->size) {
		size_t new_size = max_t(size_t, 128UL, t->size * 2);
		void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);

		if (!d) {
			bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
				new_size);
			return -BCH_ERR_ENOMEM_fsck_add_nlink;
		}

		if (t->d)
			memcpy(d, t->d, t->size * sizeof(t->d[0]));
		kvfree(t->d);

		t->d = d;
		t->size = new_size;
	}


	t->d[t->nr++] = (struct nlink) {
		.inum		= inum,
		.snapshot	= snapshot,
	};

	return 0;
}

/* Order nlink entries by inum only; snapshot ties resolved by the callers */
static int nlink_cmp(const void *_l, const void *_r)
{
	const struct nlink *l = _l;
	const struct nlink *r = _r;

	return cmp_int(l->inum, r->inum);
}

/*
 * Bump the link count for @inum in @links for the entry whose snapshot is
 * visible from @snapshot; entries outside [range_start, range_end) are
 * ignored (they belong to another pass of the table).
 */
static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
		     struct nlink_table *links,
		     u64 range_start, u64 range_end, u64
		     inum, u32 snapshot)
{
	struct nlink *link, key = {
		.inum = inum, .snapshot = U32_MAX,
	};

	if (inum < range_start || inum >= range_end)
		return;

	link = __inline_bsearch(&key, links->d, links->nr,
				sizeof(links->d[0]), nlink_cmp);
	if (!link)
		return;

	/* bsearch may land anywhere in a run of equal inums: rewind to start */
	while (link > links->d && link[0].inum == link[-1].inum)
		--link;

	for (; link < links->d + links->nr && link->inum == inum; link++)
		if (ref_visible(c, s, snapshot, link->snapshot)) {
			link->count++;
			if (link->snapshot >= snapshot)
				break;
		}
}

/*
 * Pass 1: collect every non-directory inode with nonzero bi_nlink into @t,
 * starting from inode number @start. If the table can't grow any further,
 * stop early and record the resume point in *end.
 */
noinline_for_stack
static int check_nlinks_find_hardlinks(struct bch_fs *c,
				       struct nlink_table *t,
				       u64 start, u64 *end)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_inodes,
				   POS(0, start),
				   BTREE_ITER_intent|
				   BTREE_ITER_prefetch|
				   BTREE_ITER_all_snapshots, k, ({
			if (!bkey_is_inode(k.k))
				continue;

			/* Should never fail, checked by bch2_inode_invalid: */
			struct bch_inode_unpacked u;
			_ret3 = bch2_inode_unpack(k, &u);
			if (_ret3)
				break;

			/*
			 * Backpointer and directory structure checks are sufficient for
			 * directories, since they can't have hardlinks:
			 */
			if (S_ISDIR(u.bi_mode))
				continue;

			/*
			 * Previous passes ensured that bi_nlink is nonzero if
			 * it had multiple hardlinks:
			 */
			if (!u.bi_nlink)
				continue;

			ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
			if (ret) {
				/* table full: resume next iteration from here */
				*end = k.k->p.offset;
				ret = 0;
				break;
			}
			0;
		})));

	bch_err_fn(c, ret);
	return ret;
}

/*
 * Pass 2: walk all dirents and count the links to each inode collected in
 * pass 1 (directories and subvolumes are skipped - not hardlinkable).
 */
noinline_for_stack
static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
				     u64 range_start, u64 range_end)
{
	struct snapshots_seen s;

	snapshots_seen_init(&s);

	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
				   BTREE_ITER_intent|
				   BTREE_ITER_prefetch|
				   BTREE_ITER_all_snapshots, k, ({
			ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
			if (ret)
				break;

			if (k.k->type == KEY_TYPE_dirent) {
				struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);

				if (d.v->d_type != DT_DIR &&
				    d.v->d_type != DT_SUBVOL)
					inc_link(c, &s, links, range_start, range_end,
						 le64_to_cpu(d.v->d_inum), d.k->p.snapshot);
			}
			0;
		})));

	snapshots_seen_exit(&s);

	bch_err_fn(c, ret);
	return ret;
}

/*
 * Pass 3 helper: compare one inode's stored nlink against the count in
 * @links and repair it if they disagree. Returns 1 (stop iteration) once
 * past @range_end. @idx advances through the sorted table in lockstep with
 * the inode walk.
 */
static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
				     struct bkey_s_c k,
				     struct nlink_table *links,
				     size_t *idx, u64 range_end)
{
	struct bch_inode_unpacked u;
	struct nlink *link = &links->d[*idx];
	int ret = 0;

	if (k.k->p.offset >= range_end)
		return 1;

	if (!bkey_is_inode(k.k))
		return 0;

	ret = bch2_inode_unpack(k, &u);
	if (ret)
		return ret;

	if (S_ISDIR(u.bi_mode))
		return 0;

	if (!u.bi_nlink)
		return 0;

	/* Advance to this inode's entry; it must exist (built in pass 1) */
	while ((cmp_int(link->inum, k.k->p.offset) ?:
		cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
		BUG_ON(*idx == links->nr);
		link = &links->d[++*idx];
	}

	if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
			trans, inode_wrong_nlink,
			"inode %llu type %s has wrong i_nlink (%u, should be %u)",
			u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
			bch2_inode_nlink_get(&u), link->count)) {
		bch2_inode_nlink_set(&u, link->count);
		ret = __bch2_fsck_write_inode(trans, &u);
	}
fsck_err:
	return ret;
}

/*
 * Pass 3: re-walk the inodes in [range_start, range_end) and fix up any
 * nlink mismatches found against @links.
 */
noinline_for_stack
static int check_nlinks_update_hardlinks(struct bch_fs *c,
					 struct nlink_table *links,
					 u64 range_start, u64 range_end)
{
	size_t idx = 0;

	int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, 3102 POS(0, range_start), 3103 BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, 3104 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 3105 check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end))); 3106 if (ret < 0) { 3107 bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret)); 3108 return ret; 3109 } 3110 3111 return 0; 3112 } 3113 3114 int bch2_check_nlinks(struct bch_fs *c) 3115 { 3116 struct nlink_table links = { 0 }; 3117 u64 this_iter_range_start, next_iter_range_start = 0; 3118 int ret = 0; 3119 3120 do { 3121 this_iter_range_start = next_iter_range_start; 3122 next_iter_range_start = U64_MAX; 3123 3124 ret = check_nlinks_find_hardlinks(c, &links, 3125 this_iter_range_start, 3126 &next_iter_range_start); 3127 3128 ret = check_nlinks_walk_dirents(c, &links, 3129 this_iter_range_start, 3130 next_iter_range_start); 3131 if (ret) 3132 break; 3133 3134 ret = check_nlinks_update_hardlinks(c, &links, 3135 this_iter_range_start, 3136 next_iter_range_start); 3137 if (ret) 3138 break; 3139 3140 links.nr = 0; 3141 } while (next_iter_range_start != U64_MAX); 3142 3143 kvfree(links.d); 3144 bch_err_fn(c, ret); 3145 return ret; 3146 } 3147 3148 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, 3149 struct bkey_s_c k) 3150 { 3151 struct bkey_s_c_reflink_p p; 3152 struct bkey_i_reflink_p *u; 3153 3154 if (k.k->type != KEY_TYPE_reflink_p) 3155 return 0; 3156 3157 p = bkey_s_c_to_reflink_p(k); 3158 3159 if (!p.v->front_pad && !p.v->back_pad) 3160 return 0; 3161 3162 u = bch2_trans_kmalloc(trans, sizeof(*u)); 3163 int ret = PTR_ERR_OR_ZERO(u); 3164 if (ret) 3165 return ret; 3166 3167 bkey_reassemble(&u->k_i, k); 3168 u->v.front_pad = 0; 3169 u->v.back_pad = 0; 3170 3171 return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun); 3172 } 3173 3174 int bch2_fix_reflink_p(struct bch_fs *c) 3175 { 3176 if (c->sb.version >= 
	    bcachefs_metadata_version_reflink_p_fix)
		return 0;

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
				BTREE_ID_extents, POS_MIN,
				BTREE_ITER_intent|BTREE_ITER_prefetch|
				BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			fix_reflink_p_key(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}

#ifndef NO_BCACHEFS_CHARDEV

/* State for a userspace-driven fsck run over the chardev interface */
struct fsck_thread {
	struct thread_with_stdio thr;
	struct bch_fs	*c;
	struct bch_opts	opts;
};

static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
{
	struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr);
	kfree(thr);
}

/*
 * Offline fsck worker: start the (already opened, nostart) filesystem -
 * recovery/fsck runs as part of startup - then report results over stdio.
 * Return value mimics fsck(8) exit codes (1 = errors fixed, 4 = errors left).
 */
static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio)
{
	struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
	struct bch_fs *c = thr->c;

	int ret = PTR_ERR_OR_ZERO(c);
	if (ret)
		return ret;

	ret = bch2_fs_start(thr->c);
	if (ret)
		goto err;

	if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
		bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name);
		ret |= 1;
	}
	if (test_bit(BCH_FS_error, &c->flags)) {
		bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name);
		ret |= 4;
	}
err:
	bch2_fs_stop(c);
	return ret;
}

static const struct thread_with_stdio_ops bch2_offline_fsck_ops = {
	.exit		= bch2_fsck_thread_exit,
	.fn		= bch2_fsck_offline_thread_fn,
};

/*
 * BCH_IOCTL_FSCK_OFFLINE: open the given devices read-only and run fsck in a
 * kernel thread, with its stdio redirected to the calling process.
 */
long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
{
	struct bch_ioctl_fsck_offline arg;
	struct fsck_thread *thr = NULL;
	darray_str(devs) = {};
	long ret = 0;

	if (copy_from_user(&arg, user_arg, sizeof(arg)))
		return -EFAULT;

	if (arg.flags)
		return -EINVAL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* Copy in the device path strings from userspace */
	for (size_t i = 0; i < arg.nr_devs; i++) {
		u64 dev_u64;
		ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64));
		if (ret)
			goto err;

		char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX);
		ret = PTR_ERR_OR_ZERO(dev_str);
		if (ret)
			goto err;

		ret = darray_push(&devs, dev_str);
		if (ret) {
			kfree(dev_str);
			goto err;
		}
	}

	thr = kzalloc(sizeof(*thr), GFP_KERNEL);
	if (!thr) {
		ret = -ENOMEM;
		goto err;
	}

	thr->opts = bch2_opts_empty();

	if (arg.opts) {
		char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
		ret =   PTR_ERR_OR_ZERO(optstr) ?:
			bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr);
		if (!IS_ERR(optstr))
			kfree(optstr);

		if (ret)
			goto err;
	}

	opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
	opt_set(thr->opts, read_only, 1);
	opt_set(thr->opts, ratelimit_errors, 0);

	/* We need request_key() to be called before we punt to kthread: */
	opt_set(thr->opts, nostart, true);

	bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops);

	thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts);

	/* Never panic from fsck; downgrade to going read-only */
	if (!IS_ERR(thr->c) &&
	    thr->c->opts.errors == BCH_ON_ERROR_panic)
		thr->c->opts.errors = BCH_ON_ERROR_ro;

	/* Open errors are reported by the thread fn via PTR_ERR_OR_ZERO(c) */
	ret = __bch2_run_thread_with_stdio(&thr->thr);
out:
	darray_for_each(devs, i)
		kfree(*i);
	darray_exit(&devs);
	return ret;
err:
	if (thr)
		bch2_fsck_thread_exit(&thr->thr);
	pr_err("ret %s", bch2_err_str(ret));
	goto out;
}

/*
 * Online fsck worker: rerun the fsck recovery passes on a mounted
 * filesystem, temporarily overriding c->opts.fix_errors.
 */
static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio)
{
	struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
	struct bch_fs *c = thr->c;

	c->stdio_filter = current;
	c->stdio = &thr->thr.stdio;

	/*
	 * XXX: can we figure out a way to do this without mucking with c->opts?
	 */
	unsigned old_fix_errors = c->opts.fix_errors;
	if (opt_defined(thr->opts, fix_errors))
		c->opts.fix_errors = thr->opts.fix_errors;
	else
		c->opts.fix_errors = FSCK_FIX_ask;

	c->opts.fsck = true;
	set_bit(BCH_FS_fsck_running, &c->flags);

	c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
	int ret = bch2_run_online_recovery_passes(c);

	clear_bit(BCH_FS_fsck_running, &c->flags);
	bch_err_fn(c, ret);

	c->stdio = NULL;
	c->stdio_filter = NULL;
	c->opts.fix_errors = old_fix_errors;

	/* Release what bch2_ioctl_fsck_online() took on our behalf */
	up(&c->online_fsck_mutex);
	bch2_ro_ref_put(c);
	return ret;
}

static const struct thread_with_stdio_ops bch2_online_fsck_ops = {
	.exit		= bch2_fsck_thread_exit,
	.fn		= bch2_fsck_online_thread_fn,
};

/*
 * BCH_IOCTL_FSCK_ONLINE: run fsck on a mounted filesystem. Only one online
 * fsck may run at a time (online_fsck_mutex); the thread fn drops the
 * mutex and the ro ref on completion, this function only on setup failure.
 */
long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg)
{
	struct fsck_thread *thr = NULL;
	long ret = 0;

	if (arg.flags)
		return -EINVAL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (!bch2_ro_ref_tryget(c))
		return -EROFS;

	if (down_trylock(&c->online_fsck_mutex)) {
		bch2_ro_ref_put(c);
		return -EAGAIN;
	}

	thr = kzalloc(sizeof(*thr), GFP_KERNEL);
	if (!thr) {
		ret = -ENOMEM;
		goto err;
	}

	thr->c = c;
	thr->opts = bch2_opts_empty();

	if (arg.opts) {
		char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);

		ret =   PTR_ERR_OR_ZERO(optstr) ?:
			bch2_parse_mount_opts(c, &thr->opts, NULL, optstr);
		if (!IS_ERR(optstr))
			kfree(optstr);

		if (ret)
			goto err;
	}

	ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops);
err:
	if (ret < 0) {
		bch_err_fn(c, ret);
		if (thr)
			bch2_fsck_thread_exit(&thr->thr);
		up(&c->online_fsck_mutex);
		bch2_ro_ref_put(c);
	}
	return ret;
}

#endif /* NO_BCACHEFS_CHARDEV */