// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bcachefs_ioctl.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_update.h"
#include "buckets.h"
#include "darray.h"
#include "dirent.h"
#include "error.h"
#include "fs.h"
#include "fsck.h"
#include "inode.h"
#include "io_misc.h"
#include "keylist.h"
#include "namei.h"
#include "recovery_passes.h"
#include "snapshot.h"
#include "super.h"
#include "thread_with_file.h"
#include "xattr.h"

#include <linux/bsearch.h>
#include <linux/dcache.h> /* struct qstr */

/*
 * Check that dirent @d actually points at @inode: DT_SUBVOL dirents must match
 * on bi_subvol, all others on bi_inum.
 *
 * Returns 0 on match, -BCH_ERR_ENOENT_dirent_doesnt_match_inode otherwise;
 * never logs (see dirent_points_to_inode() for the logging variant).
 */
static int dirent_points_to_inode_nowarn(struct bch_fs *c,
					 struct bkey_s_c_dirent d,
					 struct bch_inode_unpacked *inode)
{
	if (d.v->d_type == DT_SUBVOL
	    ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
	    : le64_to_cpu(d.v->d_inum) == inode->bi_inum)
		return 0;
	return bch_err_throw(c, ENOENT_dirent_doesnt_match_inode);
}

/* Format a "dirent doesn't point back at inode" message into @out. */
static void dirent_inode_mismatch_msg(struct printbuf *out,
				      struct bch_fs *c,
				      struct bkey_s_c_dirent dirent,
				      struct bch_inode_unpacked *inode)
{
	prt_str(out, "inode points to dirent that does not point back:");
	prt_newline(out);
	bch2_bkey_val_to_text(out, c, dirent.s_c);
	prt_newline(out);
	bch2_inode_unpacked_to_text(out, inode);
}

/* As dirent_points_to_inode_nowarn(), but warns on mismatch. */
static int dirent_points_to_inode(struct bch_fs *c,
				  struct bkey_s_c_dirent dirent,
				  struct bch_inode_unpacked *inode)
{
	int ret = dirent_points_to_inode_nowarn(c, dirent, inode);
	if (ret) {
		struct printbuf buf = PRINTBUF;
		dirent_inode_mismatch_msg(&buf, c, dirent, inode);
		bch_warn(c, "%s", buf.buf);
		printbuf_exit(&buf);
	}
	return ret;
}

/*
 * XXX: this is handling transaction restarts without returning
 * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
 */
static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
				    u32 snapshot)
{
	u64 sectors = 0;

	/* Sum the sizes of all allocated extents for @inum in @snapshot: */
	int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
				SPOS(inum, 0, snapshot),
				POS(inum, U64_MAX),
				0, k, ({
		if (bkey_extent_is_allocation(k.k))
			sectors += k.k->size;
		0;
	}));

	/* Error (negative) if iteration failed, otherwise the sector count: */
	return ret ?: sectors;
}

/* Count DT_DIR dirents in directory @inum at @snapshot (for nlink checks). */
static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
			      u32 snapshot)
{
	u64 subdirs = 0;

	int ret = for_each_btree_key_max(trans, iter, BTREE_ID_dirents,
				SPOS(inum, 0, snapshot),
				POS(inum, U64_MAX),
				0, k, ({
		if (k.k->type == KEY_TYPE_dirent &&
		    bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
			subdirs++;
		0;
	}));

	return ret ?: subdirs;
}

/*
 * Look up subvolume @subvol, returning its snapshot ID and root inode number.
 *
 * Note: *snapshot and *inum are written unconditionally, even when the lookup
 * failed - callers must check the return value.
 */
static int subvol_lookup(struct btree_trans *trans, u32 subvol,
			 u32 *snapshot, u64 *inum)
{
	struct bch_subvolume s;
	int ret = bch2_subvolume_get(trans, subvol, false, &s);

	*snapshot = le32_to_cpu(s.snapshot);
	*inum = le64_to_cpu(s.inode);
	return ret;
}

/*
 * Hash-table lookup of @name in directory @dir at a specific @snapshot;
 * on success returns the dirent's target inum and d_type.
 */
static int lookup_dirent_in_snapshot(struct btree_trans *trans,
				     struct bch_hash_info hash_info,
				     subvol_inum dir, struct qstr *name,
				     u64 *target, unsigned *type, u32 snapshot)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
							 &hash_info, dir, name, 0, snapshot);
	int ret = bkey_err(k);
	if (ret)
		return ret;

	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
	*target = le64_to_cpu(d.v->d_inum);
	*type = d.v->d_type;
	bch2_trans_iter_exit(trans, &iter);
	return 0;
}

/*
 * Find any subvolume associated with a tree of snapshots
 * We can't rely on master_subvol - it might have been deleted.
136 */ 137 static int find_snapshot_tree_subvol(struct btree_trans *trans, 138 u32 tree_id, u32 *subvol) 139 { 140 struct btree_iter iter; 141 struct bkey_s_c k; 142 int ret; 143 144 for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) { 145 if (k.k->type != KEY_TYPE_snapshot) 146 continue; 147 148 struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); 149 if (le32_to_cpu(s.v->tree) != tree_id) 150 continue; 151 152 if (s.v->subvol) { 153 *subvol = le32_to_cpu(s.v->subvol); 154 goto found; 155 } 156 } 157 ret = bch_err_throw(trans->c, ENOENT_no_snapshot_tree_subvol); 158 found: 159 bch2_trans_iter_exit(trans, &iter); 160 return ret; 161 } 162 163 /* Get lost+found, create if it doesn't exist: */ 164 static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, 165 struct bch_inode_unpacked *lostfound, 166 u64 reattaching_inum) 167 { 168 struct bch_fs *c = trans->c; 169 struct qstr lostfound_str = QSTR("lost+found"); 170 struct btree_iter lostfound_iter = {}; 171 u64 inum = 0; 172 unsigned d_type = 0; 173 int ret; 174 175 struct bch_snapshot_tree st; 176 ret = bch2_snapshot_tree_lookup(trans, 177 bch2_snapshot_tree(c, snapshot), &st); 178 if (ret) 179 return ret; 180 181 u32 subvolid; 182 ret = find_snapshot_tree_subvol(trans, 183 bch2_snapshot_tree(c, snapshot), &subvolid); 184 bch_err_msg(c, ret, "finding subvol associated with snapshot tree %u", 185 bch2_snapshot_tree(c, snapshot)); 186 if (ret) 187 return ret; 188 189 struct bch_subvolume subvol; 190 ret = bch2_subvolume_get(trans, subvolid, false, &subvol); 191 bch_err_msg(c, ret, "looking up subvol %u for snapshot %u", subvolid, snapshot); 192 if (ret) 193 return ret; 194 195 if (!subvol.inode) { 196 struct btree_iter iter; 197 struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter, 198 BTREE_ID_subvolumes, POS(0, subvolid), 199 0, subvolume); 200 ret = PTR_ERR_OR_ZERO(subvol); 201 if (ret) 202 return ret; 203 204 subvol->v.inode = 
cpu_to_le64(reattaching_inum); 205 bch2_trans_iter_exit(trans, &iter); 206 } 207 208 subvol_inum root_inum = { 209 .subvol = subvolid, 210 .inum = le64_to_cpu(subvol.inode) 211 }; 212 213 struct bch_inode_unpacked root_inode; 214 struct bch_hash_info root_hash_info; 215 ret = bch2_inode_find_by_inum_snapshot(trans, root_inum.inum, snapshot, &root_inode, 0); 216 bch_err_msg(c, ret, "looking up root inode %llu for subvol %u", 217 root_inum.inum, subvolid); 218 if (ret) 219 return ret; 220 221 root_hash_info = bch2_hash_info_init(c, &root_inode); 222 223 ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum, 224 &lostfound_str, &inum, &d_type, snapshot); 225 if (bch2_err_matches(ret, ENOENT)) 226 goto create_lostfound; 227 228 bch_err_fn(c, ret); 229 if (ret) 230 return ret; 231 232 if (d_type != DT_DIR) { 233 bch_err(c, "error looking up lost+found: not a directory"); 234 return bch_err_throw(c, ENOENT_not_directory); 235 } 236 237 /* 238 * The bch2_check_dirents pass has already run, dangling dirents 239 * shouldn't exist here: 240 */ 241 ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, lostfound, 0); 242 bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)", 243 inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot)); 244 return ret; 245 246 create_lostfound: 247 /* 248 * we always create lost+found in the root snapshot; we don't want 249 * different branches of the snapshot tree to have different lost+found 250 */ 251 snapshot = le32_to_cpu(st.root_snapshot); 252 /* 253 * XXX: we could have a nicer log message here if we had a nice way to 254 * walk backpointers to print a path 255 */ 256 struct printbuf path = PRINTBUF; 257 ret = bch2_inum_to_path(trans, root_inum, &path); 258 if (ret) 259 goto err; 260 261 bch_notice(c, "creating %s/lost+found in subvol %llu snapshot %u", 262 path.buf, root_inum.subvol, snapshot); 263 printbuf_exit(&path); 264 265 u64 now = bch2_current_time(c); 266 u64 cpu = 
raw_smp_processor_id(); 267 268 bch2_inode_init_early(c, lostfound); 269 bch2_inode_init_late(c, lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode); 270 lostfound->bi_dir = root_inode.bi_inum; 271 lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot); 272 273 root_inode.bi_nlink++; 274 275 ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu); 276 if (ret) 277 goto err; 278 279 bch2_btree_iter_set_snapshot(trans, &lostfound_iter, snapshot); 280 ret = bch2_btree_iter_traverse(trans, &lostfound_iter); 281 if (ret) 282 goto err; 283 284 ret = bch2_dirent_create_snapshot(trans, 285 0, root_inode.bi_inum, snapshot, &root_hash_info, 286 mode_to_type(lostfound->bi_mode), 287 &lostfound_str, 288 lostfound->bi_inum, 289 &lostfound->bi_dir_offset, 290 BTREE_UPDATE_internal_snapshot_node| 291 STR_HASH_must_create) ?: 292 bch2_inode_write_flags(trans, &lostfound_iter, lostfound, 293 BTREE_UPDATE_internal_snapshot_node); 294 err: 295 bch_err_msg(c, ret, "creating lost+found"); 296 bch2_trans_iter_exit(trans, &lostfound_iter); 297 return ret; 298 } 299 300 static inline bool inode_should_reattach(struct bch_inode_unpacked *inode) 301 { 302 if (inode->bi_inum == BCACHEFS_ROOT_INO && 303 inode->bi_subvol == BCACHEFS_ROOT_SUBVOL) 304 return false; 305 306 /* 307 * Subvolume roots are special: older versions of subvolume roots may be 308 * disconnected, it's only the newest version that matters. 309 * 310 * We only keep a single dirent pointing to a subvolume root, i.e. 311 * older versions of snapshots will not have a different dirent pointing 312 * to the same subvolume root. 313 * 314 * This is because dirents that point to subvolumes are only visible in 315 * the parent subvolume - versioning is not needed - and keeping them 316 * around would break fsck, because when we're crossing subvolumes we 317 * don't have a consistent snapshot ID to do check the inode <-> dirent 318 * relationships. 
	 *
	 * Thus, a subvolume root that's been renamed after a snapshot will have
	 * a disconnected older version - that's expected.
	 *
	 * Note that taking a snapshot always updates the root inode (to update
	 * the dirent backpointer), so a subvolume root inode with
	 * BCH_INODE_has_child_snapshot is never visible.
	 */
	if (inode->bi_subvol &&
	    (inode->bi_flags & BCH_INODE_has_child_snapshot))
		return false;

	return !bch2_inode_has_backpointer(inode) &&
		!(inode->bi_flags & BCH_INODE_unlinked);
}

/*
 * Delete the dirent at @d_pos as seen from @snapshot - but only if the key we
 * find is at exactly @d_pos (i.e. not just an ancestor-snapshot version);
 * deletion is done by inserting a whiteout.
 */
static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents,
					       SPOS(d_pos.inode, d_pos.offset, snapshot),
					       BTREE_ITER_intent|
					       BTREE_ITER_with_updates);
	int ret = bkey_err(k);
	if (ret)
		return ret;

	if (bpos_eq(k.k->p, d_pos)) {
		/*
		 * delete_at() doesn't work because the update path doesn't
		 * internally use BTREE_ITER_with_updates yet
		 */
		struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
		ret = PTR_ERR_OR_ZERO(k);
		if (ret)
			goto err;

		bkey_init(&k->k);
		k->k.type = KEY_TYPE_whiteout;
		k->k.p = iter.pos;
		ret = bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node);
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Reattach @inode under lost+found: create a dirent named after the inode
 * number (or "subvol-N" for subvolume roots), update the inode's backpointer
 * fields, and - for subvolume roots - reparent the subvolume to the root
 * subvolume.  Also fixes up versions of the inode in child snapshots.
 */
static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
{
	struct bch_fs *c = trans->c;
	struct bch_inode_unpacked lostfound;
	char name_buf[20];
	int ret;

	u32 dirent_snapshot = inode->bi_snapshot;
	if (inode->bi_subvol) {
		/* Subvolume root: reparent the subvolume to the root subvol */
		inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;

		struct btree_iter subvol_iter;
		struct bkey_i_subvolume *subvol =
			bch2_bkey_get_mut_typed(trans, &subvol_iter,
						BTREE_ID_subvolumes, POS(0, inode->bi_subvol),
						0, subvolume);
		ret = PTR_ERR_OR_ZERO(subvol);
		if (ret)
			return ret;

		subvol->v.fs_path_parent = BCACHEFS_ROOT_SUBVOL;
		bch2_trans_iter_exit(trans, &subvol_iter);

		/* The dirent lives in the parent subvol's snapshot: */
		u64 root_inum;
		ret = subvol_lookup(trans, inode->bi_parent_subvol,
				    &dirent_snapshot, &root_inum);
		if (ret)
			return ret;

		snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
	} else {
		snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
	}

	ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
	if (ret)
		return ret;

	bch_verbose(c, "got lostfound inum %llu", lostfound.bi_inum);

	/* Directories bump lost+found's link count: */
	lostfound.bi_nlink += S_ISDIR(inode->bi_mode);

	/* ensure lost+found inode is also present in inode snapshot */
	if (!inode->bi_subvol) {
		BUG_ON(!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, lostfound.bi_snapshot));
		lostfound.bi_snapshot = inode->bi_snapshot;
	}

	ret = __bch2_fsck_write_inode(trans, &lostfound);
	if (ret)
		return ret;

	struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound);
	struct qstr name = QSTR(name_buf);

	inode->bi_dir = lostfound.bi_inum;

	ret = bch2_dirent_create_snapshot(trans,
			inode->bi_parent_subvol, lostfound.bi_inum,
			dirent_snapshot,
			&dir_hash,
			inode_d_type(inode),
			&name,
			inode->bi_subvol ?: inode->bi_inum,
			&inode->bi_dir_offset,
			BTREE_UPDATE_internal_snapshot_node|
			STR_HASH_must_create);
	if (ret) {
		bch_err_msg(c, ret, "error creating dirent");
		return ret;
	}

	ret = __bch2_fsck_write_inode(trans, inode);
	if (ret)
		return ret;

	{
		CLASS(printbuf, buf)();
		ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum,
						 inode->bi_snapshot, NULL, &buf);
		if (ret)
			return ret;

		bch_info(c, "reattached at %s", buf.buf);
	}

	/*
	 * Fix up inodes in child snapshots: if they should also be reattached
	 * update the backpointer field, if they should not be we need to emit
	 * whiteouts for the dirent we just created.
	 */
	if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) {
		snapshot_id_list whiteouts_done;
		struct btree_iter iter;
		struct bkey_s_c k;

		darray_init(&whiteouts_done);

		for_each_btree_key_reverse_norestart(trans, iter,
				BTREE_ID_inodes, SPOS(0, inode->bi_inum, inode->bi_snapshot - 1),
				BTREE_ITER_all_snapshots|BTREE_ITER_intent, k, ret) {
			if (k.k->p.offset != inode->bi_inum)
				break;

			/* Only descendant-snapshot versions we haven't handled: */
			if (!bkey_is_inode(k.k) ||
			    !bch2_snapshot_is_ancestor(c, k.k->p.snapshot, inode->bi_snapshot) ||
			    snapshot_list_has_ancestor(c, &whiteouts_done, k.k->p.snapshot))
				continue;

			struct bch_inode_unpacked child_inode;
			ret = bch2_inode_unpack(k, &child_inode);
			if (ret)
				break;

			if (!inode_should_reattach(&child_inode)) {
				/* Hide the new dirent from this snapshot: */
				ret = maybe_delete_dirent(trans,
							  SPOS(lostfound.bi_inum, inode->bi_dir_offset,
							       dirent_snapshot),
							  k.k->p.snapshot);
				if (ret)
					break;

				ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot);
				if (ret)
					break;
			} else {
				/* Point the child version at the new dirent too: */
				iter.snapshot = k.k->p.snapshot;
				child_inode.bi_dir = inode->bi_dir;
				child_inode.bi_dir_offset = inode->bi_dir_offset;

				ret = bch2_inode_write_flags(trans, &iter, &child_inode,
							     BTREE_UPDATE_internal_snapshot_node);
				if (ret)
					break;
			}
		}
		darray_exit(&whiteouts_done);
		bch2_trans_iter_exit(trans, &iter);
	}

	return ret;
}

static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
						struct btree_iter *iter,
						struct bpos pos)
{
	return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
}

/*
 * Remove the dirent that @inode's backpointer fields (bi_dir/bi_dir_offset)
 * point at, after verifying it points back at the inode.
 */
static int remove_backpointer(struct btree_trans *trans,
			      struct bch_inode_unpacked *inode)
{
	if (!bch2_inode_has_backpointer(inode))
		return 0;

	u32 snapshot = inode->bi_snapshot;

	if
 (inode->bi_parent_subvol) {
		/* Dirent lives in the parent subvolume's snapshot: */
		int ret = bch2_subvolume_get_snapshot(trans, inode->bi_parent_subvol, &snapshot);
		if (ret)
			return ret;
	}

	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c_dirent d = dirent_get_by_pos(trans, &iter,
				SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot));
	int ret = bkey_err(d) ?:
		dirent_points_to_inode(c, d, inode) ?:
		bch2_fsck_remove_dirent(trans, d.k->p);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Reattach the root inode of subvolume @s to lost+found: drop its current
 * dirent backpointer (if any), then reattach.
 */
static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s)
{
	struct bch_fs *c = trans->c;

	struct bch_inode_unpacked inode;
	int ret = bch2_inode_find_by_inum_trans(trans,
				(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
				&inode);
	if (ret)
		return ret;

	ret = remove_backpointer(trans, &inode);
	if (!bch2_err_matches(ret, ENOENT))
		bch_err_msg(c, ret, "removing dirent");
	if (ret)
		return ret;

	ret = reattach_inode(trans, &inode);
	bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
	return ret;
}

/*
 * Recreate a missing subvolume keyed at @subvolid, pointing at @snapshotid;
 * if @inum is 0 a new root inode is created as well.  Also marks the snapshot
 * as belonging to this subvolume and repairs the snapshot tree's
 * master_subvol if unset.
 */
static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
{
	struct bch_fs *c = trans->c;

	if (!bch2_snapshot_is_leaf(c, snapshotid)) {
		bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
		return bch_err_throw(c, fsck_repair_unimplemented);
	}

	/*
	 * If inum isn't set, that means we're being called from check_dirents,
	 * not check_inodes - the root of this subvolume doesn't exist or we
	 * would have found it there:
	 */
	if (!inum) {
		struct btree_iter inode_iter = {};
		struct bch_inode_unpacked new_inode;
		u64 cpu = raw_smp_processor_id();

		bch2_inode_init_early(c, &new_inode);
		bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);

		new_inode.bi_subvol = subvolid;

		int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
			  bch2_btree_iter_traverse(trans, &inode_iter) ?:
			  bch2_inode_write(trans, &inode_iter, &new_inode);
		bch2_trans_iter_exit(trans, &inode_iter);
		if (ret)
			return ret;

		inum = new_inode.bi_inum;
	}

	bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);

	struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
	int ret = PTR_ERR_OR_ZERO(new_subvol);
	if (ret)
		return ret;

	bkey_subvolume_init(&new_subvol->k_i);
	new_subvol->k.p.offset = subvolid;
	new_subvol->v.snapshot = cpu_to_le32(snapshotid);
	new_subvol->v.inode = cpu_to_le64(inum);
	ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
	if (ret)
		return ret;

	/* Mark the snapshot as owned by this subvolume: */
	struct btree_iter iter;
	struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_snapshots, POS(0, snapshotid),
			0, snapshot);
	ret = PTR_ERR_OR_ZERO(s);
	bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
	if (ret)
		return ret;

	u32 snapshot_tree = le32_to_cpu(s->v.tree);

	s->v.subvol = cpu_to_le32(subvolid);
	SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
	bch2_trans_iter_exit(trans, &iter);

	struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
			0, snapshot_tree);
	ret = PTR_ERR_OR_ZERO(st);
	bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
	if (ret)
		return ret;

	if (!st->v.master_subvol)
		st->v.master_subvol = cpu_to_le32(subvolid);

	bch2_trans_iter_exit(trans, &iter);
	return 0;
}

/*
 * Recreate a missing inode for (inum, snapshot), inferring its type from the
 * btree its keys were found in: extents -> regular file with i_size taken
 * from the last extent, dirents -> directory, xattrs -> regular file.
 */
static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum)
{
	struct bch_fs *c = trans->c;
	unsigned i_mode = S_IFREG;
	u64 i_size = 0;

	switch (btree) {
	case BTREE_ID_extents: {
		struct btree_iter iter = {};

		bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
		struct bkey_s_c k = bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum, 0));
		bch2_trans_iter_exit(trans, &iter);
		int ret = bkey_err(k);
		if (ret)
			return ret;

		/* extent positions are in 512-byte sectors */
		i_size = k.k->p.offset << 9;
		break;
	}
	case BTREE_ID_dirents:
		i_mode = S_IFDIR;
		break;
	case BTREE_ID_xattrs:
		break;
	default:
		BUG();
	}

	struct bch_inode_unpacked new_inode;
	bch2_inode_init_early(c, &new_inode);
	bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL);
	new_inode.bi_size = i_size;
	new_inode.bi_inum = inum;
	new_inode.bi_snapshot = snapshot;

	return __bch2_fsck_write_inode(trans, &new_inode);
}

static inline void snapshots_seen_exit(struct snapshots_seen *s)
{
	darray_exit(&s->ids);
}

static inline void snapshots_seen_init(struct snapshots_seen *s)
{
	memset(s, 0, sizeof(*s));
}

/* Insert @id into @s->ids keeping the list sorted; no-op if already present. */
static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
{
	u32 *i;
	__darray_for_each(s->ids, i) {
		if (*i == id)
			return 0;
		if (*i > id)
			break;
	}

	int ret = darray_insert_item(&s->ids, i - s->ids.data, id);
	if (ret)
		bch_err(c, "error reallocating snapshots_seen table (size %zu)",
			s->ids.size);
	return ret;
}

/* Record that we saw @pos; the seen list is reset when the position changes. */
static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
				 enum btree_id btree_id, struct bpos pos)
{
	if (!bkey_eq(s->pos, pos))
		s->ids.nr = 0;
	s->pos = pos;

	return snapshot_list_add_nodup(c, &s->ids, pos.snapshot);
}

/**
 * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
 * and @ancestor hasn't been overwritten in @seen
 *
 * @c: filesystem handle
 * @seen: list of snapshot ids already seen at current position
 * @id: descendent
snapshot id
 * @ancestor: ancestor snapshot id
 *
 * Returns: whether key in @ancestor snapshot is visible in @id snapshot
 */
static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
				    u32 id, u32 ancestor)
{
	EBUG_ON(id > ancestor);

	if (id == ancestor)
		return true;

	if (!bch2_snapshot_is_ancestor(c, id, ancestor))
		return false;

	/*
	 * We know that @id is a descendant of @ancestor, we're checking if
	 * we've seen a key that overwrote @ancestor - i.e. also a descendent of
	 * @ancestor and with @id as a descendent.
	 *
	 * But we already know that we're scanning IDs between @id and @ancestor
	 * numerically, since snapshot ID lists are kept sorted, so if we find
	 * an id that's an ancestor of @id we're done:
	 */
	darray_for_each_reverse(seen->ids, i)
		if (*i != ancestor && bch2_snapshot_is_ancestor(c, id, *i))
			return false;

	return true;
}

/**
 * ref_visible - given a key with snapshot id @src that points to a key with
 * snapshot id @dst, test whether there is some snapshot in which @dst is
 * visible.
 *
 * @c: filesystem handle
 * @s: list of snapshot IDs already seen at @src
 * @src: snapshot ID of src key
 * @dst: snapshot ID of dst key
 * Returns: true if there is some snapshot in which @dst is visible
 *
 * Assumes we're visiting @src keys in natural key order
 */
static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
			u32 src, u32 dst)
{
	return dst <= src
		? key_visible_in_snapshot(c, s, dst, src)
		: bch2_snapshot_is_ancestor(c, src, dst);
}

/* Two-sided ref_visible(): normalizes argument order so dst <= src. */
static int ref_visible2(struct bch_fs *c,
			u32 src, struct snapshots_seen *src_seen,
			u32 dst, struct snapshots_seen *dst_seen)
{
	if (dst > src) {
		swap(dst, src);
		swap(dst_seen, src_seen);
	}
	return key_visible_in_snapshot(c, src_seen, dst, src);
}

/* Iterate versions of an inode in @_w visible from snapshot @_snapshot: */
#define for_each_visible_inode(_c, _s, _w, _snapshot, _i)				\
	for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr &&	\
	     (_i)->inode.bi_snapshot <= (_snapshot); _i++)				\
		if (key_visible_in_snapshot(_c, _s, _i->inode.bi_snapshot, _snapshot))

struct inode_walker_entry {
	struct bch_inode_unpacked inode;
	/* true if this snapshot has a whiteout instead of an inode */
	bool whiteout;
	/* running tally (sectors or subdir count, depending on the check) */
	u64 count;
	u64 i_size;
};

struct inode_walker {
	bool first_this_inode;
	bool have_inodes;
	bool recalculate_sums;
	struct bpos last_pos;

	/* all snapshot versions of the current inode number, sorted */
	DARRAY(struct inode_walker_entry) inodes;
	snapshot_id_list deletes;
};

static void inode_walker_exit(struct inode_walker *w)
{
	darray_exit(&w->inodes);
	darray_exit(&w->deletes);
}

static struct inode_walker inode_walker_init(void)
{
	return (struct inode_walker) { 0, };
}

/*
 * Append @inode (or a whiteout placeholder) to the walker's inode list.
 */
static int add_inode(struct bch_fs *c, struct inode_walker *w,
		     struct bkey_s_c inode)
{
	int ret = darray_push(&w->inodes, ((struct inode_walker_entry) {
		.whiteout = !bkey_is_inode(inode.k),
	}));
	if (ret)
		return ret;

	struct inode_walker_entry *n = &darray_last(w->inodes);
	if (!n->whiteout) {
		return bch2_inode_unpack(inode, &n->inode);
	} else {
		/* whiteouts only carry position, not an unpackable value */
		n->inode.bi_inum = inode.k->p.offset;
		n->inode.bi_snapshot = inode.k->p.snapshot;
		return 0;
	}
}

static int get_inodes_all_snapshots(struct btree_trans *trans,
				    struct inode_walker *w, u64 inum)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	/*
	 * We
 no longer have inodes for w->last_pos; clear this to avoid
	 * screwing up check_i_sectors/check_subdir_count if we take a
	 * transaction restart here:
	 */
	w->have_inodes = false;
	w->recalculate_sums = false;
	w->inodes.nr = 0;

	for_each_btree_key_max_norestart(trans, iter,
			BTREE_ID_inodes, POS(0, inum), SPOS(0, inum, U32_MAX),
			BTREE_ITER_all_snapshots, k, ret) {
		ret = add_inode(c, w, k);
		if (ret)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		return ret;

	w->first_this_inode = true;
	w->have_inodes = true;
	return 0;
}

/*
 * Collect the versions of inode @inum visible from the snapshots recorded in
 * @s; snapshots in which the inode was deleted are added to w->deletes.
 */
static int get_visible_inodes(struct btree_trans *trans,
			      struct inode_walker *w,
			      struct snapshots_seen *s,
			      u64 inum)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	w->inodes.nr = 0;
	w->deletes.nr = 0;

	for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot),
					     BTREE_ITER_all_snapshots, k, ret) {
		if (k.k->p.offset != inum)
			break;

		if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot))
			continue;

		/* already covered by a deletion in a closer snapshot */
		if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot))
			continue;

		ret = bkey_is_inode(k.k)
			? add_inode(c, w, k)
			: snapshot_list_add(c, &w->deletes, k.k->p.snapshot);
		if (ret)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}

/*
 * Find the walker entry for the inode version that key @k belongs to (the
 * nearest ancestor snapshot).  If the key is in a strictly newer snapshot
 * than the inode, that's a filesystem inconsistency: repair by copying the
 * inode (or whiteout) into the key's snapshot, then restart.
 */
static struct inode_walker_entry *
lookup_inode_for_snapshot(struct btree_trans *trans, struct inode_walker *w, struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;

	struct inode_walker_entry *i = darray_find_p(w->inodes, i,
		bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->inode.bi_snapshot));

	if (!i)
		return NULL;

	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (fsck_err_on(k.k->p.snapshot != i->inode.bi_snapshot,
			trans, snapshot_key_missing_inode_snapshot,
			"have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
			"unexpected because we should always update the inode when we update a key in that inode\n"
			"%s",
			w->last_pos.inode, k.k->p.snapshot, i->inode.bi_snapshot,
			(bch2_bkey_val_to_text(&buf, c, k),
			 buf.buf))) {
		if (!i->whiteout) {
			/* copy the ancestor inode into the key's snapshot */
			struct bch_inode_unpacked new = i->inode;
			new.bi_snapshot = k.k->p.snapshot;
			ret = __bch2_fsck_write_inode(trans, &new);
		} else {
			/* propagate the whiteout into the key's snapshot */
			struct bkey_i whiteout;
			bkey_init(&whiteout.k);
			whiteout.k.type = KEY_TYPE_whiteout;
			whiteout.k.p = SPOS(0, i->inode.bi_inum, k.k->p.snapshot);
			ret = bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
							  &whiteout,
							  BTREE_UPDATE_internal_snapshot_node);
		}

		if (ret)
			goto fsck_err;

		ret = bch2_trans_commit(trans, NULL, NULL, 0);
		if (ret)
			goto fsck_err;

		/* mirror the repair in the in-memory walker list, kept sorted */
		struct inode_walker_entry new_entry = *i;

		new_entry.inode.bi_snapshot = k.k->p.snapshot;
		new_entry.count = 0;
		new_entry.i_size = 0;

		while (i > w->inodes.data && i[-1].inode.bi_snapshot > k.k->p.snapshot)
			--i;

		size_t pos = i - w->inodes.data;
		ret = darray_insert_item(&w->inodes, pos, new_entry);
		if (ret)
			goto fsck_err;

		ret = bch_err_throw(c, transaction_restart_nested);
		goto fsck_err;
	}

	printbuf_exit(&buf);
	return i;
fsck_err:
	printbuf_exit(&buf);
	return ERR_PTR(ret);
}

/*
 * Return the walker entry for the inode version that @k belongs to,
 * refreshing the walker when we move to a new inode number.
 */
static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
					     struct inode_walker *w,
					     struct bkey_s_c k)
{
	if (w->last_pos.inode != k.k->p.inode) {
		int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
		if (ret)
			return ERR_PTR(ret);
	}

	w->last_pos = k.k->p;

	return lookup_inode_for_snapshot(trans, w, k);
}

/*
 * Prefer to delete the first one, since that will be the one at the wrong
 * offset:
 * return value: 0 -> delete k1, 1 -> delete k2
 *
 * NOTE(review): the comment above appears to describe duplicate-hash-key
 * handling, not this function - possibly stale; confirm against history.
 */
int bch2_fsck_update_backpointers(struct btree_trans *trans,
				  struct snapshots_seen *s,
				  const struct bch_hash_desc desc,
				  struct bch_hash_info *hash_info,
				  struct bkey_i *new)
{
	if (new->k.type != KEY_TYPE_dirent)
		return 0;

	struct bkey_i_dirent *d = bkey_i_to_dirent(new);
	struct inode_walker target = inode_walker_init();
	int ret = 0;

	if (d->v.d_type == DT_SUBVOL) {
		bch_err(trans->c, "%s does not support DT_SUBVOL", __func__);
		ret = -BCH_ERR_fsck_repair_unimplemented;
	} else {
		ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum));
		if (ret)
			goto err;

		/* point every visible target inode at the dirent's new offset */
		darray_for_each(target.inodes, i) {
			i->inode.bi_dir_offset = d->k.p.offset;
			ret = __bch2_fsck_write_inode(trans, &i->inode);
			if (ret)
				goto err;
		}
	}
err:
	inode_walker_exit(&target);
	return ret;
}

/*
 * Get the dirent that @inode's backpointer fields point at; *snapshot is
 * updated to the parent subvolume's snapshot for subvolume roots.
 */
static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
					       struct btree_iter *iter,
					       struct bch_inode_unpacked *inode,
					       u32 *snapshot)
{
	if (inode->bi_subvol) {
		u64 inum;
		int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
		if (ret)
			return ((struct bkey_s_c_dirent) { .k =
 ERR_PTR(ret) });
	}

	return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
}

/* Returns 1 if @p is on the deleted-inodes list, 0 if not, negative on error. */
static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
	int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Check that the dirent @inode's backpointer fields point at exists and
 * points back at the inode; if not, clear the backpointer fields and set
 * *write_inode so the caller rewrites the inode.
 */
static int check_inode_dirent_inode(struct btree_trans *trans,
				    struct bch_inode_unpacked *inode,
				    bool *write_inode)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;

	u32 inode_snapshot = inode->bi_snapshot;
	struct btree_iter dirent_iter = {};
	struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
	int ret = bkey_err(d);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if ((ret || dirent_points_to_inode_nowarn(c, d, inode)) &&
	    inode->bi_subvol &&
	    (inode->bi_flags & BCH_INODE_has_child_snapshot)) {
		/* Older version of a renamed subvolume root: we won't have a
		 * correct dirent for it. That's expected, see
		 * inode_should_reattach().
		 *
		 * We don't clear the backpointer field when doing the rename
		 * because there might be arbitrarily many versions in older
		 * snapshots.
		 */
		inode->bi_dir = 0;
		inode->bi_dir_offset = 0;
		*write_inode = true;
		goto out;
	}

	if (fsck_err_on(ret,
			trans, inode_points_to_missing_dirent,
			"inode points to missing dirent\n%s",
			(bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) ||
	    fsck_err_on(!ret && dirent_points_to_inode_nowarn(c, d, inode),
			trans, inode_points_to_wrong_dirent,
			"%s",
			(printbuf_reset(&buf),
			 dirent_inode_mismatch_msg(&buf, c, d, inode),
			 buf.buf))) {
		/*
		 * We just clear the backpointer fields for now.
If we find a 1097 * dirent that points to this inode in check_dirents(), we'll 1098 * update it then; then when we get to check_path() if the 1099 * backpointer is still 0 we'll reattach it. 1100 */ 1101 inode->bi_dir = 0; 1102 inode->bi_dir_offset = 0; 1103 *write_inode = true; 1104 } 1105 out: 1106 ret = 0; 1107 fsck_err: 1108 bch2_trans_iter_exit(trans, &dirent_iter); 1109 printbuf_exit(&buf); 1110 bch_err_fn(c, ret); 1111 return ret; 1112 } 1113 1114 static int check_inode(struct btree_trans *trans, 1115 struct btree_iter *iter, 1116 struct bkey_s_c k, 1117 struct bch_inode_unpacked *snapshot_root, 1118 struct snapshots_seen *s) 1119 { 1120 struct bch_fs *c = trans->c; 1121 struct printbuf buf = PRINTBUF; 1122 struct bch_inode_unpacked u; 1123 bool do_update = false; 1124 int ret; 1125 1126 ret = bch2_check_key_has_snapshot(trans, iter, k); 1127 if (ret < 0) 1128 goto err; 1129 if (ret) 1130 return 0; 1131 1132 ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); 1133 if (ret) 1134 goto err; 1135 1136 if (!bkey_is_inode(k.k)) 1137 return 0; 1138 1139 ret = bch2_inode_unpack(k, &u); 1140 if (ret) 1141 goto err; 1142 1143 if (snapshot_root->bi_inum != u.bi_inum) { 1144 ret = bch2_inode_find_snapshot_root(trans, u.bi_inum, snapshot_root); 1145 if (ret) 1146 goto err; 1147 } 1148 1149 if (u.bi_hash_seed != snapshot_root->bi_hash_seed || 1150 INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root)) { 1151 ret = bch2_repair_inode_hash_info(trans, snapshot_root); 1152 BUG_ON(ret == -BCH_ERR_fsck_repair_unimplemented); 1153 if (ret) 1154 goto err; 1155 } 1156 1157 ret = bch2_check_inode_has_case_insensitive(trans, &u, &s->ids, &do_update); 1158 if (ret) 1159 goto err; 1160 1161 if (bch2_inode_has_backpointer(&u)) { 1162 ret = check_inode_dirent_inode(trans, &u, &do_update); 1163 if (ret) 1164 goto err; 1165 } 1166 1167 if (fsck_err_on(bch2_inode_has_backpointer(&u) && 1168 (u.bi_flags & BCH_INODE_unlinked), 1169 trans, inode_unlinked_but_has_dirent, 1170 "inode 
unlinked but has dirent\n%s", 1171 (printbuf_reset(&buf), 1172 bch2_inode_unpacked_to_text(&buf, &u), 1173 buf.buf))) { 1174 u.bi_flags &= ~BCH_INODE_unlinked; 1175 do_update = true; 1176 } 1177 1178 if (S_ISDIR(u.bi_mode) && (u.bi_flags & BCH_INODE_unlinked)) { 1179 /* Check for this early so that check_unreachable_inode() will reattach it */ 1180 1181 ret = bch2_empty_dir_snapshot(trans, k.k->p.offset, 0, k.k->p.snapshot); 1182 if (ret && ret != -BCH_ERR_ENOTEMPTY_dir_not_empty) 1183 goto err; 1184 1185 fsck_err_on(ret, trans, inode_dir_unlinked_but_not_empty, 1186 "dir unlinked but not empty\n%s", 1187 (printbuf_reset(&buf), 1188 bch2_inode_unpacked_to_text(&buf, &u), 1189 buf.buf)); 1190 u.bi_flags &= ~BCH_INODE_unlinked; 1191 do_update = true; 1192 ret = 0; 1193 } 1194 1195 if (fsck_err_on(S_ISDIR(u.bi_mode) && u.bi_size, 1196 trans, inode_dir_has_nonzero_i_size, 1197 "directory %llu:%u with nonzero i_size %lli", 1198 u.bi_inum, u.bi_snapshot, u.bi_size)) { 1199 u.bi_size = 0; 1200 do_update = true; 1201 } 1202 1203 ret = bch2_inode_has_child_snapshots(trans, k.k->p); 1204 if (ret < 0) 1205 goto err; 1206 1207 if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot), 1208 trans, inode_has_child_snapshots_wrong, 1209 "inode has_child_snapshots flag wrong (should be %u)\n%s", 1210 ret, 1211 (printbuf_reset(&buf), 1212 bch2_inode_unpacked_to_text(&buf, &u), 1213 buf.buf))) { 1214 if (ret) 1215 u.bi_flags |= BCH_INODE_has_child_snapshot; 1216 else 1217 u.bi_flags &= ~BCH_INODE_has_child_snapshot; 1218 do_update = true; 1219 } 1220 ret = 0; 1221 1222 if ((u.bi_flags & BCH_INODE_unlinked) && 1223 !(u.bi_flags & BCH_INODE_has_child_snapshot)) { 1224 if (!test_bit(BCH_FS_started, &c->flags)) { 1225 /* 1226 * If we're not in online fsck, don't delete unlinked 1227 * inodes, just make sure they're on the deleted list. 1228 * 1229 * They might be referred to by a logged operation - 1230 * i.e. 
we might have crashed in the middle of a 1231 * truncate on an unlinked but open file - so we want to 1232 * let the delete_dead_inodes kill it after resuming 1233 * logged ops. 1234 */ 1235 ret = check_inode_deleted_list(trans, k.k->p); 1236 if (ret < 0) 1237 goto err_noprint; 1238 1239 fsck_err_on(!ret, 1240 trans, unlinked_inode_not_on_deleted_list, 1241 "inode %llu:%u unlinked, but not on deleted list", 1242 u.bi_inum, k.k->p.snapshot); 1243 1244 ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, k.k->p, 1); 1245 if (ret) 1246 goto err; 1247 } else { 1248 ret = bch2_inode_or_descendents_is_open(trans, k.k->p); 1249 if (ret < 0) 1250 goto err; 1251 1252 if (fsck_err_on(!ret, 1253 trans, inode_unlinked_and_not_open, 1254 "inode %llu:%u unlinked and not open", 1255 u.bi_inum, u.bi_snapshot)) { 1256 ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); 1257 bch_err_msg(c, ret, "in fsck deleting inode"); 1258 goto err_noprint; 1259 } 1260 ret = 0; 1261 } 1262 } 1263 1264 if (fsck_err_on(u.bi_parent_subvol && 1265 (u.bi_subvol == 0 || 1266 u.bi_subvol == BCACHEFS_ROOT_SUBVOL), 1267 trans, inode_bi_parent_nonzero, 1268 "inode %llu:%u has subvol %u but nonzero parent subvol %u", 1269 u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) { 1270 u.bi_parent_subvol = 0; 1271 do_update = true; 1272 } 1273 1274 if (u.bi_subvol) { 1275 struct bch_subvolume s; 1276 1277 ret = bch2_subvolume_get(trans, u.bi_subvol, false, &s); 1278 if (ret && !bch2_err_matches(ret, ENOENT)) 1279 goto err; 1280 1281 if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) { 1282 ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum); 1283 goto do_update; 1284 } 1285 1286 if (fsck_err_on(ret, 1287 trans, inode_bi_subvol_missing, 1288 "inode %llu:%u bi_subvol points to missing subvolume %u", 1289 u.bi_inum, k.k->p.snapshot, u.bi_subvol) || 1290 fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum || 1291 !bch2_snapshot_is_ancestor(c, 
le32_to_cpu(s.snapshot), 1292 k.k->p.snapshot), 1293 trans, inode_bi_subvol_wrong, 1294 "inode %llu:%u points to subvol %u, but subvol points to %llu:%u", 1295 u.bi_inum, k.k->p.snapshot, u.bi_subvol, 1296 le64_to_cpu(s.inode), 1297 le32_to_cpu(s.snapshot))) { 1298 u.bi_subvol = 0; 1299 u.bi_parent_subvol = 0; 1300 do_update = true; 1301 } 1302 } 1303 1304 if (fsck_err_on(u.bi_journal_seq > journal_cur_seq(&c->journal), 1305 trans, inode_journal_seq_in_future, 1306 "inode journal seq in future (currently at %llu)\n%s", 1307 journal_cur_seq(&c->journal), 1308 (printbuf_reset(&buf), 1309 bch2_inode_unpacked_to_text(&buf, &u), 1310 buf.buf))) { 1311 u.bi_journal_seq = journal_cur_seq(&c->journal); 1312 do_update = true; 1313 } 1314 do_update: 1315 if (do_update) { 1316 ret = __bch2_fsck_write_inode(trans, &u); 1317 bch_err_msg(c, ret, "in fsck updating inode"); 1318 if (ret) 1319 goto err_noprint; 1320 } 1321 err: 1322 fsck_err: 1323 bch_err_fn(c, ret); 1324 err_noprint: 1325 printbuf_exit(&buf); 1326 return ret; 1327 } 1328 1329 int bch2_check_inodes(struct bch_fs *c) 1330 { 1331 struct bch_inode_unpacked snapshot_root = {}; 1332 struct snapshots_seen s; 1333 1334 snapshots_seen_init(&s); 1335 1336 int ret = bch2_trans_run(c, 1337 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, 1338 POS_MIN, 1339 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, 1340 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 1341 check_inode(trans, &iter, k, &snapshot_root, &s))); 1342 1343 snapshots_seen_exit(&s); 1344 bch_err_fn(c, ret); 1345 return ret; 1346 } 1347 1348 static int find_oldest_inode_needs_reattach(struct btree_trans *trans, 1349 struct bch_inode_unpacked *inode) 1350 { 1351 struct bch_fs *c = trans->c; 1352 struct btree_iter iter; 1353 struct bkey_s_c k; 1354 int ret = 0; 1355 1356 /* 1357 * We look for inodes to reattach in natural key order, leaves first, 1358 * but we should do the reattach at the oldest version that needs to be 1359 * reattached: 1360 */ 1361 
for_each_btree_key_norestart(trans, iter,
				     BTREE_ID_inodes,
				     SPOS(0, inode->bi_inum, inode->bi_snapshot + 1),
				     BTREE_ITER_all_snapshots, k, ret) {
		if (k.k->p.offset != inode->bi_inum)
			break;

		/* only consider versions in ancestor snapshots of ours */
		if (!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, k.k->p.snapshot))
			continue;

		if (!bkey_is_inode(k.k))
			break;

		struct bch_inode_unpacked parent_inode;
		ret = bch2_inode_unpack(k, &parent_inode);
		if (ret)
			break;

		if (!inode_should_reattach(&parent_inode))
			break;

		/* found an older version that also needs reattaching */
		*inode = parent_inode;
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}

/*
 * If @k is an inode that should be reattached, walk back to the oldest
 * snapshot version needing reattachment and (with user confirmation via
 * fsck_err) reattach it to lost+found.
 */
static int check_unreachable_inode(struct btree_trans *trans,
				   struct btree_iter *iter,
				   struct bkey_s_c k)
{
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (!bkey_is_inode(k.k))
		return 0;

	struct bch_inode_unpacked inode;
	ret = bch2_inode_unpack(k, &inode);
	if (ret)
		return ret;

	if (!inode_should_reattach(&inode))
		return 0;

	ret = find_oldest_inode_needs_reattach(trans, &inode);
	if (ret)
		return ret;

	if (fsck_err(trans, inode_unreachable,
		     "unreachable inode:\n%s",
		     (bch2_inode_unpacked_to_text(&buf, &inode),
		      buf.buf)))
		ret = reattach_inode(trans, &inode);
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

/*
 * Reattach unreachable (but not unlinked) inodes
 *
 * Run after check_inodes() and check_dirents(), so we know that inode
 * backpointer fields point to valid dirents, and every inode that has a dirent
 * that points to it has its backpointer field set - so we're just looking for
 * non-unlinked inodes without backpointers:
 *
 * XXX: this is racy w.r.t. 
hardlink removal in online fsck
 */
int bch2_check_unreachable_inodes(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
				POS_MIN,
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_unreachable_inode(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Which inode modes may legitimately own keys in @btree: extents belong to
 * regular files and symlinks, dirents to directories, xattrs to anything.
 */
static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode)
{
	switch (btree) {
	case BTREE_ID_extents:
		return S_ISREG(mode) || S_ISLNK(mode);
	case BTREE_ID_dirents:
		return S_ISDIR(mode);
	case BTREE_ID_xattrs:
		return true;
	default:
		BUG();
	}
}

/*
 * Verify that key @k has a matching inode of the right mode; when it doesn't,
 * either delete the stray keys or reconstruct the missing inode, depending on
 * how much data is present and whether the inodes btree was lost.
 *
 * @i is the walker entry for k's snapshot (may be NULL or an ERR_PTR).
 */
static int check_key_has_inode(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct inode_walker *inode,
			       struct inode_walker_entry *i,
			       struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	struct btree_iter iter2 = {};
	int ret = PTR_ERR_OR_ZERO(i);
	if (ret)
		return ret;

	if (k.k->type == KEY_TYPE_whiteout)
		goto out;

	bool have_inode = i && !i->whiteout;

	/* inodes btree was lost: rebuild the inode rather than delete data */
	if (!have_inode && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes)))
		goto reconstruct;

	if (have_inode && btree_matches_i_mode(iter->btree_id, i->inode.bi_mode))
		goto out;

	prt_printf(&buf, ", ");

	/* is there a good inode in an older snapshot that covers this key? */
	bool have_old_inode = false;
	darray_for_each(inode->inodes, i2)
		if (!i2->whiteout &&
		    bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i2->inode.bi_snapshot) &&
		    btree_matches_i_mode(iter->btree_id, i2->inode.bi_mode)) {
			prt_printf(&buf, "but found good inode in older snapshot\n");
			bch2_inode_unpacked_to_text(&buf, &i2->inode);
			prt_newline(&buf);
			have_old_inode = true;
			break;
		}

	struct bkey_s_c k2;
	unsigned nr_keys = 0;

	prt_printf(&buf, "found keys:\n");

for_each_btree_key_max_norestart(trans, iter2, iter->btree_id, 1501 SPOS(k.k->p.inode, 0, k.k->p.snapshot), 1502 POS(k.k->p.inode, U64_MAX), 1503 0, k2, ret) { 1504 nr_keys++; 1505 if (nr_keys <= 10) { 1506 bch2_bkey_val_to_text(&buf, c, k2); 1507 prt_newline(&buf); 1508 } 1509 if (nr_keys >= 100) 1510 break; 1511 } 1512 1513 if (ret) 1514 goto err; 1515 1516 if (nr_keys > 100) 1517 prt_printf(&buf, "found > %u keys for this missing inode\n", nr_keys); 1518 else if (nr_keys > 10) 1519 prt_printf(&buf, "found %u keys for this missing inode\n", nr_keys); 1520 1521 if (!have_inode) { 1522 if (fsck_err_on(!have_inode, 1523 trans, key_in_missing_inode, 1524 "key in missing inode%s", buf.buf)) { 1525 /* 1526 * Maybe a deletion that raced with data move, or something 1527 * weird like that? But if we know the inode was deleted, or 1528 * it's just a few keys, we can safely delete them. 1529 * 1530 * If it's many keys, we should probably recreate the inode 1531 */ 1532 if (have_old_inode || nr_keys <= 2) 1533 goto delete; 1534 else 1535 goto reconstruct; 1536 } 1537 } else { 1538 /* 1539 * not autofix, this one would be a giant wtf - bit error in the 1540 * inode corrupting i_mode? 
1541 * 1542 * may want to try repairing inode instead of deleting 1543 */ 1544 if (fsck_err_on(!btree_matches_i_mode(iter->btree_id, i->inode.bi_mode), 1545 trans, key_in_wrong_inode_type, 1546 "key for wrong inode mode %o%s", 1547 i->inode.bi_mode, buf.buf)) 1548 goto delete; 1549 } 1550 out: 1551 err: 1552 fsck_err: 1553 bch2_trans_iter_exit(trans, &iter2); 1554 printbuf_exit(&buf); 1555 bch_err_fn(c, ret); 1556 return ret; 1557 delete: 1558 /* 1559 * XXX: print out more info 1560 * count up extents for this inode, check if we have different inode in 1561 * an older snapshot version, perhaps decide if we want to reconstitute 1562 */ 1563 ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node); 1564 goto out; 1565 reconstruct: 1566 ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?: 1567 bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); 1568 if (ret) 1569 goto err; 1570 1571 inode->last_pos.inode--; 1572 ret = bch_err_throw(c, transaction_restart_nested); 1573 goto out; 1574 } 1575 1576 static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w) 1577 { 1578 struct bch_fs *c = trans->c; 1579 int ret = 0; 1580 s64 count2; 1581 1582 darray_for_each(w->inodes, i) { 1583 if (i->inode.bi_sectors == i->count) 1584 continue; 1585 1586 count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->inode.bi_snapshot); 1587 1588 if (w->recalculate_sums) 1589 i->count = count2; 1590 1591 if (i->count != count2) { 1592 bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", 1593 w->last_pos.inode, i->inode.bi_snapshot, i->count, count2); 1594 i->count = count2; 1595 } 1596 1597 if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty), 1598 trans, inode_i_sectors_wrong, 1599 "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", 1600 w->last_pos.inode, i->inode.bi_snapshot, 1601 i->inode.bi_sectors, i->count)) { 1602 
i->inode.bi_sectors = i->count;
			ret = bch2_fsck_write_inode(trans, &i->inode);
			if (ret)
				break;
		}
	}
fsck_err:
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Wrapper that treats a transaction restart inside
 * check_i_sectors_notnested() as an error for the caller to handle.
 */
static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
{
	u32 restart_count = trans->restart_count;
	return check_i_sectors_notnested(trans, w) ?:
		trans_was_restarted(trans, restart_count);
}

/* End position of an extent previously seen in @snapshot, for overlap checks */
struct extent_end {
	u32 snapshot;
	u64 offset;
	struct snapshots_seen seen;
};

/* Per-inode list of extent ends, one entry per snapshot, sorted by snapshot */
struct extent_ends {
	struct bpos last_pos;
	DARRAY(struct extent_end) e;
};

static void extent_ends_reset(struct extent_ends *extent_ends)
{
	darray_for_each(extent_ends->e, i)
		snapshots_seen_exit(&i->seen);
	extent_ends->e.nr = 0;
}

static void extent_ends_exit(struct extent_ends *extent_ends)
{
	extent_ends_reset(extent_ends);
	darray_exit(&extent_ends->e);
}

static void extent_ends_init(struct extent_ends *extent_ends)
{
	memset(extent_ends, 0, sizeof(*extent_ends));
}

/*
 * Record the end position of extent @k: replace the existing entry for k's
 * snapshot if there is one, otherwise insert keeping the array sorted by
 * snapshot id.  Deep-copies @seen, since the caller's list keeps changing.
 */
static int extent_ends_at(struct bch_fs *c,
			  struct extent_ends *extent_ends,
			  struct snapshots_seen *seen,
			  struct bkey_s_c k)
{
	struct extent_end *i, n = (struct extent_end) {
		.offset		= k.k->p.offset,
		.snapshot	= k.k->p.snapshot,
		.seen		= *seen,
	};

	n.seen.ids.data = kmemdup(seen->ids.data,
				  sizeof(seen->ids.data[0]) * seen->ids.size,
				  GFP_KERNEL);
	if (!n.seen.ids.data)
		return bch_err_throw(c, ENOMEM_fsck_extent_ends_at);

	__darray_for_each(extent_ends->e, i) {
		if (i->snapshot == k.k->p.snapshot) {
			/* same snapshot: replace, freeing the old seen list */
			snapshots_seen_exit(&i->seen);
			*i = n;
			return 0;
		}

		if (i->snapshot >= k.k->p.snapshot)
			break;
	}

	return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
}

static int 
overlapping_extents_found(struct btree_trans *trans, 1681 enum btree_id btree, 1682 struct bpos pos1, struct snapshots_seen *pos1_seen, 1683 struct bkey pos2, 1684 bool *fixed, 1685 struct extent_end *extent_end) 1686 { 1687 struct bch_fs *c = trans->c; 1688 struct printbuf buf = PRINTBUF; 1689 struct btree_iter iter1, iter2 = {}; 1690 struct bkey_s_c k1, k2; 1691 int ret; 1692 1693 BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2))); 1694 1695 bch2_trans_iter_init(trans, &iter1, btree, pos1, 1696 BTREE_ITER_all_snapshots| 1697 BTREE_ITER_not_extents); 1698 k1 = bch2_btree_iter_peek_max(trans, &iter1, POS(pos1.inode, U64_MAX)); 1699 ret = bkey_err(k1); 1700 if (ret) 1701 goto err; 1702 1703 prt_newline(&buf); 1704 bch2_bkey_val_to_text(&buf, c, k1); 1705 1706 if (!bpos_eq(pos1, k1.k->p)) { 1707 prt_str(&buf, "\nwanted\n "); 1708 bch2_bpos_to_text(&buf, pos1); 1709 prt_str(&buf, "\n"); 1710 bch2_bkey_to_text(&buf, &pos2); 1711 1712 bch_err(c, "%s: error finding first overlapping extent when repairing, got%s", 1713 __func__, buf.buf); 1714 ret = bch_err_throw(c, internal_fsck_err); 1715 goto err; 1716 } 1717 1718 bch2_trans_copy_iter(trans, &iter2, &iter1); 1719 1720 while (1) { 1721 bch2_btree_iter_advance(trans, &iter2); 1722 1723 k2 = bch2_btree_iter_peek_max(trans, &iter2, POS(pos1.inode, U64_MAX)); 1724 ret = bkey_err(k2); 1725 if (ret) 1726 goto err; 1727 1728 if (bpos_ge(k2.k->p, pos2.p)) 1729 break; 1730 } 1731 1732 prt_newline(&buf); 1733 bch2_bkey_val_to_text(&buf, c, k2); 1734 1735 if (bpos_gt(k2.k->p, pos2.p) || 1736 pos2.size != k2.k->size) { 1737 bch_err(c, "%s: error finding seconding overlapping extent when repairing%s", 1738 __func__, buf.buf); 1739 ret = bch_err_throw(c, internal_fsck_err); 1740 goto err; 1741 } 1742 1743 prt_printf(&buf, "\noverwriting %s extent", 1744 pos1.snapshot >= pos2.p.snapshot ? 
"first" : "second"); 1745 1746 if (fsck_err(trans, extent_overlapping, 1747 "overlapping extents%s", buf.buf)) { 1748 struct btree_iter *old_iter = &iter1; 1749 struct disk_reservation res = { 0 }; 1750 1751 if (pos1.snapshot < pos2.p.snapshot) { 1752 old_iter = &iter2; 1753 swap(k1, k2); 1754 } 1755 1756 trans->extra_disk_res += bch2_bkey_sectors_compressed(k2); 1757 1758 ret = bch2_trans_update_extent_overwrite(trans, old_iter, 1759 BTREE_UPDATE_internal_snapshot_node, 1760 k1, k2) ?: 1761 bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc); 1762 bch2_disk_reservation_put(c, &res); 1763 1764 bch_info(c, "repair ret %s", bch2_err_str(ret)); 1765 1766 if (ret) 1767 goto err; 1768 1769 *fixed = true; 1770 1771 if (pos1.snapshot == pos2.p.snapshot) { 1772 /* 1773 * We overwrote the first extent, and did the overwrite 1774 * in the same snapshot: 1775 */ 1776 extent_end->offset = bkey_start_offset(&pos2); 1777 } else if (pos1.snapshot > pos2.p.snapshot) { 1778 /* 1779 * We overwrote the first extent in pos2's snapshot: 1780 */ 1781 ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot); 1782 } else { 1783 /* 1784 * We overwrote the second extent - restart 1785 * check_extent() from the top: 1786 */ 1787 ret = bch_err_throw(c, transaction_restart_nested); 1788 } 1789 } 1790 fsck_err: 1791 err: 1792 bch2_trans_iter_exit(trans, &iter2); 1793 bch2_trans_iter_exit(trans, &iter1); 1794 printbuf_exit(&buf); 1795 return ret; 1796 } 1797 1798 static int check_overlapping_extents(struct btree_trans *trans, 1799 struct snapshots_seen *seen, 1800 struct extent_ends *extent_ends, 1801 struct bkey_s_c k, 1802 struct btree_iter *iter, 1803 bool *fixed) 1804 { 1805 struct bch_fs *c = trans->c; 1806 int ret = 0; 1807 1808 /* transaction restart, running again */ 1809 if (bpos_eq(extent_ends->last_pos, k.k->p)) 1810 return 0; 1811 1812 if (extent_ends->last_pos.inode != k.k->p.inode) 1813 extent_ends_reset(extent_ends); 1814 1815 darray_for_each(extent_ends->e, i) 
{
		/* previous extent ends at or before this one starts: no overlap */
		if (i->offset <= bkey_start_offset(k.k))
			continue;

		/* only an overlap if the two extents are visible to each other */
		if (!ref_visible2(c,
				  k.k->p.snapshot, seen,
				  i->snapshot, &i->seen))
			continue;

		ret = overlapping_extents_found(trans, iter->btree_id,
						SPOS(iter->pos.inode,
						     i->offset,
						     i->snapshot),
						&i->seen,
						*k.k, fixed, i);
		if (ret)
			goto err;
	}

	extent_ends->last_pos = k.k->p;
err:
	return ret;
}

/*
 * Report (but do not repair) encoded extents whose uncompressed size exceeds
 * the filesystem's encoded_extent_max — these shouldn't exist, so ask users to
 * report them.  Always returns 0.
 */
static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
				struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	struct bch_extent_crc_unpacked crc;
	const union bch_extent_entry *i;
	unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;

	bkey_for_each_crc(k.k, ptrs, crc, i)
		if (crc_is_encoded(crc) &&
		    crc.uncompressed_size > encoded_extent_max_sectors) {
			struct printbuf buf = PRINTBUF;

			bch2_bkey_val_to_text(&buf, c, k);
			bch_err(c, "overbig encoded extent, please report this:\n  %s", buf.buf);
			printbuf_exit(&buf);
		}

	return 0;
}

/*
 * Per-extent fsck pass: check the extent's snapshot, that it has a matching
 * inode, that it doesn't overlap other extents or extend past i_size, and
 * accumulate sector counts into the inode walker.
 */
static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
			struct bkey_s_c k,
			struct inode_walker *inode,
			struct snapshots_seen *s,
			struct extent_ends *extent_ends,
			struct disk_reservation *res)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret) {
		/* positive return: key was deleted, not an error */
		ret = ret < 0 ? 
ret : 0; 1875 goto out; 1876 } 1877 1878 if (inode->last_pos.inode != k.k->p.inode && inode->have_inodes) { 1879 ret = check_i_sectors(trans, inode); 1880 if (ret) 1881 goto err; 1882 } 1883 1884 ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); 1885 if (ret) 1886 goto err; 1887 1888 struct inode_walker_entry *extent_i = walk_inode(trans, inode, k); 1889 ret = PTR_ERR_OR_ZERO(extent_i); 1890 if (ret) 1891 goto err; 1892 1893 ret = check_key_has_inode(trans, iter, inode, extent_i, k); 1894 if (ret) 1895 goto err; 1896 1897 if (k.k->type != KEY_TYPE_whiteout) { 1898 ret = check_overlapping_extents(trans, s, extent_ends, k, iter, 1899 &inode->recalculate_sums); 1900 if (ret) 1901 goto err; 1902 1903 /* 1904 * Check inodes in reverse order, from oldest snapshots to 1905 * newest, starting from the inode that matches this extent's 1906 * snapshot. If we didn't have one, iterate over all inodes: 1907 */ 1908 for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); 1909 inode->inodes.data && i >= inode->inodes.data; 1910 --i) { 1911 if (i->inode.bi_snapshot > k.k->p.snapshot || 1912 !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot)) 1913 continue; 1914 1915 u64 last_block = round_up(i->inode.bi_size, block_bytes(c)) >> 9; 1916 1917 if (fsck_err_on(k.k->p.offset > last_block && 1918 !bkey_extent_is_reservation(k), 1919 trans, extent_past_end_of_inode, 1920 "extent type past end of inode %llu:%u, i_size %llu\n%s", 1921 i->inode.bi_inum, i->inode.bi_snapshot, i->inode.bi_size, 1922 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { 1923 ret = snapshots_seen_add_inorder(c, s, i->inode.bi_snapshot) ?: 1924 bch2_fpunch_snapshot(trans, 1925 SPOS(i->inode.bi_inum, 1926 last_block, 1927 i->inode.bi_snapshot), 1928 POS(i->inode.bi_inum, U64_MAX)); 1929 if (ret) 1930 goto err; 1931 1932 iter->k.type = KEY_TYPE_whiteout; 1933 break; 1934 } 1935 } 1936 } 1937 1938 ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc); 1939 
if (ret) 1940 goto err; 1941 1942 if (bkey_extent_is_allocation(k.k)) { 1943 for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); 1944 inode->inodes.data && i >= inode->inodes.data; 1945 --i) { 1946 if (i->whiteout || 1947 i->inode.bi_snapshot > k.k->p.snapshot || 1948 !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot)) 1949 continue; 1950 1951 i->count += k.k->size; 1952 } 1953 } 1954 1955 if (k.k->type != KEY_TYPE_whiteout) { 1956 ret = extent_ends_at(c, extent_ends, s, k); 1957 if (ret) 1958 goto err; 1959 } 1960 out: 1961 err: 1962 fsck_err: 1963 printbuf_exit(&buf); 1964 bch_err_fn(c, ret); 1965 return ret; 1966 } 1967 1968 /* 1969 * Walk extents: verify that extents have a corresponding S_ISREG inode, and 1970 * that i_size an i_sectors are consistent 1971 */ 1972 int bch2_check_extents(struct bch_fs *c) 1973 { 1974 struct inode_walker w = inode_walker_init(); 1975 struct snapshots_seen s; 1976 struct extent_ends extent_ends; 1977 struct disk_reservation res = { 0 }; 1978 1979 snapshots_seen_init(&s); 1980 extent_ends_init(&extent_ends); 1981 1982 int ret = bch2_trans_run(c, 1983 for_each_btree_key(trans, iter, BTREE_ID_extents, 1984 POS(BCACHEFS_ROOT_INO, 0), 1985 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ 1986 bch2_disk_reservation_put(c, &res); 1987 check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?: 1988 check_extent_overbig(trans, &iter, k); 1989 })) ?: 1990 check_i_sectors_notnested(trans, &w)); 1991 1992 bch2_disk_reservation_put(c, &res); 1993 extent_ends_exit(&extent_ends); 1994 inode_walker_exit(&w); 1995 snapshots_seen_exit(&s); 1996 1997 bch_err_fn(c, ret); 1998 return ret; 1999 } 2000 2001 int bch2_check_indirect_extents(struct bch_fs *c) 2002 { 2003 struct disk_reservation res = { 0 }; 2004 2005 int ret = bch2_trans_run(c, 2006 for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, 2007 POS_MIN, 2008 BTREE_ITER_prefetch, k, 2009 &res, NULL, 2010 BCH_TRANS_COMMIT_no_enospc, ({ 2011 
bch2_disk_reservation_put(c, &res);
			check_extent_overbig(trans, &iter, k);
		})));

	bch2_disk_reservation_put(c, &res);
	bch_err_fn(c, ret);
	return ret;
}

/*
 * For every directory inode in the walker, verify i_nlink against the fsck
 * counted subdirectory total; recount via bch2_count_subdirs() on mismatch
 * and rewrite the inode (with confirmation) when it's genuinely wrong.
 */
static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w)
{
	struct bch_fs *c = trans->c;
	int ret = 0;
	s64 count2;

	darray_for_each(w->inodes, i) {
		if (i->inode.bi_nlink == i->count)
			continue;

		/* recount to rule out a miscount by this pass itself */
		count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->inode.bi_snapshot);
		if (count2 < 0)
			return count2;

		if (i->count != count2) {
			bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
					    w->last_pos.inode, i->inode.bi_snapshot, i->count, count2);
			i->count = count2;
			if (i->inode.bi_nlink == i->count)
				continue;
		}

		if (i->inode.bi_nlink != i->count) {
			CLASS(printbuf, buf)();

			lockrestart_do(trans,
				       bch2_inum_snapshot_to_path(trans, w->last_pos.inode,
								  i->inode.bi_snapshot, NULL, &buf));

			if (fsck_err_on(i->inode.bi_nlink != i->count,
					trans, inode_dir_wrong_nlink,
					"directory with wrong i_nlink: got %u, should be %llu\n%s",
					i->inode.bi_nlink, i->count, buf.buf)) {
				i->inode.bi_nlink = i->count;
				ret = bch2_fsck_write_inode(trans, &i->inode);
				if (ret)
					break;
			}
		}
	}
fsck_err:
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Wrapper that treats a transaction restart inside
 * check_subdir_count_notnested() as an error for the caller to handle.
 */
static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_walker *w)
{
	u32 restart_count = trans->restart_count;
	return check_subdir_count_notnested(trans, w) ?:
		trans_was_restarted(trans, restart_count);
}

/* find a subvolume that's a descendent of @snapshot: */
static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) { 2080 if (k.k->type != KEY_TYPE_subvolume) 2081 continue; 2082 2083 struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); 2084 if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) { 2085 bch2_trans_iter_exit(trans, &iter); 2086 *subvolid = k.k->p.offset; 2087 goto found; 2088 } 2089 } 2090 if (!ret) 2091 ret = -ENOENT; 2092 found: 2093 bch2_trans_iter_exit(trans, &iter); 2094 return ret; 2095 } 2096 2097 noinline_for_stack 2098 static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter, 2099 struct bkey_s_c_dirent d) 2100 { 2101 struct bch_fs *c = trans->c; 2102 struct btree_iter subvol_iter = {}; 2103 struct bch_inode_unpacked subvol_root; 2104 u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol); 2105 u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); 2106 u32 parent_snapshot; 2107 u32 new_parent_subvol = 0; 2108 u64 parent_inum; 2109 struct printbuf buf = PRINTBUF; 2110 int ret = 0; 2111 2112 ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum); 2113 if (ret && !bch2_err_matches(ret, ENOENT)) 2114 return ret; 2115 2116 if (ret || 2117 (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) { 2118 int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol); 2119 if (ret2 && !bch2_err_matches(ret, ENOENT)) 2120 return ret2; 2121 } 2122 2123 if (ret && 2124 !new_parent_subvol && 2125 (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) { 2126 /* 2127 * Couldn't find a subvol for dirent's snapshot - but we lost 2128 * subvols, so we need to reconstruct: 2129 */ 2130 ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0); 2131 if (ret) 2132 return ret; 2133 2134 parent_snapshot = d.k->p.snapshot; 2135 } 2136 2137 if (fsck_err_on(ret, 2138 trans, dirent_to_missing_parent_subvol, 2139 "dirent parent_subvol points to missing subvolume\n%s", 2140 
(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) || 2141 fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot), 2142 trans, dirent_not_visible_in_parent_subvol, 2143 "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s", 2144 parent_snapshot, 2145 (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { 2146 if (!new_parent_subvol) { 2147 bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot); 2148 return bch_err_throw(c, fsck_repair_unimplemented); 2149 } 2150 2151 struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent); 2152 ret = PTR_ERR_OR_ZERO(new_dirent); 2153 if (ret) 2154 goto err; 2155 2156 new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol); 2157 } 2158 2159 struct bkey_s_c_subvolume s = 2160 bch2_bkey_get_iter_typed(trans, &subvol_iter, 2161 BTREE_ID_subvolumes, POS(0, target_subvol), 2162 0, subvolume); 2163 ret = bkey_err(s.s_c); 2164 if (ret && !bch2_err_matches(ret, ENOENT)) 2165 goto err; 2166 2167 if (ret) { 2168 if (fsck_err(trans, dirent_to_missing_subvol, 2169 "dirent points to missing subvolume\n%s", 2170 (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) 2171 return bch2_fsck_remove_dirent(trans, d.k->p); 2172 ret = 0; 2173 goto out; 2174 } 2175 2176 if (le32_to_cpu(s.v->fs_path_parent) != parent_subvol) { 2177 printbuf_reset(&buf); 2178 2179 prt_printf(&buf, "subvol with wrong fs_path_parent, should be be %u\n", 2180 parent_subvol); 2181 2182 ret = bch2_inum_to_path(trans, (subvol_inum) { s.k->p.offset, 2183 le64_to_cpu(s.v->inode) }, &buf); 2184 if (ret) 2185 goto err; 2186 prt_newline(&buf); 2187 bch2_bkey_val_to_text(&buf, c, s.s_c); 2188 2189 if (fsck_err(trans, subvol_fs_path_parent_wrong, "%s", buf.buf)) { 2190 struct bkey_i_subvolume *n = 2191 bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume); 2192 ret = PTR_ERR_OR_ZERO(n); 2193 if (ret) 2194 goto err; 2195 2196 n->v.fs_path_parent = 
cpu_to_le32(parent_subvol); 2197 } 2198 } 2199 2200 u64 target_inum = le64_to_cpu(s.v->inode); 2201 u32 target_snapshot = le32_to_cpu(s.v->snapshot); 2202 2203 ret = bch2_inode_find_by_inum_snapshot(trans, target_inum, target_snapshot, 2204 &subvol_root, 0); 2205 if (ret && !bch2_err_matches(ret, ENOENT)) 2206 goto err; 2207 2208 if (ret) { 2209 bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum); 2210 ret = bch_err_throw(c, fsck_repair_unimplemented); 2211 goto err; 2212 } 2213 2214 if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol, 2215 trans, inode_bi_parent_wrong, 2216 "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u", 2217 target_inum, 2218 subvol_root.bi_parent_subvol, parent_subvol)) { 2219 subvol_root.bi_parent_subvol = parent_subvol; 2220 subvol_root.bi_snapshot = le32_to_cpu(s.v->snapshot); 2221 ret = __bch2_fsck_write_inode(trans, &subvol_root); 2222 if (ret) 2223 goto err; 2224 } 2225 2226 ret = bch2_check_dirent_target(trans, iter, d, &subvol_root, true); 2227 if (ret) 2228 goto err; 2229 out: 2230 err: 2231 fsck_err: 2232 bch2_trans_iter_exit(trans, &subvol_iter); 2233 printbuf_exit(&buf); 2234 return ret; 2235 } 2236 2237 static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, 2238 struct bkey_s_c k, 2239 struct bch_hash_info *hash_info, 2240 struct inode_walker *dir, 2241 struct inode_walker *target, 2242 struct snapshots_seen *s, 2243 bool *need_second_pass) 2244 { 2245 struct bch_fs *c = trans->c; 2246 struct inode_walker_entry *i; 2247 struct printbuf buf = PRINTBUF; 2248 int ret = 0; 2249 2250 ret = bch2_check_key_has_snapshot(trans, iter, k); 2251 if (ret) { 2252 ret = ret < 0 ? 
ret : 0; 2253 goto out; 2254 } 2255 2256 ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); 2257 if (ret) 2258 goto err; 2259 2260 if (k.k->type == KEY_TYPE_whiteout) 2261 goto out; 2262 2263 if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) { 2264 ret = check_subdir_dirents_count(trans, dir); 2265 if (ret) 2266 goto err; 2267 } 2268 2269 i = walk_inode(trans, dir, k); 2270 ret = PTR_ERR_OR_ZERO(i); 2271 if (ret < 0) 2272 goto err; 2273 2274 ret = check_key_has_inode(trans, iter, dir, i, k); 2275 if (ret) 2276 goto err; 2277 2278 if (!i || i->whiteout) 2279 goto out; 2280 2281 if (dir->first_this_inode) 2282 *hash_info = bch2_hash_info_init(c, &i->inode); 2283 dir->first_this_inode = false; 2284 2285 hash_info->cf_encoding = bch2_inode_casefold(c, &i->inode) ? c->cf_encoding : NULL; 2286 2287 ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, 2288 iter, k, need_second_pass); 2289 if (ret < 0) 2290 goto err; 2291 if (ret) { 2292 /* dirent has been deleted */ 2293 ret = 0; 2294 goto out; 2295 } 2296 2297 if (k.k->type != KEY_TYPE_dirent) 2298 goto out; 2299 2300 struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); 2301 2302 /* check casefold */ 2303 if (fsck_err_on(d.v->d_casefold != !!hash_info->cf_encoding, 2304 trans, dirent_casefold_mismatch, 2305 "dirent casefold does not match dir casefold\n%s", 2306 (printbuf_reset(&buf), 2307 bch2_bkey_val_to_text(&buf, c, k), 2308 buf.buf))) { 2309 subvol_inum dir_inum = { .subvol = d.v->d_type == DT_SUBVOL 2310 ? le32_to_cpu(d.v->d_parent_subvol) 2311 : 0, 2312 }; 2313 u64 target = d.v->d_type == DT_SUBVOL 2314 ? 
le32_to_cpu(d.v->d_child_subvol) 2315 : le64_to_cpu(d.v->d_inum); 2316 struct qstr name = bch2_dirent_get_name(d); 2317 2318 struct bkey_i_dirent *new_d = 2319 bch2_dirent_create_key(trans, hash_info, dir_inum, 2320 d.v->d_type, &name, NULL, target); 2321 ret = PTR_ERR_OR_ZERO(new_d); 2322 if (ret) 2323 goto out; 2324 2325 new_d->k.p.inode = d.k->p.inode; 2326 new_d->k.p.snapshot = d.k->p.snapshot; 2327 2328 struct btree_iter dup_iter = {}; 2329 ret = bch2_hash_delete_at(trans, 2330 bch2_dirent_hash_desc, hash_info, iter, 2331 BTREE_UPDATE_internal_snapshot_node) ?: 2332 bch2_str_hash_repair_key(trans, s, 2333 &bch2_dirent_hash_desc, hash_info, 2334 iter, bkey_i_to_s_c(&new_d->k_i), 2335 &dup_iter, bkey_s_c_null, 2336 need_second_pass); 2337 goto out; 2338 } 2339 2340 if (d.v->d_type == DT_SUBVOL) { 2341 ret = check_dirent_to_subvol(trans, iter, d); 2342 if (ret) 2343 goto err; 2344 } else { 2345 ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum)); 2346 if (ret) 2347 goto err; 2348 2349 if (fsck_err_on(!target->inodes.nr, 2350 trans, dirent_to_missing_inode, 2351 "dirent points to missing inode:\n%s", 2352 (printbuf_reset(&buf), 2353 bch2_bkey_val_to_text(&buf, c, k), 2354 buf.buf))) { 2355 ret = bch2_fsck_remove_dirent(trans, d.k->p); 2356 if (ret) 2357 goto err; 2358 } 2359 2360 darray_for_each(target->inodes, i) { 2361 ret = bch2_check_dirent_target(trans, iter, d, &i->inode, true); 2362 if (ret) 2363 goto err; 2364 } 2365 2366 darray_for_each(target->deletes, i) 2367 if (fsck_err_on(!snapshot_list_has_id(&s->ids, *i), 2368 trans, dirent_to_overwritten_inode, 2369 "dirent points to inode overwritten in snapshot %u:\n%s", 2370 *i, 2371 (printbuf_reset(&buf), 2372 bch2_bkey_val_to_text(&buf, c, k), 2373 buf.buf))) { 2374 struct btree_iter delete_iter; 2375 bch2_trans_iter_init(trans, &delete_iter, 2376 BTREE_ID_dirents, 2377 SPOS(k.k->p.inode, k.k->p.offset, *i), 2378 BTREE_ITER_intent); 2379 ret = bch2_btree_iter_traverse(trans, &delete_iter) ?: 
2380 bch2_hash_delete_at(trans, bch2_dirent_hash_desc, 2381 hash_info, 2382 &delete_iter, 2383 BTREE_UPDATE_internal_snapshot_node); 2384 bch2_trans_iter_exit(trans, &delete_iter); 2385 if (ret) 2386 goto err; 2387 2388 } 2389 } 2390 2391 ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); 2392 if (ret) 2393 goto err; 2394 2395 for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) { 2396 if (d.v->d_type == DT_DIR) 2397 i->count++; 2398 i->i_size += bkey_bytes(d.k); 2399 } 2400 out: 2401 err: 2402 fsck_err: 2403 printbuf_exit(&buf); 2404 return ret; 2405 } 2406 2407 /* 2408 * Walk dirents: verify that they all have a corresponding S_ISDIR inode, 2409 * validate d_type 2410 */ 2411 int bch2_check_dirents(struct bch_fs *c) 2412 { 2413 struct inode_walker dir = inode_walker_init(); 2414 struct inode_walker target = inode_walker_init(); 2415 struct snapshots_seen s; 2416 struct bch_hash_info hash_info; 2417 bool need_second_pass = false, did_second_pass = false; 2418 int ret; 2419 2420 snapshots_seen_init(&s); 2421 again: 2422 ret = bch2_trans_run(c, 2423 for_each_btree_key_commit(trans, iter, BTREE_ID_dirents, 2424 POS(BCACHEFS_ROOT_INO, 0), 2425 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, 2426 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 2427 check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s, 2428 &need_second_pass)) ?: 2429 check_subdir_count_notnested(trans, &dir)); 2430 2431 if (!ret && need_second_pass && !did_second_pass) { 2432 bch_info(c, "check_dirents requires second pass"); 2433 swap(did_second_pass, need_second_pass); 2434 goto again; 2435 } 2436 2437 if (!ret && need_second_pass) { 2438 bch_err(c, "dirents not repairing"); 2439 ret = -EINVAL; 2440 } 2441 2442 snapshots_seen_exit(&s); 2443 inode_walker_exit(&dir); 2444 inode_walker_exit(&target); 2445 bch_err_fn(c, ret); 2446 return ret; 2447 } 2448 2449 static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, 2450 struct bkey_s_c k, 2451 struct bch_hash_info 
*hash_info, 2452 struct inode_walker *inode) 2453 { 2454 struct bch_fs *c = trans->c; 2455 2456 int ret = bch2_check_key_has_snapshot(trans, iter, k); 2457 if (ret < 0) 2458 return ret; 2459 if (ret) 2460 return 0; 2461 2462 struct inode_walker_entry *i = walk_inode(trans, inode, k); 2463 ret = PTR_ERR_OR_ZERO(i); 2464 if (ret) 2465 return ret; 2466 2467 ret = check_key_has_inode(trans, iter, inode, i, k); 2468 if (ret) 2469 return ret; 2470 2471 if (!i || i->whiteout) 2472 return 0; 2473 2474 if (inode->first_this_inode) 2475 *hash_info = bch2_hash_info_init(c, &i->inode); 2476 inode->first_this_inode = false; 2477 2478 bool need_second_pass = false; 2479 return bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, 2480 iter, k, &need_second_pass); 2481 } 2482 2483 /* 2484 * Walk xattrs: verify that they all have a corresponding inode 2485 */ 2486 int bch2_check_xattrs(struct bch_fs *c) 2487 { 2488 struct inode_walker inode = inode_walker_init(); 2489 struct bch_hash_info hash_info; 2490 int ret = 0; 2491 2492 ret = bch2_trans_run(c, 2493 for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, 2494 POS(BCACHEFS_ROOT_INO, 0), 2495 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, 2496 k, 2497 NULL, NULL, 2498 BCH_TRANS_COMMIT_no_enospc, 2499 check_xattr(trans, &iter, k, &hash_info, &inode))); 2500 2501 inode_walker_exit(&inode); 2502 bch_err_fn(c, ret); 2503 return ret; 2504 } 2505 2506 static int check_root_trans(struct btree_trans *trans) 2507 { 2508 struct bch_fs *c = trans->c; 2509 struct bch_inode_unpacked root_inode; 2510 u32 snapshot; 2511 u64 inum; 2512 int ret; 2513 2514 ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum); 2515 if (ret && !bch2_err_matches(ret, ENOENT)) 2516 return ret; 2517 2518 if (mustfix_fsck_err_on(ret, trans, root_subvol_missing, 2519 "root subvol missing")) { 2520 struct bkey_i_subvolume *root_subvol = 2521 bch2_trans_kmalloc(trans, sizeof(*root_subvol)); 2522 ret = PTR_ERR_OR_ZERO(root_subvol); 2523 if 
(ret) 2524 goto err; 2525 2526 snapshot = U32_MAX; 2527 inum = BCACHEFS_ROOT_INO; 2528 2529 bkey_subvolume_init(&root_subvol->k_i); 2530 root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL; 2531 root_subvol->v.flags = 0; 2532 root_subvol->v.snapshot = cpu_to_le32(snapshot); 2533 root_subvol->v.inode = cpu_to_le64(inum); 2534 ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0); 2535 bch_err_msg(c, ret, "writing root subvol"); 2536 if (ret) 2537 goto err; 2538 } 2539 2540 ret = bch2_inode_find_by_inum_snapshot(trans, BCACHEFS_ROOT_INO, snapshot, 2541 &root_inode, 0); 2542 if (ret && !bch2_err_matches(ret, ENOENT)) 2543 return ret; 2544 2545 if (mustfix_fsck_err_on(ret, 2546 trans, root_dir_missing, 2547 "root directory missing") || 2548 mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), 2549 trans, root_inode_not_dir, 2550 "root inode not a directory")) { 2551 bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 2552 0, NULL); 2553 root_inode.bi_inum = inum; 2554 root_inode.bi_snapshot = snapshot; 2555 2556 ret = __bch2_fsck_write_inode(trans, &root_inode); 2557 bch_err_msg(c, ret, "writing root inode"); 2558 } 2559 err: 2560 fsck_err: 2561 return ret; 2562 } 2563 2564 /* Get root directory, create if it doesn't exist: */ 2565 int bch2_check_root(struct bch_fs *c) 2566 { 2567 int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 2568 check_root_trans(trans)); 2569 bch_err_fn(c, ret); 2570 return ret; 2571 } 2572 2573 static bool darray_u32_has(darray_u32 *d, u32 v) 2574 { 2575 darray_for_each(*d, i) 2576 if (*i == v) 2577 return true; 2578 return false; 2579 } 2580 2581 static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) 2582 { 2583 struct bch_fs *c = trans->c; 2584 struct btree_iter parent_iter = {}; 2585 darray_u32 subvol_path = {}; 2586 struct printbuf buf = PRINTBUF; 2587 int ret = 0; 2588 2589 if (k.k->type != KEY_TYPE_subvolume) 2590 return 0; 2591 2592 subvol_inum start = 
{ 2593 .subvol = k.k->p.offset, 2594 .inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode), 2595 }; 2596 2597 while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) { 2598 ret = darray_push(&subvol_path, k.k->p.offset); 2599 if (ret) 2600 goto err; 2601 2602 struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); 2603 2604 struct bch_inode_unpacked subvol_root; 2605 ret = bch2_inode_find_by_inum_trans(trans, 2606 (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) }, 2607 &subvol_root); 2608 if (ret) 2609 break; 2610 2611 u32 parent = le32_to_cpu(s.v->fs_path_parent); 2612 2613 if (darray_u32_has(&subvol_path, parent)) { 2614 printbuf_reset(&buf); 2615 prt_printf(&buf, "subvolume loop: "); 2616 2617 ret = bch2_inum_to_path(trans, start, &buf); 2618 if (ret) 2619 goto err; 2620 2621 if (fsck_err(trans, subvol_loop, "%s", buf.buf)) 2622 ret = reattach_subvol(trans, s); 2623 break; 2624 } 2625 2626 bch2_trans_iter_exit(trans, &parent_iter); 2627 bch2_trans_iter_init(trans, &parent_iter, 2628 BTREE_ID_subvolumes, POS(0, parent), 0); 2629 k = bch2_btree_iter_peek_slot(trans, &parent_iter); 2630 ret = bkey_err(k); 2631 if (ret) 2632 goto err; 2633 2634 if (fsck_err_on(k.k->type != KEY_TYPE_subvolume, 2635 trans, subvol_unreachable, 2636 "unreachable subvolume %s", 2637 (printbuf_reset(&buf), 2638 bch2_bkey_val_to_text(&buf, c, s.s_c), 2639 buf.buf))) { 2640 ret = reattach_subvol(trans, s); 2641 break; 2642 } 2643 } 2644 fsck_err: 2645 err: 2646 printbuf_exit(&buf); 2647 darray_exit(&subvol_path); 2648 bch2_trans_iter_exit(trans, &parent_iter); 2649 return ret; 2650 } 2651 2652 int bch2_check_subvolume_structure(struct bch_fs *c) 2653 { 2654 int ret = bch2_trans_run(c, 2655 for_each_btree_key_commit(trans, iter, 2656 BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, 2657 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 2658 check_subvol_path(trans, &iter, k))); 2659 bch_err_fn(c, ret); 2660 return ret; 2661 } 2662 2663 static int bch2_bi_depth_renumber_one(struct btree_trans 
*trans, 2664 u64 inum, u32 snapshot, 2665 u32 new_depth) 2666 { 2667 struct btree_iter iter; 2668 struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, 2669 SPOS(0, inum, snapshot), 0); 2670 2671 struct bch_inode_unpacked inode; 2672 int ret = bkey_err(k) ?: 2673 !bkey_is_inode(k.k) ? -BCH_ERR_ENOENT_inode 2674 : bch2_inode_unpack(k, &inode); 2675 if (ret) 2676 goto err; 2677 2678 if (inode.bi_depth != new_depth) { 2679 inode.bi_depth = new_depth; 2680 ret = __bch2_fsck_write_inode(trans, &inode) ?: 2681 bch2_trans_commit(trans, NULL, NULL, 0); 2682 } 2683 err: 2684 bch2_trans_iter_exit(trans, &iter); 2685 return ret; 2686 } 2687 2688 static int bch2_bi_depth_renumber(struct btree_trans *trans, darray_u64 *path, 2689 u32 snapshot, u32 new_bi_depth) 2690 { 2691 u32 restart_count = trans->restart_count; 2692 int ret = 0; 2693 2694 darray_for_each_reverse(*path, i) { 2695 ret = nested_lockrestart_do(trans, 2696 bch2_bi_depth_renumber_one(trans, *i, snapshot, new_bi_depth)); 2697 bch_err_fn(trans->c, ret); 2698 if (ret) 2699 break; 2700 2701 new_bi_depth++; 2702 } 2703 2704 return ret ?: trans_was_restarted(trans, restart_count); 2705 } 2706 2707 static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) 2708 { 2709 struct bch_fs *c = trans->c; 2710 struct btree_iter inode_iter = {}; 2711 darray_u64 path = {}; 2712 struct printbuf buf = PRINTBUF; 2713 u32 snapshot = inode_k.k->p.snapshot; 2714 bool redo_bi_depth = false; 2715 u32 min_bi_depth = U32_MAX; 2716 int ret = 0; 2717 2718 struct bpos start = inode_k.k->p; 2719 2720 struct bch_inode_unpacked inode; 2721 ret = bch2_inode_unpack(inode_k, &inode); 2722 if (ret) 2723 return ret; 2724 2725 /* 2726 * If we're running full fsck, check_dirents() will have already ran, 2727 * and we shouldn't see any missing backpointers here - otherwise that's 2728 * handled separately, by check_unreachable_inodes 2729 */ 2730 while (!inode.bi_subvol && 2731 bch2_inode_has_backpointer(&inode)) { 2732 
struct btree_iter dirent_iter; 2733 struct bkey_s_c_dirent d; 2734 2735 d = dirent_get_by_pos(trans, &dirent_iter, 2736 SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot)); 2737 ret = bkey_err(d.s_c); 2738 if (ret && !bch2_err_matches(ret, ENOENT)) 2739 goto out; 2740 2741 if (!ret && (ret = dirent_points_to_inode(c, d, &inode))) 2742 bch2_trans_iter_exit(trans, &dirent_iter); 2743 2744 if (bch2_err_matches(ret, ENOENT)) { 2745 printbuf_reset(&buf); 2746 bch2_bkey_val_to_text(&buf, c, inode_k); 2747 bch_err(c, "unreachable inode in check_directory_structure: %s\n%s", 2748 bch2_err_str(ret), buf.buf); 2749 goto out; 2750 } 2751 2752 bch2_trans_iter_exit(trans, &dirent_iter); 2753 2754 ret = darray_push(&path, inode.bi_inum); 2755 if (ret) 2756 return ret; 2757 2758 bch2_trans_iter_exit(trans, &inode_iter); 2759 inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, 2760 SPOS(0, inode.bi_dir, snapshot), 0); 2761 2762 struct bch_inode_unpacked parent_inode; 2763 ret = bkey_err(inode_k) ?: 2764 !bkey_is_inode(inode_k.k) ? 
-BCH_ERR_ENOENT_inode 2765 : bch2_inode_unpack(inode_k, &parent_inode); 2766 if (ret) { 2767 /* Should have been caught in dirents pass */ 2768 bch_err_msg(c, ret, "error looking up parent directory"); 2769 goto out; 2770 } 2771 2772 min_bi_depth = parent_inode.bi_depth; 2773 2774 if (parent_inode.bi_depth < inode.bi_depth && 2775 min_bi_depth < U16_MAX) 2776 break; 2777 2778 inode = parent_inode; 2779 redo_bi_depth = true; 2780 2781 if (darray_find(path, inode.bi_inum)) { 2782 printbuf_reset(&buf); 2783 prt_printf(&buf, "directory structure loop in snapshot %u: ", 2784 snapshot); 2785 2786 ret = bch2_inum_snapshot_to_path(trans, start.offset, start.snapshot, NULL, &buf); 2787 if (ret) 2788 goto out; 2789 2790 if (c->opts.verbose) { 2791 prt_newline(&buf); 2792 darray_for_each(path, i) 2793 prt_printf(&buf, "%llu ", *i); 2794 } 2795 2796 if (fsck_err(trans, dir_loop, "%s", buf.buf)) { 2797 ret = remove_backpointer(trans, &inode); 2798 bch_err_msg(c, ret, "removing dirent"); 2799 if (ret) 2800 goto out; 2801 2802 ret = reattach_inode(trans, &inode); 2803 bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); 2804 } 2805 2806 goto out; 2807 } 2808 } 2809 2810 if (inode.bi_subvol) 2811 min_bi_depth = 0; 2812 2813 if (redo_bi_depth) 2814 ret = bch2_bi_depth_renumber(trans, &path, snapshot, min_bi_depth); 2815 out: 2816 fsck_err: 2817 bch2_trans_iter_exit(trans, &inode_iter); 2818 darray_exit(&path); 2819 printbuf_exit(&buf); 2820 bch_err_fn(c, ret); 2821 return ret; 2822 } 2823 2824 /* 2825 * Check for loops in the directory structure: all other connectivity issues 2826 * have been fixed by prior passes 2827 */ 2828 int bch2_check_directory_structure(struct bch_fs *c) 2829 { 2830 int ret = bch2_trans_run(c, 2831 for_each_btree_key_reverse_commit(trans, iter, BTREE_ID_inodes, POS_MIN, 2832 BTREE_ITER_intent| 2833 BTREE_ITER_prefetch| 2834 BTREE_ITER_all_snapshots, k, 2835 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ 2836 if (!S_ISDIR(bkey_inode_mode(k))) 2837 
continue; 2838 2839 if (bch2_inode_flags(k) & BCH_INODE_unlinked) 2840 continue; 2841 2842 check_path_loop(trans, k); 2843 }))); 2844 2845 bch_err_fn(c, ret); 2846 return ret; 2847 } 2848 2849 struct nlink_table { 2850 size_t nr; 2851 size_t size; 2852 2853 struct nlink { 2854 u64 inum; 2855 u32 snapshot; 2856 u32 count; 2857 } *d; 2858 }; 2859 2860 static int add_nlink(struct bch_fs *c, struct nlink_table *t, 2861 u64 inum, u32 snapshot) 2862 { 2863 if (t->nr == t->size) { 2864 size_t new_size = max_t(size_t, 128UL, t->size * 2); 2865 void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL); 2866 2867 if (!d) { 2868 bch_err(c, "fsck: error allocating memory for nlink_table, size %zu", 2869 new_size); 2870 return bch_err_throw(c, ENOMEM_fsck_add_nlink); 2871 } 2872 2873 if (t->d) 2874 memcpy(d, t->d, t->size * sizeof(t->d[0])); 2875 kvfree(t->d); 2876 2877 t->d = d; 2878 t->size = new_size; 2879 } 2880 2881 2882 t->d[t->nr++] = (struct nlink) { 2883 .inum = inum, 2884 .snapshot = snapshot, 2885 }; 2886 2887 return 0; 2888 } 2889 2890 static int nlink_cmp(const void *_l, const void *_r) 2891 { 2892 const struct nlink *l = _l; 2893 const struct nlink *r = _r; 2894 2895 return cmp_int(l->inum, r->inum); 2896 } 2897 2898 static void inc_link(struct bch_fs *c, struct snapshots_seen *s, 2899 struct nlink_table *links, 2900 u64 range_start, u64 range_end, u64 inum, u32 snapshot) 2901 { 2902 struct nlink *link, key = { 2903 .inum = inum, .snapshot = U32_MAX, 2904 }; 2905 2906 if (inum < range_start || inum >= range_end) 2907 return; 2908 2909 link = __inline_bsearch(&key, links->d, links->nr, 2910 sizeof(links->d[0]), nlink_cmp); 2911 if (!link) 2912 return; 2913 2914 while (link > links->d && link[0].inum == link[-1].inum) 2915 --link; 2916 2917 for (; link < links->d + links->nr && link->inum == inum; link++) 2918 if (ref_visible(c, s, snapshot, link->snapshot)) { 2919 link->count++; 2920 if (link->snapshot >= snapshot) 2921 break; 2922 } 2923 } 2924 2925 
noinline_for_stack 2926 static int check_nlinks_find_hardlinks(struct bch_fs *c, 2927 struct nlink_table *t, 2928 u64 start, u64 *end) 2929 { 2930 int ret = bch2_trans_run(c, 2931 for_each_btree_key(trans, iter, BTREE_ID_inodes, 2932 POS(0, start), 2933 BTREE_ITER_intent| 2934 BTREE_ITER_prefetch| 2935 BTREE_ITER_all_snapshots, k, ({ 2936 if (!bkey_is_inode(k.k)) 2937 continue; 2938 2939 /* Should never fail, checked by bch2_inode_invalid: */ 2940 struct bch_inode_unpacked u; 2941 _ret3 = bch2_inode_unpack(k, &u); 2942 if (_ret3) 2943 break; 2944 2945 /* 2946 * Backpointer and directory structure checks are sufficient for 2947 * directories, since they can't have hardlinks: 2948 */ 2949 if (S_ISDIR(u.bi_mode)) 2950 continue; 2951 2952 /* 2953 * Previous passes ensured that bi_nlink is nonzero if 2954 * it had multiple hardlinks: 2955 */ 2956 if (!u.bi_nlink) 2957 continue; 2958 2959 ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot); 2960 if (ret) { 2961 *end = k.k->p.offset; 2962 ret = 0; 2963 break; 2964 } 2965 0; 2966 }))); 2967 2968 bch_err_fn(c, ret); 2969 return ret; 2970 } 2971 2972 noinline_for_stack 2973 static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links, 2974 u64 range_start, u64 range_end) 2975 { 2976 struct snapshots_seen s; 2977 2978 snapshots_seen_init(&s); 2979 2980 int ret = bch2_trans_run(c, 2981 for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN, 2982 BTREE_ITER_intent| 2983 BTREE_ITER_prefetch| 2984 BTREE_ITER_all_snapshots, k, ({ 2985 ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p); 2986 if (ret) 2987 break; 2988 2989 if (k.k->type == KEY_TYPE_dirent) { 2990 struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); 2991 2992 if (d.v->d_type != DT_DIR && 2993 d.v->d_type != DT_SUBVOL) 2994 inc_link(c, &s, links, range_start, range_end, 2995 le64_to_cpu(d.v->d_inum), d.k->p.snapshot); 2996 } 2997 0; 2998 }))); 2999 3000 snapshots_seen_exit(&s); 3001 3002 bch_err_fn(c, ret); 3003 return ret; 3004 } 3005 
3006 static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter, 3007 struct bkey_s_c k, 3008 struct nlink_table *links, 3009 size_t *idx, u64 range_end) 3010 { 3011 struct bch_inode_unpacked u; 3012 struct nlink *link = &links->d[*idx]; 3013 int ret = 0; 3014 3015 if (k.k->p.offset >= range_end) 3016 return 1; 3017 3018 if (!bkey_is_inode(k.k)) 3019 return 0; 3020 3021 ret = bch2_inode_unpack(k, &u); 3022 if (ret) 3023 return ret; 3024 3025 if (S_ISDIR(u.bi_mode)) 3026 return 0; 3027 3028 if (!u.bi_nlink) 3029 return 0; 3030 3031 while ((cmp_int(link->inum, k.k->p.offset) ?: 3032 cmp_int(link->snapshot, k.k->p.snapshot)) < 0) { 3033 BUG_ON(*idx == links->nr); 3034 link = &links->d[++*idx]; 3035 } 3036 3037 if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, 3038 trans, inode_wrong_nlink, 3039 "inode %llu type %s has wrong i_nlink (%u, should be %u)", 3040 u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], 3041 bch2_inode_nlink_get(&u), link->count)) { 3042 bch2_inode_nlink_set(&u, link->count); 3043 ret = __bch2_fsck_write_inode(trans, &u); 3044 } 3045 fsck_err: 3046 return ret; 3047 } 3048 3049 noinline_for_stack 3050 static int check_nlinks_update_hardlinks(struct bch_fs *c, 3051 struct nlink_table *links, 3052 u64 range_start, u64 range_end) 3053 { 3054 size_t idx = 0; 3055 3056 int ret = bch2_trans_run(c, 3057 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, 3058 POS(0, range_start), 3059 BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, 3060 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 3061 check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end))); 3062 if (ret < 0) { 3063 bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret)); 3064 return ret; 3065 } 3066 3067 return 0; 3068 } 3069 3070 int bch2_check_nlinks(struct bch_fs *c) 3071 { 3072 struct nlink_table links = { 0 }; 3073 u64 this_iter_range_start, next_iter_range_start = 0; 3074 int ret = 0; 3075 3076 do { 3077 
this_iter_range_start = next_iter_range_start; 3078 next_iter_range_start = U64_MAX; 3079 3080 ret = check_nlinks_find_hardlinks(c, &links, 3081 this_iter_range_start, 3082 &next_iter_range_start); 3083 3084 ret = check_nlinks_walk_dirents(c, &links, 3085 this_iter_range_start, 3086 next_iter_range_start); 3087 if (ret) 3088 break; 3089 3090 ret = check_nlinks_update_hardlinks(c, &links, 3091 this_iter_range_start, 3092 next_iter_range_start); 3093 if (ret) 3094 break; 3095 3096 links.nr = 0; 3097 } while (next_iter_range_start != U64_MAX); 3098 3099 kvfree(links.d); 3100 bch_err_fn(c, ret); 3101 return ret; 3102 } 3103 3104 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, 3105 struct bkey_s_c k) 3106 { 3107 struct bkey_s_c_reflink_p p; 3108 struct bkey_i_reflink_p *u; 3109 3110 if (k.k->type != KEY_TYPE_reflink_p) 3111 return 0; 3112 3113 p = bkey_s_c_to_reflink_p(k); 3114 3115 if (!p.v->front_pad && !p.v->back_pad) 3116 return 0; 3117 3118 u = bch2_trans_kmalloc(trans, sizeof(*u)); 3119 int ret = PTR_ERR_OR_ZERO(u); 3120 if (ret) 3121 return ret; 3122 3123 bkey_reassemble(&u->k_i, k); 3124 u->v.front_pad = 0; 3125 u->v.back_pad = 0; 3126 3127 return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun); 3128 } 3129 3130 int bch2_fix_reflink_p(struct bch_fs *c) 3131 { 3132 if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) 3133 return 0; 3134 3135 int ret = bch2_trans_run(c, 3136 for_each_btree_key_commit(trans, iter, 3137 BTREE_ID_extents, POS_MIN, 3138 BTREE_ITER_intent|BTREE_ITER_prefetch| 3139 BTREE_ITER_all_snapshots, k, 3140 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 3141 fix_reflink_p_key(trans, &iter, k))); 3142 bch_err_fn(c, ret); 3143 return ret; 3144 } 3145 3146 #ifndef NO_BCACHEFS_CHARDEV 3147 3148 struct fsck_thread { 3149 struct thread_with_stdio thr; 3150 struct bch_fs *c; 3151 struct bch_opts opts; 3152 }; 3153 3154 static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr) 3155 { 3156 struct 
fsck_thread *thr = container_of(_thr, struct fsck_thread, thr); 3157 kfree(thr); 3158 } 3159 3160 static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) 3161 { 3162 struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); 3163 struct bch_fs *c = thr->c; 3164 3165 int ret = PTR_ERR_OR_ZERO(c); 3166 if (ret) 3167 return ret; 3168 3169 ret = bch2_fs_start(thr->c); 3170 if (ret) 3171 goto err; 3172 3173 if (test_bit(BCH_FS_errors_fixed, &c->flags)) { 3174 bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name); 3175 ret |= 1; 3176 } 3177 if (test_bit(BCH_FS_error, &c->flags)) { 3178 bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name); 3179 ret |= 4; 3180 } 3181 err: 3182 bch2_fs_stop(c); 3183 return ret; 3184 } 3185 3186 static const struct thread_with_stdio_ops bch2_offline_fsck_ops = { 3187 .exit = bch2_fsck_thread_exit, 3188 .fn = bch2_fsck_offline_thread_fn, 3189 }; 3190 3191 long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) 3192 { 3193 struct bch_ioctl_fsck_offline arg; 3194 struct fsck_thread *thr = NULL; 3195 darray_const_str devs = {}; 3196 long ret = 0; 3197 3198 if (copy_from_user(&arg, user_arg, sizeof(arg))) 3199 return -EFAULT; 3200 3201 if (arg.flags) 3202 return -EINVAL; 3203 3204 if (!capable(CAP_SYS_ADMIN)) 3205 return -EPERM; 3206 3207 for (size_t i = 0; i < arg.nr_devs; i++) { 3208 u64 dev_u64; 3209 ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64)); 3210 if (ret) 3211 goto err; 3212 3213 char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX); 3214 ret = PTR_ERR_OR_ZERO(dev_str); 3215 if (ret) 3216 goto err; 3217 3218 ret = darray_push(&devs, dev_str); 3219 if (ret) { 3220 kfree(dev_str); 3221 goto err; 3222 } 3223 } 3224 3225 thr = kzalloc(sizeof(*thr), GFP_KERNEL); 3226 if (!thr) { 3227 ret = -ENOMEM; 3228 goto err; 3229 } 3230 3231 thr->opts = bch2_opts_empty(); 3232 3233 if (arg.opts) { 
3234 char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); 3235 ret = PTR_ERR_OR_ZERO(optstr) ?: 3236 bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr, false); 3237 if (!IS_ERR(optstr)) 3238 kfree(optstr); 3239 3240 if (ret) 3241 goto err; 3242 } 3243 3244 opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); 3245 opt_set(thr->opts, read_only, 1); 3246 opt_set(thr->opts, ratelimit_errors, 0); 3247 3248 /* We need request_key() to be called before we punt to kthread: */ 3249 opt_set(thr->opts, nostart, true); 3250 3251 bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops); 3252 3253 thr->c = bch2_fs_open(&devs, &thr->opts); 3254 3255 if (!IS_ERR(thr->c) && 3256 thr->c->opts.errors == BCH_ON_ERROR_panic) 3257 thr->c->opts.errors = BCH_ON_ERROR_ro; 3258 3259 ret = __bch2_run_thread_with_stdio(&thr->thr); 3260 out: 3261 darray_for_each(devs, i) 3262 kfree(*i); 3263 darray_exit(&devs); 3264 return ret; 3265 err: 3266 if (thr) 3267 bch2_fsck_thread_exit(&thr->thr); 3268 pr_err("ret %s", bch2_err_str(ret)); 3269 goto out; 3270 } 3271 3272 static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) 3273 { 3274 struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); 3275 struct bch_fs *c = thr->c; 3276 3277 c->stdio_filter = current; 3278 c->stdio = &thr->thr.stdio; 3279 3280 /* 3281 * XXX: can we figure out a way to do this without mucking with c->opts? 
3282 */ 3283 unsigned old_fix_errors = c->opts.fix_errors; 3284 if (opt_defined(thr->opts, fix_errors)) 3285 c->opts.fix_errors = thr->opts.fix_errors; 3286 else 3287 c->opts.fix_errors = FSCK_FIX_ask; 3288 3289 c->opts.fsck = true; 3290 set_bit(BCH_FS_in_fsck, &c->flags); 3291 3292 int ret = bch2_run_online_recovery_passes(c, ~0ULL); 3293 3294 clear_bit(BCH_FS_in_fsck, &c->flags); 3295 bch_err_fn(c, ret); 3296 3297 c->stdio = NULL; 3298 c->stdio_filter = NULL; 3299 c->opts.fix_errors = old_fix_errors; 3300 3301 up(&c->recovery.run_lock); 3302 bch2_ro_ref_put(c); 3303 return ret; 3304 } 3305 3306 static const struct thread_with_stdio_ops bch2_online_fsck_ops = { 3307 .exit = bch2_fsck_thread_exit, 3308 .fn = bch2_fsck_online_thread_fn, 3309 }; 3310 3311 long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg) 3312 { 3313 struct fsck_thread *thr = NULL; 3314 long ret = 0; 3315 3316 if (arg.flags) 3317 return -EINVAL; 3318 3319 if (!capable(CAP_SYS_ADMIN)) 3320 return -EPERM; 3321 3322 if (!bch2_ro_ref_tryget(c)) 3323 return -EROFS; 3324 3325 if (down_trylock(&c->recovery.run_lock)) { 3326 bch2_ro_ref_put(c); 3327 return -EAGAIN; 3328 } 3329 3330 thr = kzalloc(sizeof(*thr), GFP_KERNEL); 3331 if (!thr) { 3332 ret = -ENOMEM; 3333 goto err; 3334 } 3335 3336 thr->c = c; 3337 thr->opts = bch2_opts_empty(); 3338 3339 if (arg.opts) { 3340 char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); 3341 3342 ret = PTR_ERR_OR_ZERO(optstr) ?: 3343 bch2_parse_mount_opts(c, &thr->opts, NULL, optstr, false); 3344 if (!IS_ERR(optstr)) 3345 kfree(optstr); 3346 3347 if (ret) 3348 goto err; 3349 } 3350 3351 ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops); 3352 err: 3353 if (ret < 0) { 3354 bch_err_fn(c, ret); 3355 if (thr) 3356 bch2_fsck_thread_exit(&thr->thr); 3357 up(&c->recovery.run_lock); 3358 bch2_ro_ref_put(c); 3359 } 3360 return ret; 3361 } 3362 3363 #endif /* NO_BCACHEFS_CHARDEV */ 3364