// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bcachefs_ioctl.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_update.h"
#include "buckets.h"
#include "darray.h"
#include "dirent.h"
#include "error.h"
#include "fs.h"
#include "fsck.h"
#include "inode.h"
#include "io_misc.h"
#include "keylist.h"
#include "namei.h"
#include "recovery_passes.h"
#include "snapshot.h"
#include "super.h"
#include "thread_with_file.h"
#include "xattr.h"

#include <linux/bsearch.h>
#include <linux/dcache.h> /* struct qstr */

/*
 * Check that dirent @d points back at @inode: by child subvolume id for
 * DT_SUBVOL dirents, by inum otherwise. Returns 0 on match; no message is
 * logged on mismatch.
 */
static int dirent_points_to_inode_nowarn(struct bch_fs *c,
					 struct bkey_s_c_dirent d,
					 struct bch_inode_unpacked *inode)
{
	if (d.v->d_type == DT_SUBVOL
	    ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
	    : le64_to_cpu(d.v->d_inum) == inode->bi_inum)
		return 0;
	return bch_err_throw(c, ENOENT_dirent_doesnt_match_inode);
}

/* Format a "dirent does not point back at inode" message into @out: */
static void dirent_inode_mismatch_msg(struct printbuf *out,
				      struct bch_fs *c,
				      struct bkey_s_c_dirent dirent,
				      struct bch_inode_unpacked *inode)
{
	prt_str(out, "inode points to dirent that does not point back:");
	prt_newline(out);
	bch2_bkey_val_to_text(out, c, dirent.s_c);
	prt_newline(out);
	bch2_inode_unpacked_to_text(out, inode);
}

/* As dirent_points_to_inode_nowarn(), but warns on mismatch: */
static int dirent_points_to_inode(struct bch_fs *c,
				  struct bkey_s_c_dirent dirent,
				  struct bch_inode_unpacked *inode)
{
	int ret = dirent_points_to_inode_nowarn(c, dirent, inode);
	if (ret) {
		struct printbuf buf = PRINTBUF;
		dirent_inode_mismatch_msg(&buf, c, dirent, inode);
		bch_warn(c, "%s", buf.buf);
		printbuf_exit(&buf);
	}
	return ret;
}

/*
 * Sum the sizes of all allocated extents for @inum in @snapshot.
 *
 * XXX: this is handling transaction restarts without returning
 * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
 */
static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
				    u32 snapshot)
{
	u64 sectors = 0;

	int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
				SPOS(inum, 0, snapshot),
				POS(inum, U64_MAX),
				0, k, ({
		if (bkey_extent_is_allocation(k.k))
			sectors += k.k->size;
		0;
	}));

	return ret ?: sectors;
}

/* Count DT_DIR dirents in directory @inum, in @snapshot: */
static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
			      u32 snapshot)
{
	u64 subdirs = 0;

	int ret = for_each_btree_key_max(trans, iter, BTREE_ID_dirents,
				SPOS(inum, 0, snapshot),
				POS(inum, U64_MAX),
				0, k, ({
		if (k.k->type == KEY_TYPE_dirent &&
		    bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
			subdirs++;
		0;
	}));

	return ret ?: subdirs;
}

/* Look up the snapshot id and root inode number of subvolume @subvol: */
static int subvol_lookup(struct btree_trans *trans, u32 subvol,
			 u32 *snapshot, u64 *inum)
{
	struct bch_subvolume s;
	int ret = bch2_subvolume_get(trans, subvol, false, &s);

	*snapshot = le32_to_cpu(s.snapshot);
	*inum = le64_to_cpu(s.inode);
	return ret;
}

/*
 * Hash-lookup dirent @name in directory @dir at @snapshot; on success returns
 * the target inum in @target and the dirent type in @type.
 */
static int lookup_dirent_in_snapshot(struct btree_trans *trans,
				     struct bch_hash_info hash_info,
				     subvol_inum dir, struct qstr *name,
				     u64 *target, unsigned *type, u32 snapshot)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
							 &hash_info, dir, name, 0, snapshot);
	int ret = bkey_err(k);
	if (ret)
		return ret;

	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
	*target = le64_to_cpu(d.v->d_inum);
	*type = d.v->d_type;
	bch2_trans_iter_exit(trans, &iter);
	return 0;
}

/*
 * Find any subvolume associated with a tree of snapshots
 * We can't rely on master_subvol - it might have been deleted.
 */
static int find_snapshot_tree_subvol(struct btree_trans *trans,
				     u32 tree_id, u32 *subvol)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) {
		if (k.k->type != KEY_TYPE_snapshot)
			continue;

		struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
		if (le32_to_cpu(s.v->tree) != tree_id)
			continue;

		/* first snapshot in @tree_id that still has a subvolume: */
		if (s.v->subvol) {
			*subvol = le32_to_cpu(s.v->subvol);
			goto found;
		}
	}
	ret = bch_err_throw(trans->c, ENOENT_no_snapshot_tree_subvol);
found:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/* Get lost+found, create if it doesn't exist: */
static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
			    struct bch_inode_unpacked *lostfound,
			    u64 reattaching_inum)
{
	struct bch_fs *c = trans->c;
	struct qstr lostfound_str = QSTR("lost+found");
	struct btree_iter lostfound_iter = {};
	u64 inum = 0;
	unsigned d_type = 0;
	int ret;

	struct bch_snapshot_tree st;
	ret = bch2_snapshot_tree_lookup(trans,
					bch2_snapshot_tree(c, snapshot), &st);
	if (ret)
		return ret;

	u32 subvolid;
	ret = find_snapshot_tree_subvol(trans,
				bch2_snapshot_tree(c, snapshot), &subvolid);
	bch_err_msg(c, ret, "finding subvol associated with snapshot tree %u",
		    bch2_snapshot_tree(c, snapshot));
	if (ret)
		return ret;

	struct bch_subvolume subvol;
	ret = bch2_subvolume_get(trans, subvolid, false, &subvol);
	bch_err_msg(c, ret, "looking up subvol %u for snapshot %u", subvolid, snapshot);
	if (ret)
		return ret;

	/*
	 * Subvolume has no root inode: point it at the inode being reattached
	 * so the subvolume is self-consistent again.
	 */
	if (!subvol.inode) {
		struct btree_iter iter;
		struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter,
				BTREE_ID_subvolumes, POS(0, subvolid),
				0, subvolume);
		ret = PTR_ERR_OR_ZERO(subvol);
		if (ret)
			return ret;

		subvol->v.inode = cpu_to_le64(reattaching_inum);
		bch2_trans_iter_exit(trans, &iter);
	}

	subvol_inum root_inum = {
		.subvol = subvolid,
		.inum = le64_to_cpu(subvol.inode)
	};

	struct bch_inode_unpacked root_inode;
	struct bch_hash_info root_hash_info;
	ret = bch2_inode_find_by_inum_snapshot(trans, root_inum.inum, snapshot, &root_inode, 0);
	bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
		    root_inum.inum, subvolid);
	if (ret)
		return ret;

	root_hash_info = bch2_hash_info_init(c, &root_inode);

	ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
					&lostfound_str, &inum, &d_type, snapshot);
	if (bch2_err_matches(ret, ENOENT))
		goto create_lostfound;

	bch_err_fn(c, ret);
	if (ret)
		return ret;

	if (d_type != DT_DIR) {
		bch_err(c, "error looking up lost+found: not a directory");
		return bch_err_throw(c, ENOENT_not_directory);
	}

	/*
	 * The bch2_check_dirents pass has already run, dangling dirents
	 * shouldn't exist here:
	 */
	ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, lostfound, 0);
	bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
		    inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
	return ret;

create_lostfound:
	/*
	 * we always create lost+found in the root snapshot; we don't want
	 * different branches of the snapshot tree to have different lost+found
	 */
	snapshot = le32_to_cpu(st.root_snapshot);
	/*
	 * XXX: we could have a nicer log message here if we had a nice way to
	 * walk backpointers to print a path
	 */
	struct printbuf path = PRINTBUF;
	ret = bch2_inum_to_path(trans, root_inum, &path);
	if (ret)
		goto err;

	bch_notice(c, "creating %s/lost+found in subvol %llu snapshot %u",
		   path.buf, root_inum.subvol, snapshot);
	printbuf_exit(&path);

	u64 now = bch2_current_time(c);
	u64 cpu = raw_smp_processor_id();

	bch2_inode_init_early(c, lostfound);
	bch2_inode_init_late(c, lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
	lostfound->bi_dir = root_inode.bi_inum;
	lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot);

	root_inode.bi_nlink++;

	ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
	if (ret)
		goto err;

	bch2_btree_iter_set_snapshot(trans, &lostfound_iter, snapshot);
	ret = bch2_btree_iter_traverse(trans, &lostfound_iter);
	if (ret)
		goto err;

	ret = bch2_dirent_create_snapshot(trans,
			0, root_inode.bi_inum, snapshot, &root_hash_info,
			mode_to_type(lostfound->bi_mode),
			&lostfound_str,
			lostfound->bi_inum,
			&lostfound->bi_dir_offset,
			BTREE_UPDATE_internal_snapshot_node|
			STR_HASH_must_create) ?:
		bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
				       BTREE_UPDATE_internal_snapshot_node);
err:
	bch_err_msg(c, ret, "creating lost+found");
	bch2_trans_iter_exit(trans, &lostfound_iter);
	return ret;
}

/* Returns true if @inode is disconnected and should be reattached to lost+found: */
static inline bool inode_should_reattach(struct bch_inode_unpacked *inode)
{
	if (inode->bi_inum == BCACHEFS_ROOT_INO &&
	    inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)
		return false;

	/*
	 * Subvolume roots are special: older versions of subvolume roots may be
	 * disconnected, it's only the newest version that matters.
	 *
	 * We only keep a single dirent pointing to a subvolume root, i.e.
	 * older versions of snapshots will not have a different dirent pointing
	 * to the same subvolume root.
	 *
	 * This is because dirents that point to subvolumes are only visible in
	 * the parent subvolume - versioning is not needed - and keeping them
	 * around would break fsck, because when we're crossing subvolumes we
	 * don't have a consistent snapshot ID to check the inode <-> dirent
	 * relationships.
	 *
	 * Thus, a subvolume root that's been renamed after a snapshot will have
	 * a disconnected older version - that's expected.
	 *
	 * Note that taking a snapshot always updates the root inode (to update
	 * the dirent backpointer), so a subvolume root inode with
	 * BCH_INODE_has_child_snapshot is never visible.
	 */
	if (inode->bi_subvol &&
	    (inode->bi_flags & BCH_INODE_has_child_snapshot))
		return false;

	return !bch2_inode_has_backpointer(inode) &&
	       !(inode->bi_flags & BCH_INODE_unlinked);
}

/*
 * Whiteout the dirent at @d_pos in @snapshot, but only if the key we find is
 * exactly at @d_pos (i.e. not an ancestor-snapshot version):
 */
static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents,
					       SPOS(d_pos.inode, d_pos.offset, snapshot),
					       BTREE_ITER_intent|
					       BTREE_ITER_with_updates);
	int ret = bkey_err(k);
	if (ret)
		return ret;

	if (bpos_eq(k.k->p, d_pos)) {
		/*
		 * delete_at() doesn't work because the update path doesn't
		 * internally use BTREE_ITER_with_updates yet
		 */
		struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
		ret = PTR_ERR_OR_ZERO(k);
		if (ret)
			goto err;

		bkey_init(&k->k);
		k->k.type = KEY_TYPE_whiteout;
		k->k.p = iter.pos;
		ret = bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node);
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Reattach a disconnected @inode under lost+found: create a dirent for it
 * ("subvol-%u" for subvolume roots, the inum otherwise), update the inode's
 * backpointer fields, and fix up versions of the inode in child snapshots.
 */
static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
{
	struct bch_fs *c = trans->c;
	struct bch_inode_unpacked lostfound;
	char name_buf[20];
	int ret;

	u32 dirent_snapshot = inode->bi_snapshot;
	if (inode->bi_subvol) {
		/* subvolume roots get reparented under the root subvolume: */
		inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;

		struct btree_iter subvol_iter;
		struct bkey_i_subvolume *subvol =
			bch2_bkey_get_mut_typed(trans, &subvol_iter,
						BTREE_ID_subvolumes, POS(0, inode->bi_subvol),
						0, subvolume);
		ret = PTR_ERR_OR_ZERO(subvol);
		if (ret)
			return ret;

		subvol->v.fs_path_parent = BCACHEFS_ROOT_SUBVOL;
		bch2_trans_iter_exit(trans, &subvol_iter);

		u64 root_inum;
		ret = subvol_lookup(trans, inode->bi_parent_subvol,
				    &dirent_snapshot, &root_inum);
		if (ret)
			return ret;

		snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
	} else {
		snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
	}

	ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
	if (ret)
		return ret;

	bch_verbose(c, "got lostfound inum %llu", lostfound.bi_inum);

	lostfound.bi_nlink += S_ISDIR(inode->bi_mode);

	/* ensure lost+found inode is also present in inode snapshot */
	if (!inode->bi_subvol) {
		BUG_ON(!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, lostfound.bi_snapshot));
		lostfound.bi_snapshot = inode->bi_snapshot;
	}

	ret = __bch2_fsck_write_inode(trans, &lostfound);
	if (ret)
		return ret;

	struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound);
	struct qstr name = QSTR(name_buf);

	inode->bi_dir = lostfound.bi_inum;

	ret = bch2_dirent_create_snapshot(trans,
			inode->bi_parent_subvol, lostfound.bi_inum,
			dirent_snapshot,
			&dir_hash,
			inode_d_type(inode),
			&name,
			inode->bi_subvol ?: inode->bi_inum,
			&inode->bi_dir_offset,
			BTREE_UPDATE_internal_snapshot_node|
			STR_HASH_must_create);
	if (ret) {
		bch_err_msg(c, ret, "error creating dirent");
		return ret;
	}

	ret = __bch2_fsck_write_inode(trans, inode);
	if (ret)
		return ret;

	{
		CLASS(printbuf, buf)();
		ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum,
						 inode->bi_snapshot, NULL, &buf);
		if (ret)
			return ret;

		bch_info(c, "reattached at %s", buf.buf);
	}

	/*
	 * Fix up inodes in child snapshots: if they should also be reattached
	 * update the backpointer field, if they should not be we need to emit
	 * whiteouts for the dirent we just created.
	 */
	if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) {
		snapshot_id_list whiteouts_done;
		struct btree_iter iter;
		struct bkey_s_c k;

		darray_init(&whiteouts_done);

		for_each_btree_key_reverse_norestart(trans, iter,
				BTREE_ID_inodes, SPOS(0, inode->bi_inum, inode->bi_snapshot - 1),
				BTREE_ITER_all_snapshots|BTREE_ITER_intent, k, ret) {
			if (k.k->p.offset != inode->bi_inum)
				break;

			if (!bkey_is_inode(k.k) ||
			    !bch2_snapshot_is_ancestor(c, k.k->p.snapshot, inode->bi_snapshot) ||
			    snapshot_list_has_ancestor(c, &whiteouts_done, k.k->p.snapshot))
				continue;

			struct bch_inode_unpacked child_inode;
			ret = bch2_inode_unpack(k, &child_inode);
			if (ret)
				break;

			if (!inode_should_reattach(&child_inode)) {
				/* child version is fine - hide our new dirent from it: */
				ret = maybe_delete_dirent(trans,
							  SPOS(lostfound.bi_inum, inode->bi_dir_offset,
							       dirent_snapshot),
							  k.k->p.snapshot);
				if (ret)
					break;

				ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot);
				if (ret)
					break;
			} else {
				/* child version is also disconnected - share the new dirent: */
				iter.snapshot = k.k->p.snapshot;
				child_inode.bi_dir = inode->bi_dir;
				child_inode.bi_dir_offset = inode->bi_dir_offset;

				ret = bch2_inode_write_flags(trans, &iter, &child_inode,
							     BTREE_UPDATE_internal_snapshot_node);
				if (ret)
					break;
			}
		}
		darray_exit(&whiteouts_done);
		bch2_trans_iter_exit(trans, &iter);
	}

	return ret;
}

/* Fetch the dirent at @pos, with @iter pointing at it: */
static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
						struct btree_iter *iter,
						struct bpos pos)
{
	return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
}

/*
 * Remove the dirent that @inode's backpointer fields (bi_dir/bi_dir_offset)
 * point at, after verifying it points back at @inode:
 */
static int remove_backpointer(struct btree_trans *trans,
			      struct bch_inode_unpacked *inode)
{
	if (!bch2_inode_has_backpointer(inode))
		return 0;

	u32 snapshot = inode->bi_snapshot;

	if
(inode->bi_parent_subvol) {
		int ret = bch2_subvolume_get_snapshot(trans, inode->bi_parent_subvol, &snapshot);
		if (ret)
			return ret;
	}

	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c_dirent d = dirent_get_by_pos(trans, &iter,
			SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot));
	int ret = bkey_err(d) ?:
		dirent_points_to_inode(c, d, inode) ?:
		bch2_fsck_remove_dirent(trans, d.k->p);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Reattach the root inode of subvolume @s to lost+found, removing its stale
 * backpointer dirent first:
 */
static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s)
{
	struct bch_fs *c = trans->c;

	struct bch_inode_unpacked inode;
	int ret = bch2_inode_find_by_inum_trans(trans,
				(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
				&inode);
	if (ret)
		return ret;

	ret = remove_backpointer(trans, &inode);
	if (!bch2_err_matches(ret, ENOENT))
		bch_err_msg(c, ret, "removing dirent");
	if (ret)
		return ret;

	ret = reattach_inode(trans, &inode);
	bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
	return ret;
}

/*
 * Recreate a missing subvolume @subvolid pointing at @snapshotid, creating a
 * root inode if @inum is 0; also fixes up the snapshot and snapshot tree keys
 * to reference the new subvolume.
 */
static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
{
	struct bch_fs *c = trans->c;

	if (!bch2_snapshot_is_leaf(c, snapshotid)) {
		bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
		return bch_err_throw(c, fsck_repair_unimplemented);
	}

	/*
	 * If inum isn't set, that means we're being called from check_dirents,
	 * not check_inodes - the root of this subvolume doesn't exist or we
	 * would have found it there:
	 */
	if (!inum) {
		struct btree_iter inode_iter = {};
		struct bch_inode_unpacked new_inode;
		u64 cpu = raw_smp_processor_id();

		bch2_inode_init_early(c, &new_inode);
		bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);

		new_inode.bi_subvol = subvolid;

		int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
			  bch2_btree_iter_traverse(trans, &inode_iter) ?:
			  bch2_inode_write(trans, &inode_iter, &new_inode);
		bch2_trans_iter_exit(trans, &inode_iter);
		if (ret)
			return ret;

		inum = new_inode.bi_inum;
	}

	bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);

	struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
	int ret = PTR_ERR_OR_ZERO(new_subvol);
	if (ret)
		return ret;

	bkey_subvolume_init(&new_subvol->k_i);
	new_subvol->k.p.offset = subvolid;
	new_subvol->v.snapshot = cpu_to_le32(snapshotid);
	new_subvol->v.inode = cpu_to_le64(inum);
	ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
	if (ret)
		return ret;

	/* point the snapshot back at the new subvolume: */
	struct btree_iter iter;
	struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_snapshots, POS(0, snapshotid),
			0, snapshot);
	ret = PTR_ERR_OR_ZERO(s);
	bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
	if (ret)
		return ret;

	u32 snapshot_tree = le32_to_cpu(s->v.tree);

	s->v.subvol = cpu_to_le32(subvolid);
	SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
	bch2_trans_iter_exit(trans, &iter);

	/* and make it the snapshot tree's master subvol if it had none: */
	struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
			0, snapshot_tree);
	ret = PTR_ERR_OR_ZERO(st);
	bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
	if (ret)
		return ret;

	if (!st->v.master_subvol)
		st->v.master_subvol = cpu_to_le32(subvolid);

	bch2_trans_iter_exit(trans, &iter);
	return 0;
}

/*
 * Recreate a missing inode @inum in @snapshot, inferring mode and size from
 * the btree (@btree) where keys referencing it were found:
 */
static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum)
{
	struct bch_fs *c = trans->c;
	unsigned i_mode = S_IFREG;
	u64 i_size = 0;

	switch (btree) {
	case BTREE_ID_extents: {
		struct btree_iter iter = {};

		/* i_size from the last extent: */
		bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
		struct bkey_s_c k = bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum, 0));
		bch2_trans_iter_exit(trans, &iter);
		int ret = bkey_err(k);
		if (ret)
			return ret;

		i_size = k.k->p.offset << 9;
		break;
	}
	case BTREE_ID_dirents:
		i_mode = S_IFDIR;
		break;
	case BTREE_ID_xattrs:
		break;
	default:
		BUG();
	}

	struct bch_inode_unpacked new_inode;
	bch2_inode_init_early(c, &new_inode);
	bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL);
	new_inode.bi_size = i_size;
	new_inode.bi_inum = inum;
	new_inode.bi_snapshot = snapshot;

	return __bch2_fsck_write_inode(trans, &new_inode);
}

static inline void snapshots_seen_exit(struct snapshots_seen *s)
{
	darray_exit(&s->ids);
}

static inline void snapshots_seen_init(struct snapshots_seen *s)
{
	memset(s, 0, sizeof(*s));
}

/* Insert @id into the sorted seen-list, ignoring duplicates: */
static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
{
	u32 *i;
	__darray_for_each(s->ids, i) {
		if (*i == id)
			return 0;
		if (*i > id)
			break;
	}

	int ret = darray_insert_item(&s->ids, i - s->ids.data, id);
	if (ret)
		bch_err(c, "error reallocating snapshots_seen table (size %zu)",
			s->ids.size);
	return ret;
}

/*
 * Advance the seen-list to @pos: reset it when we move to a new key position,
 * then record @pos.snapshot:
 */
static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
				 enum btree_id btree_id, struct bpos pos)
{
	if (!bkey_eq(s->pos, pos))
		s->ids.nr = 0;
	s->pos = pos;

	return snapshot_list_add_nodup(c, &s->ids, pos.snapshot);
}

/**
 * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
 * and @ancestor hasn't been overwritten in @seen
 *
 * @c: filesystem handle
 * @seen: list of snapshot ids already seen at current position
 * @id: descendent
snapshot id
 * @ancestor: ancestor snapshot id
 *
 * Returns: whether key in @ancestor snapshot is visible in @id snapshot
 */
static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
				    u32 id, u32 ancestor)
{
	EBUG_ON(id > ancestor);

	if (id == ancestor)
		return true;

	if (!bch2_snapshot_is_ancestor(c, id, ancestor))
		return false;

	/*
	 * We know that @id is a descendant of @ancestor, we're checking if
	 * we've seen a key that overwrote @ancestor - i.e. also a descendent of
	 * @ancestor and with @id as a descendent.
	 *
	 * But we already know that we're scanning IDs between @id and @ancestor
	 * numerically, since snapshot ID lists are kept sorted, so if we find
	 * an id that's an ancestor of @id we're done:
	 */
	darray_for_each_reverse(seen->ids, i)
		if (*i != ancestor && bch2_snapshot_is_ancestor(c, id, *i))
			return false;

	return true;
}

/**
 * ref_visible - given a key with snapshot id @src that points to a key with
 * snapshot id @dst, test whether there is some snapshot in which @dst is
 * visible.
 *
 * @c: filesystem handle
 * @s: list of snapshot IDs already seen at @src
 * @src: snapshot ID of src key
 * @dst: snapshot ID of dst key
 * Returns: true if there is some snapshot in which @dst is visible
 *
 * Assumes we're visiting @src keys in natural key order
 */
static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
			u32 src, u32 dst)
{
	return dst <= src
		? key_visible_in_snapshot(c, s, dst, src)
		: bch2_snapshot_is_ancestor(c, src, dst);
}

/* As ref_visible(), but with seen-lists for both keys; symmetric in src/dst: */
static int ref_visible2(struct bch_fs *c,
			u32 src, struct snapshots_seen *src_seen,
			u32 dst, struct snapshots_seen *dst_seen)
{
	if (dst > src) {
		swap(dst, src);
		swap(dst_seen, src_seen);
	}
	return key_visible_in_snapshot(c, src_seen, dst, src);
}

#define for_each_visible_inode(_c, _s, _w, _snapshot, _i)				\
	for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr &&	\
	     (_i)->inode.bi_snapshot <= (_snapshot); _i++)				\
		if (key_visible_in_snapshot(_c, _s, _i->inode.bi_snapshot, _snapshot))

struct inode_walker_entry {
	struct bch_inode_unpacked inode;
	/* entry is a whiteout, not a real inode: */
	bool whiteout;
	u64 count;
	u64 i_size;
};

struct inode_walker {
	bool first_this_inode;
	bool have_inodes;
	bool recalculate_sums;
	struct bpos last_pos;

	/* all snapshot versions of the current inode number: */
	DARRAY(struct inode_walker_entry) inodes;
	snapshot_id_list deletes;
};

static void inode_walker_exit(struct inode_walker *w)
{
	darray_exit(&w->inodes);
	darray_exit(&w->deletes);
}

static struct inode_walker inode_walker_init(void)
{
	return (struct inode_walker) { 0, };
}

/*
 * Append @inode (or a whiteout placeholder carrying just inum/snapshot) to
 * the walker's inode list:
 */
static int add_inode(struct bch_fs *c, struct inode_walker *w,
		     struct bkey_s_c inode)
{
	int ret = darray_push(&w->inodes, ((struct inode_walker_entry) {
		.whiteout = !bkey_is_inode(inode.k),
	}));
	if (ret)
		return ret;

	struct inode_walker_entry *n = &darray_last(w->inodes);
	if (!n->whiteout) {
		return bch2_inode_unpack(inode, &n->inode);
	} else {
		n->inode.bi_inum = inode.k->p.offset;
		n->inode.bi_snapshot = inode.k->p.snapshot;
		return 0;
	}
}

/* Load every snapshot version of inode @inum into @w: */
static int get_inodes_all_snapshots(struct btree_trans *trans,
				    struct inode_walker *w, u64 inum)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	/*
	 * We no longer have inodes for w->last_pos; clear this to avoid
	 * screwing up check_i_sectors/check_subdir_count if we take a
	 * transaction restart here:
	 */
	w->have_inodes = false;
	w->recalculate_sums = false;
	w->inodes.nr = 0;

	for_each_btree_key_max_norestart(trans, iter,
			BTREE_ID_inodes, POS(0, inum), SPOS(0, inum, U32_MAX),
			BTREE_ITER_all_snapshots, k, ret) {
		ret = add_inode(c, w, k);
		if (ret)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		return ret;

	w->first_this_inode = true;
	w->have_inodes = true;
	return 0;
}

/*
 * Collect versions of inode @inum visible from @s->pos.snapshot into @w;
 * snapshots where the inode was deleted go on @w->deletes:
 */
static int get_visible_inodes(struct btree_trans *trans,
			      struct inode_walker *w,
			      struct snapshots_seen *s,
			      u64 inum)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	w->inodes.nr = 0;
	w->deletes.nr = 0;

	for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot),
					     BTREE_ITER_all_snapshots, k, ret) {
		if (k.k->p.offset != inum)
			break;

		if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot))
			continue;

		if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot))
			continue;

		ret = bkey_is_inode(k.k)
			? add_inode(c, w, k)
			: snapshot_list_add(c, &w->deletes, k.k->p.snapshot);
		if (ret)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}

/*
 * Find the inode version that covers @k's snapshot; if only an ancestor
 * version exists, repair by materializing a version (or whiteout) at @k's
 * snapshot, then restart:
 */
static struct inode_walker_entry *
lookup_inode_for_snapshot(struct btree_trans *trans, struct inode_walker *w, struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;

	struct inode_walker_entry *i = darray_find_p(w->inodes, i,
		bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->inode.bi_snapshot));

	if (!i)
		return NULL;

	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (fsck_err_on(k.k->p.snapshot != i->inode.bi_snapshot,
			trans, snapshot_key_missing_inode_snapshot,
			"have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
			"unexpected because we should always update the inode when we update a key in that inode\n"
			"%s",
			w->last_pos.inode, k.k->p.snapshot, i->inode.bi_snapshot,
			(bch2_bkey_val_to_text(&buf, c, k),
			 buf.buf))) {
		if (!i->whiteout) {
			struct bch_inode_unpacked new = i->inode;
			new.bi_snapshot = k.k->p.snapshot;
			ret = __bch2_fsck_write_inode(trans, &new);
		} else {
			struct bkey_i whiteout;
			bkey_init(&whiteout.k);
			whiteout.k.type = KEY_TYPE_whiteout;
			whiteout.k.p = SPOS(0, i->inode.bi_inum, k.k->p.snapshot);
			ret = bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
							  &whiteout,
							  BTREE_UPDATE_internal_snapshot_node);
		}

		if (ret)
			goto fsck_err;

		ret = bch2_trans_commit(trans, NULL, NULL, 0);
		if (ret)
			goto fsck_err;

		/* insert the new version into the walker list, keeping it sorted: */
		struct inode_walker_entry new_entry = *i;

		new_entry.inode.bi_snapshot = k.k->p.snapshot;
		new_entry.count = 0;
		new_entry.i_size = 0;

		while (i > w->inodes.data && i[-1].inode.bi_snapshot > k.k->p.snapshot)
			--i;

		size_t pos = i - w->inodes.data;
		ret = darray_insert_item(&w->inodes, pos, new_entry);
		if (ret)
			goto fsck_err;

		ret = bch_err_throw(c,
transaction_restart_nested);
		goto fsck_err;
	}

	printbuf_exit(&buf);
	return i;
fsck_err:
	printbuf_exit(&buf);
	return ERR_PTR(ret);
}

/*
 * Refresh @w for @k's inode number if needed, then look up the entry covering
 * @k's snapshot:
 */
static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
					     struct inode_walker *w,
					     struct bkey_s_c k)
{
	if (w->last_pos.inode != k.k->p.inode) {
		int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
		if (ret)
			return ERR_PTR(ret);
	}

	w->last_pos = k.k->p;

	return lookup_inode_for_snapshot(trans, w, k);
}

/*
 * Prefer to delete the first one, since that will be the one at the wrong
 * offset:
 * return value: 0 -> delete k1, 1 -> delete k2
 */
int bch2_fsck_update_backpointers(struct btree_trans *trans,
				  struct snapshots_seen *s,
				  const struct bch_hash_desc desc,
				  struct bch_hash_info *hash_info,
				  struct bkey_i *new)
{
	if (new->k.type != KEY_TYPE_dirent)
		return 0;

	struct bkey_i_dirent *d = bkey_i_to_dirent(new);
	struct inode_walker target = inode_walker_init();
	int ret = 0;

	if (d->v.d_type == DT_SUBVOL) {
		bch_err(trans->c, "%s does not support DT_SUBVOL", __func__);
		ret = -BCH_ERR_fsck_repair_unimplemented;
	} else {
		/* update bi_dir_offset in every visible target inode version: */
		ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum));
		if (ret)
			goto err;

		darray_for_each(target.inodes, i) {
			i->inode.bi_dir_offset = d->k.p.offset;
			ret = __bch2_fsck_write_inode(trans, &i->inode);
			if (ret)
				goto err;
		}
	}
err:
	inode_walker_exit(&target);
	return ret;
}

/*
 * Fetch the dirent @inode's backpointer points at; for inodes in a subvolume,
 * @snapshot is first resolved via the parent subvolume:
 */
static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
					       struct btree_iter *iter,
					       struct bch_inode_unpacked *inode,
					       u32 *snapshot)
{
	if (inode->bi_subvol) {
		u64 inum;
		int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
		if (ret)
			return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) });
	}

	return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
}

/* Returns 1 if @p is on the deleted-inodes list, 0 if not, negative on error: */
static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
	int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Verify @inode's backpointer dirent exists and points back at it; on
 * mismatch, clear the backpointer fields and set *write_inode:
 */
static int check_inode_dirent_inode(struct btree_trans *trans,
				    struct bch_inode_unpacked *inode,
				    bool *write_inode)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;

	u32 inode_snapshot = inode->bi_snapshot;
	struct btree_iter dirent_iter = {};
	struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
	int ret = bkey_err(d);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if ((ret || dirent_points_to_inode_nowarn(c, d, inode)) &&
	    inode->bi_subvol &&
	    (inode->bi_flags & BCH_INODE_has_child_snapshot)) {
		/* Older version of a renamed subvolume root: we won't have a
		 * correct dirent for it. That's expected, see
		 * inode_should_reattach().
		 *
		 * We don't clear the backpointer field when doing the rename
		 * because there might be arbitrarily many versions in older
		 * snapshots.
		 */
		inode->bi_dir = 0;
		inode->bi_dir_offset = 0;
		*write_inode = true;
		goto out;
	}

	if (fsck_err_on(ret,
			trans, inode_points_to_missing_dirent,
			"inode points to missing dirent\n%s",
			(bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) ||
	    fsck_err_on(!ret && dirent_points_to_inode_nowarn(c, d, inode),
			trans, inode_points_to_wrong_dirent,
			"%s",
			(printbuf_reset(&buf),
			 dirent_inode_mismatch_msg(&buf, c, d, inode),
			 buf.buf))) {
		/*
		 * We just clear the backpointer fields for now. If we find a
		 * dirent that points to this inode in check_dirents(), we'll
		 * update it then; then when we get to check_path() if the
		 * backpointer is still 0 we'll reattach it.
		 */
		inode->bi_dir = 0;
		inode->bi_dir_offset = 0;
		*write_inode = true;
	}
out:
	ret = 0;
fsck_err:
	bch2_trans_iter_exit(trans, &dirent_iter);
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}

static int check_inode(struct btree_trans *trans,
		       struct btree_iter *iter,
		       struct bkey_s_c k,
		       struct bch_inode_unpacked *snapshot_root,
		       struct snapshots_seen *s)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	struct bch_inode_unpacked u;
	bool do_update = false;
	int ret;

	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret < 0)
		goto err;
	if (ret)
		return 0;

	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
	if (ret)
		goto err;

	if (!bkey_is_inode(k.k))
		return 0;

	ret = bch2_inode_unpack(k, &u);
	if (ret)
		goto err;

	if (snapshot_root->bi_inum != u.bi_inum) {
		ret = bch2_inode_find_snapshot_root(trans, u.bi_inum, snapshot_root);
		if (ret)
			goto err;
	}

	if (u.bi_hash_seed != snapshot_root->bi_hash_seed ||
	    INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root)) {
		ret = bch2_repair_inode_hash_info(trans, snapshot_root);
		BUG_ON(ret == -BCH_ERR_fsck_repair_unimplemented);
		if (ret)
			goto err;
	}

	ret = bch2_check_inode_has_case_insensitive(trans, &u, &s->ids, &do_update);
	if (ret)
		goto err;

	if (bch2_inode_has_backpointer(&u)) {
		ret = check_inode_dirent_inode(trans, &u, &do_update);
		if (ret)
			goto err;
	}

	if (fsck_err_on(bch2_inode_has_backpointer(&u) &&
			(u.bi_flags & BCH_INODE_unlinked),
			trans, inode_unlinked_but_has_dirent,
			"inode
unlinked but has dirent\n%s", 1171 (printbuf_reset(&buf), 1172 bch2_inode_unpacked_to_text(&buf, &u), 1173 buf.buf))) { 1174 u.bi_flags &= ~BCH_INODE_unlinked; 1175 do_update = true; 1176 } 1177 1178 if (S_ISDIR(u.bi_mode) && (u.bi_flags & BCH_INODE_unlinked)) { 1179 /* Check for this early so that check_unreachable_inode() will reattach it */ 1180 1181 ret = bch2_empty_dir_snapshot(trans, k.k->p.offset, 0, k.k->p.snapshot); 1182 if (ret && ret != -BCH_ERR_ENOTEMPTY_dir_not_empty) 1183 goto err; 1184 1185 fsck_err_on(ret, trans, inode_dir_unlinked_but_not_empty, 1186 "dir unlinked but not empty\n%s", 1187 (printbuf_reset(&buf), 1188 bch2_inode_unpacked_to_text(&buf, &u), 1189 buf.buf)); 1190 u.bi_flags &= ~BCH_INODE_unlinked; 1191 do_update = true; 1192 ret = 0; 1193 } 1194 1195 if (fsck_err_on(S_ISDIR(u.bi_mode) && u.bi_size, 1196 trans, inode_dir_has_nonzero_i_size, 1197 "directory %llu:%u with nonzero i_size %lli", 1198 u.bi_inum, u.bi_snapshot, u.bi_size)) { 1199 u.bi_size = 0; 1200 do_update = true; 1201 } 1202 1203 ret = bch2_inode_has_child_snapshots(trans, k.k->p); 1204 if (ret < 0) 1205 goto err; 1206 1207 if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot), 1208 trans, inode_has_child_snapshots_wrong, 1209 "inode has_child_snapshots flag wrong (should be %u)\n%s", 1210 ret, 1211 (printbuf_reset(&buf), 1212 bch2_inode_unpacked_to_text(&buf, &u), 1213 buf.buf))) { 1214 if (ret) 1215 u.bi_flags |= BCH_INODE_has_child_snapshot; 1216 else 1217 u.bi_flags &= ~BCH_INODE_has_child_snapshot; 1218 do_update = true; 1219 } 1220 ret = 0; 1221 1222 if ((u.bi_flags & BCH_INODE_unlinked) && 1223 !(u.bi_flags & BCH_INODE_has_child_snapshot)) { 1224 if (!test_bit(BCH_FS_started, &c->flags)) { 1225 /* 1226 * If we're not in online fsck, don't delete unlinked 1227 * inodes, just make sure they're on the deleted list. 1228 * 1229 * They might be referred to by a logged operation - 1230 * i.e. 
we might have crashed in the middle of a 1231 * truncate on an unlinked but open file - so we want to 1232 * let the delete_dead_inodes kill it after resuming 1233 * logged ops. 1234 */ 1235 ret = check_inode_deleted_list(trans, k.k->p); 1236 if (ret < 0) 1237 goto err_noprint; 1238 1239 fsck_err_on(!ret, 1240 trans, unlinked_inode_not_on_deleted_list, 1241 "inode %llu:%u unlinked, but not on deleted list", 1242 u.bi_inum, k.k->p.snapshot); 1243 1244 ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, k.k->p, 1); 1245 if (ret) 1246 goto err; 1247 } else { 1248 ret = bch2_inode_or_descendents_is_open(trans, k.k->p); 1249 if (ret < 0) 1250 goto err; 1251 1252 if (fsck_err_on(!ret, 1253 trans, inode_unlinked_and_not_open, 1254 "inode %llu:%u unlinked and not open", 1255 u.bi_inum, u.bi_snapshot)) { 1256 ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); 1257 bch_err_msg(c, ret, "in fsck deleting inode"); 1258 goto err_noprint; 1259 } 1260 ret = 0; 1261 } 1262 } 1263 1264 if (fsck_err_on(u.bi_parent_subvol && 1265 (u.bi_subvol == 0 || 1266 u.bi_subvol == BCACHEFS_ROOT_SUBVOL), 1267 trans, inode_bi_parent_nonzero, 1268 "inode %llu:%u has subvol %u but nonzero parent subvol %u", 1269 u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) { 1270 u.bi_parent_subvol = 0; 1271 do_update = true; 1272 } 1273 1274 if (u.bi_subvol) { 1275 struct bch_subvolume s; 1276 1277 ret = bch2_subvolume_get(trans, u.bi_subvol, false, &s); 1278 if (ret && !bch2_err_matches(ret, ENOENT)) 1279 goto err; 1280 1281 if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) { 1282 ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum); 1283 goto do_update; 1284 } 1285 1286 if (fsck_err_on(ret, 1287 trans, inode_bi_subvol_missing, 1288 "inode %llu:%u bi_subvol points to missing subvolume %u", 1289 u.bi_inum, k.k->p.snapshot, u.bi_subvol) || 1290 fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum || 1291 !bch2_snapshot_is_ancestor(c, 
le32_to_cpu(s.snapshot), 1292 k.k->p.snapshot), 1293 trans, inode_bi_subvol_wrong, 1294 "inode %llu:%u points to subvol %u, but subvol points to %llu:%u", 1295 u.bi_inum, k.k->p.snapshot, u.bi_subvol, 1296 le64_to_cpu(s.inode), 1297 le32_to_cpu(s.snapshot))) { 1298 u.bi_subvol = 0; 1299 u.bi_parent_subvol = 0; 1300 do_update = true; 1301 } 1302 } 1303 1304 if (fsck_err_on(u.bi_journal_seq > journal_cur_seq(&c->journal), 1305 trans, inode_journal_seq_in_future, 1306 "inode journal seq in future (currently at %llu)\n%s", 1307 journal_cur_seq(&c->journal), 1308 (printbuf_reset(&buf), 1309 bch2_inode_unpacked_to_text(&buf, &u), 1310 buf.buf))) { 1311 u.bi_journal_seq = journal_cur_seq(&c->journal); 1312 do_update = true; 1313 } 1314 do_update: 1315 if (do_update) { 1316 ret = __bch2_fsck_write_inode(trans, &u); 1317 bch_err_msg(c, ret, "in fsck updating inode"); 1318 if (ret) 1319 goto err_noprint; 1320 } 1321 err: 1322 fsck_err: 1323 bch_err_fn(c, ret); 1324 err_noprint: 1325 printbuf_exit(&buf); 1326 return ret; 1327 } 1328 1329 int bch2_check_inodes(struct bch_fs *c) 1330 { 1331 struct bch_inode_unpacked snapshot_root = {}; 1332 struct snapshots_seen s; 1333 1334 snapshots_seen_init(&s); 1335 1336 int ret = bch2_trans_run(c, 1337 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, 1338 POS_MIN, 1339 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, 1340 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 1341 check_inode(trans, &iter, k, &snapshot_root, &s))); 1342 1343 snapshots_seen_exit(&s); 1344 bch_err_fn(c, ret); 1345 return ret; 1346 } 1347 1348 static int find_oldest_inode_needs_reattach(struct btree_trans *trans, 1349 struct bch_inode_unpacked *inode) 1350 { 1351 struct bch_fs *c = trans->c; 1352 struct btree_iter iter; 1353 struct bkey_s_c k; 1354 int ret = 0; 1355 1356 /* 1357 * We look for inodes to reattach in natural key order, leaves first, 1358 * but we should do the reattach at the oldest version that needs to be 1359 * reattached: 1360 */ 1361 
	/*
	 * Scan this inode's other snapshot versions, starting just past the
	 * version we were given.  Each version that is an ancestor of our
	 * starting snapshot and still needs reattaching supersedes the one we
	 * started from, so *inode ends up at the oldest version to reattach.
	 */
	for_each_btree_key_norestart(trans, iter,
				     BTREE_ID_inodes,
				     SPOS(0, inode->bi_inum, inode->bi_snapshot + 1),
				     BTREE_ITER_all_snapshots, k, ret) {
		/* past the last key for this inode number */
		if (k.k->p.offset != inode->bi_inum)
			break;

		/* unrelated snapshot version - not an ancestor of ours */
		if (!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, k.k->p.snapshot))
			continue;

		if (!bkey_is_inode(k.k))
			break;

		struct bch_inode_unpacked parent_inode;
		ret = bch2_inode_unpack(k, &parent_inode);
		if (ret)
			break;

		/* first ancestor that doesn't need reattach: stop here */
		if (!inode_should_reattach(&parent_inode))
			break;

		/* this older version supersedes the one we were given */
		*inode = parent_inode;
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}

/*
 * If @k is an unreachable inode, walk back to the oldest snapshot version
 * that also needs reattaching and reattach that one (see
 * find_oldest_inode_needs_reattach()).
 */
static int check_unreachable_inode(struct btree_trans *trans,
				   struct btree_iter *iter,
				   struct bkey_s_c k)
{
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (!bkey_is_inode(k.k))
		return 0;

	struct bch_inode_unpacked inode;
	ret = bch2_inode_unpack(k, &inode);
	if (ret)
		return ret;

	if (!inode_should_reattach(&inode))
		return 0;

	ret = find_oldest_inode_needs_reattach(trans, &inode);
	if (ret)
		return ret;

	if (fsck_err(trans, inode_unreachable,
		     "unreachable inode:\n%s",
		     (bch2_inode_unpacked_to_text(&buf, &inode),
		      buf.buf)))
		ret = reattach_inode(trans, &inode);
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

/*
 * Reattach unreachable (but not unlinked) inodes
 *
 * Run after check_inodes() and check_dirents(), so we know that inode
 * backpointer fields point to valid dirents, and every inode that has a dirent
 * that points to it has its backpointer field set - so we're just looking for
 * non-unlinked inodes without backpointers:
 *
 * XXX: this is racy w.r.t. hardlink removal in online fsck
 */
int bch2_check_unreachable_inodes(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
				POS_MIN,
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_unreachable_inode(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}

/* Which inode modes may legitimately own keys in @btree? */
static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode)
{
	switch (btree) {
	case BTREE_ID_extents:
		return S_ISREG(mode) || S_ISLNK(mode);
	case BTREE_ID_dirents:
		return S_ISDIR(mode);
	case BTREE_ID_xattrs:
		return true;
	default:
		BUG();
	}
}

/*
 * Verify that a key in an inode-scoped btree has a live inode of a matching
 * mode; if not, decide between deleting the key(s) and reconstructing the
 * inode (repair paths continue in the remainder of the function).
 */
static int check_key_has_inode(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct inode_walker *inode,
			       struct inode_walker_entry *i,
			       struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	struct btree_iter iter2 = {};
	int ret = PTR_ERR_OR_ZERO(i);
	if (ret)
		return ret;

	if (k.k->type == KEY_TYPE_whiteout)
		goto out;

	bool have_inode = i && !i->whiteout;

	/* inodes btree was lost: reconstruct rather than delete */
	if (!have_inode && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes)))
		goto reconstruct;

	/* normal case: inode exists and the key's btree matches its mode */
	if (have_inode && btree_matches_i_mode(iter->btree_id, i->inode.bi_mode))
		goto out;

	prt_printf(&buf, ", ");

	/* a good inode in an older snapshot makes plain deletion safer */
	bool have_old_inode = false;
	darray_for_each(inode->inodes, i2)
		if (!i2->whiteout &&
		    bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i2->inode.bi_snapshot) &&
		    btree_matches_i_mode(iter->btree_id, i2->inode.bi_mode)) {
			prt_printf(&buf, "but found good inode in older snapshot\n");
			bch2_inode_unpacked_to_text(&buf, &i2->inode);
			prt_newline(&buf);
			have_old_inode = true;
			break;
		}

	struct bkey_s_c k2;
	unsigned nr_keys = 0;

	prt_printf(&buf, "found keys:\n");

for_each_btree_key_max_norestart(trans, iter2, iter->btree_id, 1501 SPOS(k.k->p.inode, 0, k.k->p.snapshot), 1502 POS(k.k->p.inode, U64_MAX), 1503 0, k2, ret) { 1504 nr_keys++; 1505 if (nr_keys <= 10) { 1506 bch2_bkey_val_to_text(&buf, c, k2); 1507 prt_newline(&buf); 1508 } 1509 if (nr_keys >= 100) 1510 break; 1511 } 1512 1513 if (ret) 1514 goto err; 1515 1516 if (nr_keys > 100) 1517 prt_printf(&buf, "found > %u keys for this missing inode\n", nr_keys); 1518 else if (nr_keys > 10) 1519 prt_printf(&buf, "found %u keys for this missing inode\n", nr_keys); 1520 1521 if (!have_inode) { 1522 if (fsck_err_on(!have_inode, 1523 trans, key_in_missing_inode, 1524 "key in missing inode%s", buf.buf)) { 1525 /* 1526 * Maybe a deletion that raced with data move, or something 1527 * weird like that? But if we know the inode was deleted, or 1528 * it's just a few keys, we can safely delete them. 1529 * 1530 * If it's many keys, we should probably recreate the inode 1531 */ 1532 if (have_old_inode || nr_keys <= 2) 1533 goto delete; 1534 else 1535 goto reconstruct; 1536 } 1537 } else { 1538 /* 1539 * not autofix, this one would be a giant wtf - bit error in the 1540 * inode corrupting i_mode? 
1541 * 1542 * may want to try repairing inode instead of deleting 1543 */ 1544 if (fsck_err_on(!btree_matches_i_mode(iter->btree_id, i->inode.bi_mode), 1545 trans, key_in_wrong_inode_type, 1546 "key for wrong inode mode %o%s", 1547 i->inode.bi_mode, buf.buf)) 1548 goto delete; 1549 } 1550 out: 1551 err: 1552 fsck_err: 1553 bch2_trans_iter_exit(trans, &iter2); 1554 printbuf_exit(&buf); 1555 bch_err_fn(c, ret); 1556 return ret; 1557 delete: 1558 /* 1559 * XXX: print out more info 1560 * count up extents for this inode, check if we have different inode in 1561 * an older snapshot version, perhaps decide if we want to reconstitute 1562 */ 1563 ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node); 1564 goto out; 1565 reconstruct: 1566 ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?: 1567 bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); 1568 if (ret) 1569 goto err; 1570 1571 inode->last_pos.inode--; 1572 ret = bch_err_throw(c, transaction_restart_nested); 1573 goto out; 1574 } 1575 1576 static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w) 1577 { 1578 struct bch_fs *c = trans->c; 1579 int ret = 0; 1580 s64 count2; 1581 1582 darray_for_each(w->inodes, i) { 1583 if (i->inode.bi_sectors == i->count) 1584 continue; 1585 1586 count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->inode.bi_snapshot); 1587 1588 if (w->recalculate_sums) 1589 i->count = count2; 1590 1591 if (i->count != count2) { 1592 bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", 1593 w->last_pos.inode, i->inode.bi_snapshot, i->count, count2); 1594 i->count = count2; 1595 } 1596 1597 if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty), 1598 trans, inode_i_sectors_wrong, 1599 "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", 1600 w->last_pos.inode, i->inode.bi_snapshot, 1601 i->inode.bi_sectors, i->count)) { 1602 
			i->inode.bi_sectors = i->count;
			ret = bch2_fsck_write_inode(trans, &i->inode);
			if (ret)
				break;
		}
	}
fsck_err:
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Wrapper that turns a transaction restart during the i_sectors recount into
 * an explicit error, since the caller can't handle a silent restart here.
 */
static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
{
	u32 restart_count = trans->restart_count;
	return check_i_sectors_notnested(trans, w) ?:
		trans_was_restarted(trans, restart_count);
}

/* End position of a previously-seen extent, per snapshot. */
struct extent_end {
	u32 snapshot;
	u64 offset;
	struct snapshots_seen seen;
};

/* Extent ends seen so far for the current inode, sorted by snapshot. */
struct extent_ends {
	struct bpos last_pos;
	DARRAY(struct extent_end) e;
};

static void extent_ends_reset(struct extent_ends *extent_ends)
{
	/* each entry owns a deep copy of its snapshots_seen ids */
	darray_for_each(extent_ends->e, i)
		snapshots_seen_exit(&i->seen);
	extent_ends->e.nr = 0;
}

static void extent_ends_exit(struct extent_ends *extent_ends)
{
	extent_ends_reset(extent_ends);
	darray_exit(&extent_ends->e);
}

static void extent_ends_init(struct extent_ends *extent_ends)
{
	memset(extent_ends, 0, sizeof(*extent_ends));
}

/*
 * Record @k's end position for its snapshot, replacing any previous entry
 * for the same snapshot, keeping the array sorted by snapshot id.
 */
static int extent_ends_at(struct bch_fs *c,
			  struct extent_ends *extent_ends,
			  struct snapshots_seen *seen,
			  struct bkey_s_c k)
{
	struct extent_end *i, n = (struct extent_end) {
		.offset = k.k->p.offset,
		.snapshot = k.k->p.snapshot,
		.seen = *seen,
	};

	/* deep-copy the seen list so the entry outlives @seen */
	n.seen.ids.data = kmemdup(seen->ids.data,
				  sizeof(seen->ids.data[0]) * seen->ids.size,
				  GFP_KERNEL);
	if (!n.seen.ids.data)
		return bch_err_throw(c, ENOMEM_fsck_extent_ends_at);

	__darray_for_each(extent_ends->e, i) {
		if (i->snapshot == k.k->p.snapshot) {
			snapshots_seen_exit(&i->seen);
			*i = n;
			return 0;
		}

		if (i->snapshot >= k.k->p.snapshot)
			break;
	}

	/*
	 * NOTE(review): if darray_insert_item() fails with ENOMEM,
	 * n.seen.ids.data appears to be leaked - confirm.
	 */
	return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
}

static int 
overlapping_extents_found(struct btree_trans *trans,
			  enum btree_id btree,
			  struct bpos pos1, struct snapshots_seen *pos1_seen,
			  struct bkey pos2,
			  bool *fixed,
			  struct extent_end *extent_end)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	struct btree_iter iter1, iter2 = {};
	struct bkey_s_c k1, k2;
	int ret;

	/* caller guarantees pos1 (first extent's end) is past pos2's start */
	BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));

	bch2_trans_iter_init(trans, &iter1, btree, pos1,
			     BTREE_ITER_all_snapshots|
			     BTREE_ITER_not_extents);
	k1 = bch2_btree_iter_peek_max(trans, &iter1, POS(pos1.inode, U64_MAX));
	ret = bkey_err(k1);
	if (ret)
		goto err;

	prt_newline(&buf);
	bch2_bkey_val_to_text(&buf, c, k1);

	if (!bpos_eq(pos1, k1.k->p)) {
		/* first extent moved out from under us - can't repair */
		prt_str(&buf, "\nwanted\n ");
		bch2_bpos_to_text(&buf, pos1);
		prt_str(&buf, "\n");
		bch2_bkey_to_text(&buf, &pos2);

		bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
			__func__, buf.buf);
		ret = bch_err_throw(c, internal_fsck_err);
		goto err;
	}

	bch2_trans_copy_iter(trans, &iter2, &iter1);

	/* advance a copy of iter1 until we reach the second extent */
	while (1) {
		bch2_btree_iter_advance(trans, &iter2);

		k2 = bch2_btree_iter_peek_max(trans, &iter2, POS(pos1.inode, U64_MAX));
		ret = bkey_err(k2);
		if (ret)
			goto err;

		if (bpos_ge(k2.k->p, pos2.p))
			break;
	}

	prt_newline(&buf);
	bch2_bkey_val_to_text(&buf, c, k2);

	if (bpos_gt(k2.k->p, pos2.p) ||
	    pos2.size != k2.k->size) {
		/* NOTE(review): "seconding" looks like a typo for "second" in this message */
		bch_err(c, "%s: error finding seconding overlapping extent when repairing%s",
			__func__, buf.buf);
		ret = bch_err_throw(c, internal_fsck_err);
		goto err;
	}

	/* the extent in the newer snapshot wins; the other gets overwritten */
	prt_printf(&buf, "\noverwriting %s extent",
		   pos1.snapshot >= pos2.p.snapshot ? "first" : "second");

	if (fsck_err(trans, extent_overlapping,
		     "overlapping extents%s", buf.buf)) {
		struct btree_iter *old_iter = &iter1;
		struct disk_reservation res = { 0 };

		if (pos1.snapshot < pos2.p.snapshot) {
			old_iter = &iter2;
			swap(k1, k2);
		}

		trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);

		ret = bch2_trans_update_extent_overwrite(trans, old_iter,
				BTREE_UPDATE_internal_snapshot_node,
				k1, k2) ?:
			bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
		bch2_disk_reservation_put(c, &res);

		bch_info(c, "repair ret %s", bch2_err_str(ret));

		if (ret)
			goto err;

		*fixed = true;

		if (pos1.snapshot == pos2.p.snapshot) {
			/*
			 * We overwrote the first extent, and did the overwrite
			 * in the same snapshot:
			 */
			extent_end->offset = bkey_start_offset(&pos2);
		} else if (pos1.snapshot > pos2.p.snapshot) {
			/*
			 * We overwrote the first extent in pos2's snapshot:
			 */
			ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
		} else {
			/*
			 * We overwrote the second extent - restart
			 * check_extent() from the top:
			 */
			ret = bch_err_throw(c, transaction_restart_nested);
		}
	}
fsck_err:
err:
	bch2_trans_iter_exit(trans, &iter2);
	bch2_trans_iter_exit(trans, &iter1);
	printbuf_exit(&buf);
	return ret;
}

/*
 * Compare @k against the recorded extent ends for this inode; any recorded
 * end past @k's start in a visible snapshot is an overlap to repair.
 */
static int check_overlapping_extents(struct btree_trans *trans,
				     struct snapshots_seen *seen,
				     struct extent_ends *extent_ends,
				     struct bkey_s_c k,
				     struct btree_iter *iter,
				     bool *fixed)
{
	struct bch_fs *c = trans->c;
	int ret = 0;

	/* transaction restart, running again */
	if (bpos_eq(extent_ends->last_pos, k.k->p))
		return 0;

	/* new inode: previously recorded ends no longer apply */
	if (extent_ends->last_pos.inode != k.k->p.inode)
		extent_ends_reset(extent_ends);

	darray_for_each(extent_ends->e, i) {
		if (i->offset <= bkey_start_offset(k.k))
			continue;

		if (!ref_visible2(c,
				  k.k->p.snapshot, seen,
				  i->snapshot, &i->seen))
			continue;

		ret = overlapping_extents_found(trans, iter->btree_id,
						SPOS(iter->pos.inode,
						     i->offset,
						     i->snapshot),
						&i->seen,
						*k.k, fixed, i);
		if (ret)
			goto err;
	}

	extent_ends->last_pos = k.k->p;
err:
	return ret;
}

/*
 * Report (but don't repair) encoded extents larger than
 * encoded_extent_max - these should never be created.
 */
static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
				struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	struct bch_extent_crc_unpacked crc;
	const union bch_extent_entry *i;
	unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;

	bkey_for_each_crc(k.k, ptrs, crc, i)
		if (crc_is_encoded(crc) &&
		    crc.uncompressed_size > encoded_extent_max_sectors) {
			struct printbuf buf = PRINTBUF;

			bch2_bkey_val_to_text(&buf, c, k);
			bch_err(c, "overbig encoded extent, please report this:\n  %s", buf.buf);
			printbuf_exit(&buf);
		}

	return 0;
}

/*
 * Main extents pass: checks snapshot validity, owning inode, overlaps,
 * extents past i_size, and accumulates per-inode sector counts.
 */
static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
			struct bkey_s_c k,
			struct inode_walker *inode,
			struct snapshots_seen *s,
			struct extent_ends *extent_ends,
			struct disk_reservation *res)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret) {
		/* positive return means the key was deleted: not an error */
		ret = ret < 0 ? ret : 0;
		goto out;
	}

	/* moved on to a new inode: settle the previous inode's i_sectors */
	if (inode->last_pos.inode != k.k->p.inode && inode->have_inodes) {
		ret = check_i_sectors(trans, inode);
		if (ret)
			goto err;
	}

	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
	if (ret)
		goto err;

	struct inode_walker_entry *extent_i = walk_inode(trans, inode, k);
	ret = PTR_ERR_OR_ZERO(extent_i);
	if (ret)
		goto err;

	ret = check_key_has_inode(trans, iter, inode, extent_i, k);
	if (ret)
		goto err;

	if (k.k->type != KEY_TYPE_whiteout) {
		ret = check_overlapping_extents(trans, s, extent_ends, k, iter,
						&inode->recalculate_sums);
		if (ret)
			goto err;

		/*
		 * Check inodes in reverse order, from oldest snapshots to
		 * newest, starting from the inode that matches this extent's
		 * snapshot. If we didn't have one, iterate over all inodes:
		 */
		for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
		     inode->inodes.data && i >= inode->inodes.data;
		     --i) {
			if (i->inode.bi_snapshot > k.k->p.snapshot ||
			    !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot))
				continue;

			u64 last_block = round_up(i->inode.bi_size, block_bytes(c)) >> 9;

			if (fsck_err_on(k.k->p.offset > last_block &&
					!bkey_extent_is_reservation(k),
					trans, extent_past_end_of_inode,
					"extent type past end of inode %llu:%u, i_size %llu\n%s",
					i->inode.bi_inum, i->inode.bi_snapshot, i->inode.bi_size,
					(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
				ret = bch2_fpunch_snapshot(trans,
							   SPOS(i->inode.bi_inum,
								last_block,
								i->inode.bi_snapshot),
							   POS(i->inode.bi_inum, U64_MAX));
				if (ret)
					goto err;

				/* key was punched: don't count it below */
				iter->k.type = KEY_TYPE_whiteout;
				break;
			}
		}
	}

	ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc);
	if (ret)
		goto err;

	if (bkey_extent_is_allocation(k.k)) {
		/* accumulate sectors into every inode version that sees @k */
		for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
		     inode->inodes.data && i >= inode->inodes.data;
		     --i) {
			if (i->whiteout ||
			    i->inode.bi_snapshot > k.k->p.snapshot ||
			    !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot))
				continue;

			i->count += k.k->size;
		}
	}

	if (k.k->type != KEY_TYPE_whiteout) {
		ret = extent_ends_at(c, extent_ends, s, k);
		if (ret)
			goto err;
	}
out:
err:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Walk extents: verify that extents have a corresponding S_ISREG inode, and
 * that i_size and i_sectors are consistent
 */
int bch2_check_extents(struct bch_fs *c)
{
	struct inode_walker w = inode_walker_init();
	struct snapshots_seen s;
	struct extent_ends extent_ends;
	struct disk_reservation res = { 0 };

	snapshots_seen_init(&s);
	extent_ends_init(&extent_ends);

	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_extents,
				POS(BCACHEFS_ROOT_INO, 0),
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
			bch2_disk_reservation_put(c, &res);
			check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?:
			check_extent_overbig(trans, &iter, k);
		})) ?:
		check_i_sectors_notnested(trans, &w));

	bch2_disk_reservation_put(c, &res);
	extent_ends_exit(&extent_ends);
	inode_walker_exit(&w);
	snapshots_seen_exit(&s);

	bch_err_fn(c, ret);
	return ret;
}

/* Same overbig-extent check, for the reflink (indirect extent) btree. */
int bch2_check_indirect_extents(struct bch_fs *c)
{
	struct disk_reservation res = { 0 };

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
				POS_MIN,
				BTREE_ITER_prefetch, k,
				&res, NULL,
				BCH_TRANS_COMMIT_no_enospc, ({
			bch2_disk_reservation_put(c, &res);
			check_extent_overbig(trans, &iter, k);
		})));

	bch2_disk_reservation_put(c, &res);
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Recount subdirectories for each inode version the walker accumulated and
 * repair directory inodes whose i_nlink disagrees with the recount.
 */
static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w)
{
	struct bch_fs *c = trans->c;
	int ret = 0;
	s64 count2;

	darray_for_each(w->inodes, i) {
		if (i->inode.bi_nlink == i->count)
			continue;

		count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->inode.bi_snapshot);
		/* negative return from bch2_count_subdirs() is an error code */
		if (count2 < 0)
			return count2;

		if (i->count != count2) {
			bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
					    w->last_pos.inode, i->inode.bi_snapshot, i->count, count2);
			i->count = count2;
			if (i->inode.bi_nlink == i->count)
				continue;
		}

		/* NOTE(review): this test and the fsck_err_on below re-check the same condition */
		if (i->inode.bi_nlink != i->count) {
			CLASS(printbuf, buf)();

			lockrestart_do(trans,
				       bch2_inum_snapshot_to_path(trans, w->last_pos.inode,
								  i->inode.bi_snapshot, NULL, &buf));

			if (fsck_err_on(i->inode.bi_nlink != i->count,
					trans, inode_dir_wrong_nlink,
					"directory with wrong i_nlink: got %u, should be %llu\n%s",
					i->inode.bi_nlink, i->count, buf.buf)) {
				i->inode.bi_nlink = i->count;
				ret = bch2_fsck_write_inode(trans, &i->inode);
				if (ret)
					break;
			}
		}
	}
fsck_err:
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Wrapper that turns a transaction restart during the subdir recount into an
 * explicit error, since the caller can't handle a silent restart here.
 */
static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_walker *w)
{
	u32 restart_count = trans->restart_count;
	return check_subdir_count_notnested(trans, w) ?:
		trans_was_restarted(trans, restart_count);
}

/* find a subvolume that's a descendent of @snapshot: */
static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	for_each_btree_key_norestart(trans, 
iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) { 2079 if (k.k->type != KEY_TYPE_subvolume) 2080 continue; 2081 2082 struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); 2083 if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) { 2084 bch2_trans_iter_exit(trans, &iter); 2085 *subvolid = k.k->p.offset; 2086 goto found; 2087 } 2088 } 2089 if (!ret) 2090 ret = -ENOENT; 2091 found: 2092 bch2_trans_iter_exit(trans, &iter); 2093 return ret; 2094 } 2095 2096 noinline_for_stack 2097 static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter, 2098 struct bkey_s_c_dirent d) 2099 { 2100 struct bch_fs *c = trans->c; 2101 struct btree_iter subvol_iter = {}; 2102 struct bch_inode_unpacked subvol_root; 2103 u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol); 2104 u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); 2105 u32 parent_snapshot; 2106 u32 new_parent_subvol = 0; 2107 u64 parent_inum; 2108 struct printbuf buf = PRINTBUF; 2109 int ret = 0; 2110 2111 ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum); 2112 if (ret && !bch2_err_matches(ret, ENOENT)) 2113 return ret; 2114 2115 if (ret || 2116 (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) { 2117 int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol); 2118 if (ret2 && !bch2_err_matches(ret, ENOENT)) 2119 return ret2; 2120 } 2121 2122 if (ret && 2123 !new_parent_subvol && 2124 (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) { 2125 /* 2126 * Couldn't find a subvol for dirent's snapshot - but we lost 2127 * subvols, so we need to reconstruct: 2128 */ 2129 ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0); 2130 if (ret) 2131 return ret; 2132 2133 parent_snapshot = d.k->p.snapshot; 2134 } 2135 2136 if (fsck_err_on(ret, 2137 trans, dirent_to_missing_parent_subvol, 2138 "dirent parent_subvol points to missing subvolume\n%s", 2139 (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) 
	    ||
	    fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
			trans, dirent_not_visible_in_parent_subvol,
			"dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
			parent_snapshot,
			(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
		/* Repair by repointing the dirent at the subvol we found: */
		if (!new_parent_subvol) {
			bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
			return bch_err_throw(c, fsck_repair_unimplemented);
		}

		struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
		ret = PTR_ERR_OR_ZERO(new_dirent);
		if (ret)
			goto err;

		new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol);
	}

	/* Look up the target subvolume this dirent points at: */
	struct bkey_s_c_subvolume s =
		bch2_bkey_get_iter_typed(trans, &subvol_iter,
					 BTREE_ID_subvolumes, POS(0, target_subvol),
					 0, subvolume);
	ret = bkey_err(s.s_c);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	if (ret) {
		/* Target subvol doesn't exist: remove the dangling dirent. */
		if (fsck_err(trans, dirent_to_missing_subvol,
			     "dirent points to missing subvolume\n%s",
			     (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
			return bch2_fsck_remove_dirent(trans, d.k->p);
		ret = 0;
		goto out;
	}

	/* Target subvol must point back at our parent subvol: */
	if (le32_to_cpu(s.v->fs_path_parent) != parent_subvol) {
		printbuf_reset(&buf);

		/* NOTE(review): "be be" typo in user-visible message, kept as-is */
		prt_printf(&buf, "subvol with wrong fs_path_parent, should be be %u\n",
			   parent_subvol);

		ret = bch2_inum_to_path(trans, (subvol_inum) { s.k->p.offset,
					le64_to_cpu(s.v->inode) }, &buf);
		if (ret)
			goto err;
		prt_newline(&buf);
		bch2_bkey_val_to_text(&buf, c, s.s_c);

		if (fsck_err(trans, subvol_fs_path_parent_wrong, "%s", buf.buf)) {
			struct bkey_i_subvolume *n =
				bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
			ret = PTR_ERR_OR_ZERO(n);
			if (ret)
				goto err;

			n->v.fs_path_parent = cpu_to_le32(parent_subvol);
		}
	}

	u64 target_inum = le64_to_cpu(s.v->inode);
	u32 target_snapshot = le32_to_cpu(s.v->snapshot);

	/* The subvol's root inode must exist and point back at parent_subvol: */
	ret = bch2_inode_find_by_inum_snapshot(trans, target_inum, target_snapshot,
					       &subvol_root, 0);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	if (ret) {
		bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
		ret = bch_err_throw(c, fsck_repair_unimplemented);
		goto err;
	}

	if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol,
			trans, inode_bi_parent_wrong,
			"subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
			target_inum,
			subvol_root.bi_parent_subvol, parent_subvol)) {
		subvol_root.bi_parent_subvol = parent_subvol;
		subvol_root.bi_snapshot = le32_to_cpu(s.v->snapshot);
		ret = __bch2_fsck_write_inode(trans, &subvol_root);
		if (ret)
			goto err;
	}

	ret = bch2_check_dirent_target(trans, iter, d, &subvol_root, true);
	if (ret)
		goto err;
out:
err:
fsck_err:
	bch2_trans_iter_exit(trans, &subvol_iter);
	printbuf_exit(&buf);
	return ret;
}

/*
 * Check a single dirent key: snapshot validity, hash placement, casefold
 * consistency with the parent dir, and that the target (inode or subvol)
 * exists and points back. Also accumulates per-directory subdir/size counts
 * via the inode_walker. Commits its own transaction at the end.
 */
static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
			struct bkey_s_c k,
			struct bch_hash_info *hash_info,
			struct inode_walker *dir,
			struct inode_walker *target,
			struct snapshots_seen *s,
			bool *need_second_pass)
{
	struct bch_fs *c = trans->c;
	struct inode_walker_entry *i;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret) {
		/* positive return means the key was handled/deleted, not an error */
		ret = ret < 0 ? ret : 0;
		goto out;
	}

	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
	if (ret)
		goto err;

	if (k.k->type == KEY_TYPE_whiteout)
		goto out;

	/* Crossed into a new directory: flush counts for the previous one. */
	if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) {
		ret = check_subdir_dirents_count(trans, dir);
		if (ret)
			goto err;
	}

	i = walk_inode(trans, dir, k);
	ret = PTR_ERR_OR_ZERO(i);
	if (ret < 0)
		goto err;

	ret = check_key_has_inode(trans, iter, dir, i, k);
	if (ret)
		goto err;

	if (!i || i->whiteout)
		goto out;

	if (dir->first_this_inode)
		*hash_info = bch2_hash_info_init(c, &i->inode);
	dir->first_this_inode = false;

	hash_info->cf_encoding = bch2_inode_casefold(c, &i->inode) ? c->cf_encoding : NULL;

	ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info,
				      iter, k, need_second_pass);
	if (ret < 0)
		goto err;
	if (ret) {
		/* dirent has been deleted */
		ret = 0;
		goto out;
	}

	if (k.k->type != KEY_TYPE_dirent)
		goto out;

	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);

	/* check casefold */
	if (fsck_err_on(d.v->d_casefold != !!hash_info->cf_encoding,
			trans, dirent_casefold_mismatch,
			"dirent casefold does not match dir casefold\n%s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k),
			 buf.buf))) {
		/* Re-create the dirent with the directory's casefold setting: */
		subvol_inum dir_inum = { .subvol = d.v->d_type == DT_SUBVOL
				? le32_to_cpu(d.v->d_parent_subvol)
				: 0,
		};
		u64 target = d.v->d_type == DT_SUBVOL
			? le32_to_cpu(d.v->d_child_subvol)
			: le64_to_cpu(d.v->d_inum);
		struct qstr name = bch2_dirent_get_name(d);

		struct bkey_i_dirent *new_d =
			bch2_dirent_create_key(trans, hash_info, dir_inum,
					       d.v->d_type, &name, NULL, target);
		ret = PTR_ERR_OR_ZERO(new_d);
		if (ret)
			goto out;

		new_d->k.p.inode	= d.k->p.inode;
		new_d->k.p.snapshot	= d.k->p.snapshot;

		struct btree_iter dup_iter = {};
		ret = bch2_hash_delete_at(trans,
					  bch2_dirent_hash_desc, hash_info, iter,
					  BTREE_UPDATE_internal_snapshot_node) ?:
			bch2_str_hash_repair_key(trans, s,
						 &bch2_dirent_hash_desc, hash_info,
						 iter, bkey_i_to_s_c(&new_d->k_i),
						 &dup_iter, bkey_s_c_null,
						 need_second_pass);
		goto out;
	}

	if (d.v->d_type == DT_SUBVOL) {
		ret = check_dirent_to_subvol(trans, iter, d);
		if (ret)
			goto err;
	} else {
		ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
		if (ret)
			goto err;

		if (fsck_err_on(!target->inodes.nr,
				trans, dirent_to_missing_inode,
				"dirent points to missing inode:\n%s",
				(printbuf_reset(&buf),
				 bch2_bkey_val_to_text(&buf, c, k),
				 buf.buf))) {
			ret = bch2_fsck_remove_dirent(trans, d.k->p);
			if (ret)
				goto err;
		}

		darray_for_each(target->inodes, i) {
			ret = bch2_check_dirent_target(trans, iter, d, &i->inode, true);
			if (ret)
				goto err;
		}

		/*
		 * Dirent visible in a snapshot where the target inode was
		 * overwritten: delete it in that snapshot.
		 */
		darray_for_each(target->deletes, i)
			if (fsck_err_on(!snapshot_list_has_id(&s->ids, *i),
					trans, dirent_to_overwritten_inode,
					"dirent points to inode overwritten in snapshot %u:\n%s",
					*i,
					(printbuf_reset(&buf),
					 bch2_bkey_val_to_text(&buf, c, k),
					 buf.buf))) {
				struct btree_iter delete_iter;
				bch2_trans_iter_init(trans, &delete_iter,
						     BTREE_ID_dirents,
						     SPOS(k.k->p.inode, k.k->p.offset, *i),
						     BTREE_ITER_intent);
				ret = bch2_btree_iter_traverse(trans, &delete_iter) ?:
					bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
							    hash_info,
							    &delete_iter,
							    BTREE_UPDATE_internal_snapshot_node);
				bch2_trans_iter_exit(trans, &delete_iter);
				if (ret)
					goto err;

			}
	}

	ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
	if (ret)
		goto err;

	/* Accumulate subdir count and i_size for the parent dir(s): */
	for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) {
		if (d.v->d_type == DT_DIR)
			i->count++;
		i->i_size += bkey_bytes(d.k);
	}
out:
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

/*
 * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
 * validate d_type
 */
int bch2_check_dirents(struct bch_fs *c)
{
	struct inode_walker dir = inode_walker_init();
	struct inode_walker target = inode_walker_init();
	struct snapshots_seen s;
	struct bch_hash_info hash_info;
	bool need_second_pass = false, did_second_pass = false;
	int ret;

	snapshots_seen_init(&s);
again:
	ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
				POS(BCACHEFS_ROOT_INO, 0),
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s,
				     &need_second_pass)) ?:
		check_subdir_count_notnested(trans, &dir));

	/* Hash repairs may require one full extra pass over the btree: */
	if (!ret && need_second_pass && !did_second_pass) {
		bch_info(c, "check_dirents requires second pass");
		swap(did_second_pass, need_second_pass);
		goto again;
	}

	if (!ret && need_second_pass) {
		bch_err(c, "dirents not repairing");
		ret = -EINVAL;
	}

	snapshots_seen_exit(&s);
	inode_walker_exit(&dir);
	inode_walker_exit(&target);
	bch_err_fn(c, ret);
	return ret;
}

/* Check one xattr key: snapshot validity, owning inode, hash placement. */
static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
		       struct bkey_s_c k,
		       struct bch_hash_info
							*hash_info,
		       struct inode_walker *inode)
{
	struct bch_fs *c = trans->c;

	int ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret < 0)
		return ret;
	if (ret)
		return 0;

	struct inode_walker_entry *i = walk_inode(trans, inode, k);
	ret = PTR_ERR_OR_ZERO(i);
	if (ret)
		return ret;

	ret = check_key_has_inode(trans, iter, inode, i, k);
	if (ret)
		return ret;

	if (!i || i->whiteout)
		return 0;

	if (inode->first_this_inode)
		*hash_info = bch2_hash_info_init(c, &i->inode);
	inode->first_this_inode = false;

	/* xattrs have no second-pass repair; flag is required but unused here */
	bool need_second_pass = false;
	return bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info,
				       iter, k, &need_second_pass);
}

/*
 * Walk xattrs: verify that they all have a corresponding inode
 */
int bch2_check_xattrs(struct bch_fs *c)
{
	struct inode_walker inode = inode_walker_init();
	struct bch_hash_info hash_info;
	int ret = 0;

	ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
			POS(BCACHEFS_ROOT_INO, 0),
			BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
			k,
			NULL, NULL,
			BCH_TRANS_COMMIT_no_enospc,
			check_xattr(trans, &iter, k, &hash_info, &inode)));

	inode_walker_exit(&inode);
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Verify the root subvolume and root directory inode exist, recreating
 * either with default values if missing (or if the root inode isn't a
 * directory).
 */
static int check_root_trans(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct bch_inode_unpacked root_inode;
	u32 snapshot;
	u64 inum;
	int ret;

	ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (mustfix_fsck_err_on(ret, trans, root_subvol_missing,
				"root subvol missing")) {
		struct bkey_i_subvolume *root_subvol =
			bch2_trans_kmalloc(trans, sizeof(*root_subvol));
		ret = PTR_ERR_OR_ZERO(root_subvol);
		if (ret)
			goto err;

		snapshot	= U32_MAX;
		inum		= BCACHEFS_ROOT_INO;

		bkey_subvolume_init(&root_subvol->k_i);
		root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL;
		root_subvol->v.flags	= 0;
		root_subvol->v.snapshot	= cpu_to_le32(snapshot);
		root_subvol->v.inode	= cpu_to_le64(inum);
		ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0);
		bch_err_msg(c, ret, "writing root subvol");
		if (ret)
			goto err;
	}

	ret = bch2_inode_find_by_inum_snapshot(trans, BCACHEFS_ROOT_INO, snapshot,
					       &root_inode, 0);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (mustfix_fsck_err_on(ret,
				trans, root_dir_missing,
				"root directory missing") ||
	    mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
				trans, root_inode_not_dir,
				"root inode not a directory")) {
		bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
				0, NULL);
		root_inode.bi_inum	= inum;
		root_inode.bi_snapshot	= snapshot;

		ret = __bch2_fsck_write_inode(trans, &root_inode);
		bch_err_msg(c, ret, "writing root inode");
	}
err:
fsck_err:
	return ret;
}

/* Get root directory, create if it doesn't exist: */
int bch2_check_root(struct bch_fs *c)
{
	int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		check_root_trans(trans));
	bch_err_fn(c, ret);
	return ret;
}

/* Linear search for @v in a darray of u32s. */
static bool darray_u32_has(darray_u32 *d, u32 v)
{
	darray_for_each(*d, i)
		if (*i == v)
			return true;
	return false;
}

/*
 * Walk up fs_path_parent links from @k toward the root subvolume, detecting
 * loops and unreachable subvolumes; repairs by reattaching.
 */
static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct btree_iter parent_iter = {};
	darray_u32 subvol_path = {};
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (k.k->type != KEY_TYPE_subvolume)
		return 0;

	subvol_inum start =
	{
		.subvol	= k.k->p.offset,
		.inum	= le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode),
	};

	while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) {
		ret = darray_push(&subvol_path, k.k->p.offset);
		if (ret)
			goto err;

		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);

		struct bch_inode_unpacked subvol_root;
		ret = bch2_inode_find_by_inum_trans(trans,
				(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
				&subvol_root);
		if (ret)
			break;

		u32 parent = le32_to_cpu(s.v->fs_path_parent);

		/* Already visited this subvol on the way up => loop: */
		if (darray_u32_has(&subvol_path, parent)) {
			printbuf_reset(&buf);
			prt_printf(&buf, "subvolume loop: ");

			ret = bch2_inum_to_path(trans, start, &buf);
			if (ret)
				goto err;

			if (fsck_err(trans, subvol_loop, "%s", buf.buf))
				ret = reattach_subvol(trans, s);
			break;
		}

		bch2_trans_iter_exit(trans, &parent_iter);
		bch2_trans_iter_init(trans, &parent_iter,
				     BTREE_ID_subvolumes, POS(0, parent), 0);
		k = bch2_btree_iter_peek_slot(trans, &parent_iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
				trans, subvol_unreachable,
				"unreachable subvolume %s",
				(printbuf_reset(&buf),
				 bch2_bkey_val_to_text(&buf, c, s.s_c),
				 buf.buf))) {
			ret = reattach_subvol(trans, s);
			break;
		}
	}
fsck_err:
err:
	printbuf_exit(&buf);
	darray_exit(&subvol_path);
	bch2_trans_iter_exit(trans, &parent_iter);
	return ret;
}

/* Run check_subvol_path() on every subvolume key. */
int bch2_check_subvolume_structure(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
				BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_subvol_path(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Rewrite one inode's bi_depth to @new_depth, committing immediately.
 * Returns -BCH_ERR_ENOENT_inode if the key at that position isn't an inode.
 */
static int bch2_bi_depth_renumber_one(struct btree_trans *trans,
				      u64 inum, u32 snapshot,
				      u32 new_depth)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
					       SPOS(0, inum, snapshot), 0);

	struct bch_inode_unpacked inode;
	int ret = bkey_err(k) ?:
		!bkey_is_inode(k.k) ? -BCH_ERR_ENOENT_inode
		: bch2_inode_unpack(k, &inode);
	if (ret)
		goto err;

	if (inode.bi_depth != new_depth) {
		inode.bi_depth = new_depth;
		ret = __bch2_fsck_write_inode(trans, &inode) ?:
			bch2_trans_commit(trans, NULL, NULL, 0);
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Renumber bi_depth along @path (deepest entry last), walking it in reverse
 * so depths increase as we descend from @new_bi_depth.
 */
static int bch2_bi_depth_renumber(struct btree_trans *trans, darray_u64 *path,
				  u32 snapshot, u32 new_bi_depth)
{
	u32 restart_count = trans->restart_count;
	int ret = 0;

	darray_for_each_reverse(*path, i) {
		ret = nested_lockrestart_do(trans,
				bch2_bi_depth_renumber_one(trans, *i, snapshot, new_bi_depth));
		bch_err_fn(trans->c, ret);
		if (ret)
			break;

		new_bi_depth++;
	}

	return ret ?: trans_was_restarted(trans, restart_count);
}

/*
 * Follow backpointers from @inode_k up toward the root, detecting directory
 * structure loops; also renumbers bi_depth along the walked path if it was
 * found to be inconsistent.
 */
static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k)
{
	struct bch_fs *c = trans->c;
	struct btree_iter inode_iter = {};
	darray_u64 path = {};
	struct printbuf buf = PRINTBUF;
	u32 snapshot = inode_k.k->p.snapshot;
	bool redo_bi_depth = false;
	u32 min_bi_depth = U32_MAX;
	int ret = 0;

	struct bpos start = inode_k.k->p;

	struct bch_inode_unpacked inode;
	ret = bch2_inode_unpack(inode_k, &inode);
	if (ret)
		return ret;

	/*
	 * If we're running full fsck, check_dirents() will have already ran,
	 * and we shouldn't see any missing backpointers here - otherwise that's
	 * handled separately, by check_unreachable_inodes
	 */
	while (!inode.bi_subvol &&
	       bch2_inode_has_backpointer(&inode)) {
		struct btree_iter dirent_iter;
		struct bkey_s_c_dirent d;

		d = dirent_get_by_pos(trans, &dirent_iter,
				      SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot));
		ret = bkey_err(d.s_c);
		if (ret && !bch2_err_matches(ret, ENOENT))
			goto out;

		if (!ret && (ret = dirent_points_to_inode(c, d, &inode)))
			bch2_trans_iter_exit(trans, &dirent_iter);

		if (bch2_err_matches(ret, ENOENT)) {
			printbuf_reset(&buf);
			bch2_bkey_val_to_text(&buf, c, inode_k);
			bch_err(c, "unreachable inode in check_directory_structure: %s\n%s",
				bch2_err_str(ret), buf.buf);
			goto out;
		}

		bch2_trans_iter_exit(trans, &dirent_iter);

		ret = darray_push(&path, inode.bi_inum);
		if (ret)
			/*
			 * NOTE(review): returns directly, bypassing the "out"
			 * cleanup (inode_iter/path/buf) - looks like a leak on
			 * allocation failure; confirm and consider goto out.
			 */
			return ret;

		bch2_trans_iter_exit(trans, &inode_iter);
		inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
					     SPOS(0, inode.bi_dir, snapshot), 0);

		struct bch_inode_unpacked parent_inode;
		ret = bkey_err(inode_k) ?:
			!bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
			: bch2_inode_unpack(inode_k, &parent_inode);
		if (ret) {
			/* Should have been caught in dirents pass */
			bch_err_msg(c, ret, "error looking up parent directory");
			goto out;
		}

		min_bi_depth = parent_inode.bi_depth;

		/* Depths already consistent: no need to walk further up. */
		if (parent_inode.bi_depth < inode.bi_depth &&
		    min_bi_depth < U16_MAX)
			break;

		inode = parent_inode;
		redo_bi_depth = true;

		/* Revisited an inode on this walk => directory loop: */
		if (darray_find(path, inode.bi_inum)) {
			printbuf_reset(&buf);
			prt_printf(&buf, "directory structure loop in snapshot %u: ",
				   snapshot);

			ret = bch2_inum_snapshot_to_path(trans, start.offset, start.snapshot, NULL, &buf);
			if (ret)
				goto out;

			if (c->opts.verbose) {
				prt_newline(&buf);
				darray_for_each(path, i)
					prt_printf(&buf, "%llu ", *i);
			}

			if (fsck_err(trans, dir_loop, "%s", buf.buf)) {
				/* Break the loop, then reattach to lost+found: */
				ret = remove_backpointer(trans, &inode);
				bch_err_msg(c, ret, "removing dirent");
				if (ret)
					goto out;

				ret = reattach_inode(trans, &inode);
				bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
			}

			goto out;
		}
	}

	if (inode.bi_subvol)
		min_bi_depth = 0;

	if (redo_bi_depth)
		ret = bch2_bi_depth_renumber(trans, &path, snapshot, min_bi_depth);
out:
fsck_err:
	bch2_trans_iter_exit(trans, &inode_iter);
	darray_exit(&path);
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Check for loops in the directory structure: all other connectivity issues
 * have been fixed by prior passes
 */
int bch2_check_directory_structure(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key_reverse_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
						  BTREE_ITER_intent|
						  BTREE_ITER_prefetch|
						  BTREE_ITER_all_snapshots, k,
						  NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
			if (!S_ISDIR(bkey_inode_mode(k)))
				continue;

			if (bch2_inode_flags(k) & BCH_INODE_unlinked)
				continue;

			check_path_loop(trans, k);
		})));

	bch_err_fn(c, ret);
	return ret;
}

/*
 * Growable table of (inum, snapshot) -> observed link count, sorted by inum,
 * used by the nlink checking passes below.
 */
struct nlink_table {
	size_t		nr;
	size_t		size;

	struct nlink {
		u64	inum;
		u32	snapshot;
		u32	count;
	}		*d;
};

/* Append an entry, doubling the table (min 128 entries) when full. */
static int add_nlink(struct bch_fs *c, struct nlink_table *t,
		     u64 inum, u32 snapshot)
{
	if (t->nr == t->size) {
		size_t new_size = max_t(size_t, 128UL, t->size * 2);
		void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);

		if (!d) {
			bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
				new_size);
			return bch_err_throw(c, ENOMEM_fsck_add_nlink);
		}

		if (t->d)
			memcpy(d, t->d, t->size * sizeof(t->d[0]));
		kvfree(t->d);

		t->d = d;
		t->size = new_size;
	}


	t->d[t->nr++] = (struct nlink) {
		.inum		= inum,
		.snapshot	= snapshot,
	};

	return 0;
}

/* Compare by inum only; entries with equal inum stay grouped. */
static int nlink_cmp(const void *_l, const void *_r)
{
	const struct nlink *l = _l;
	const struct nlink *r = _r;

	return cmp_int(l->inum, r->inum);
}

/*
 * Count one dirent reference to @inum in @snapshot: bsearch to the first
 * entry for @inum, then bump the count of each visible snapshot entry.
 */
static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
		     struct nlink_table *links,
		     u64 range_start, u64 range_end, u64 inum, u32 snapshot)
{
	struct nlink *link, key = {
		.inum = inum, .snapshot = U32_MAX,
	};

	if (inum < range_start || inum >= range_end)
		return;

	link = __inline_bsearch(&key, links->d, links->nr,
				sizeof(links->d[0]), nlink_cmp);
	if (!link)
		return;

	/* bsearch may land mid-run; rewind to the first entry for this inum */
	while (link > links->d && link[0].inum == link[-1].inum)
		--link;

	for (; link < links->d + links->nr && link->inum == inum; link++)
		if (ref_visible(c, s, snapshot, link->snapshot)) {
			link->count++;
			if (link->snapshot >= snapshot)
				break;
		}
}
/*
 * Pass 1 of nlink checking: collect every non-directory inode with nonzero
 * bi_nlink into @t, starting at inum @start. If the table fills (allocation
 * failure in add_nlink), stop early and report where to resume via *end.
 */
noinline_for_stack
static int check_nlinks_find_hardlinks(struct bch_fs *c,
				       struct nlink_table *t,
				       u64 start, u64 *end)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_inodes,
				   POS(0, start),
				   BTREE_ITER_intent|
				   BTREE_ITER_prefetch|
				   BTREE_ITER_all_snapshots, k, ({
			if (!bkey_is_inode(k.k))
				continue;

			/* Should never fail, checked by bch2_inode_invalid: */
			struct bch_inode_unpacked u;
			_ret3 = bch2_inode_unpack(k, &u);
			if (_ret3)
				break;

			/*
			 * Backpointer and directory structure checks are sufficient for
			 * directories, since they can't have hardlinks:
			 */
			if (S_ISDIR(u.bi_mode))
				continue;

			/*
			 * Previous passes ensured that bi_nlink is nonzero if
			 * it had multiple hardlinks:
			 */
			if (!u.bi_nlink)
				continue;

			ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
			if (ret) {
				/* table full: remember resume point, not an error */
				*end = k.k->p.offset;
				ret = 0;
				break;
			}
			0;
		})));

	bch_err_fn(c, ret);
	return ret;
}

/*
 * Pass 2 of nlink checking: walk all dirents and count the references to
 * each inode collected in @links (directories and subvols excluded).
 */
noinline_for_stack
static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
				     u64 range_start, u64 range_end)
{
	struct snapshots_seen s;

	snapshots_seen_init(&s);

	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
				   BTREE_ITER_intent|
				   BTREE_ITER_prefetch|
				   BTREE_ITER_all_snapshots, k, ({
			ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
			if (ret)
				break;

			if (k.k->type == KEY_TYPE_dirent) {
				struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);

				if (d.v->d_type != DT_DIR &&
				    d.v->d_type != DT_SUBVOL)
					inc_link(c, &s, links, range_start, range_end,
						 le64_to_cpu(d.v->d_inum), d.k->p.snapshot);
			}
			0;
		})));

	snapshots_seen_exit(&s);

	bch_err_fn(c, ret);
	return ret;
}

/*
 * Pass 3 of nlink checking: compare each inode's stored nlink against the
 * count accumulated in @links, rewriting the inode on mismatch.
 * Returns 1 (stop iteration) once past @range_end.
 */
static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
				     struct bkey_s_c k,
				     struct nlink_table *links,
				     size_t *idx, u64 range_end)
{
	struct bch_inode_unpacked u;
	struct nlink *link = &links->d[*idx];
	int ret = 0;

	if (k.k->p.offset >= range_end)
		return 1;

	if (!bkey_is_inode(k.k))
		return 0;

	ret = bch2_inode_unpack(k, &u);
	if (ret)
		return ret;

	/* same filters as check_nlinks_find_hardlinks: */
	if (S_ISDIR(u.bi_mode))
		return 0;

	if (!u.bi_nlink)
		return 0;

	/* advance the cursor to this inode's table entry: */
	while ((cmp_int(link->inum, k.k->p.offset) ?:
		cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
		BUG_ON(*idx == links->nr);
		link = &links->d[++*idx];
	}

	if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
			trans, inode_wrong_nlink,
			"inode %llu type %s has wrong i_nlink (%u, should be %u)",
			u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
			bch2_inode_nlink_get(&u), link->count)) {
		bch2_inode_nlink_set(&u, link->count);
		ret = __bch2_fsck_write_inode(trans, &u);
	}
fsck_err:
	return ret;
}

/* Run check_nlinks_update_inode() over [range_start, range_end). */
noinline_for_stack
static int check_nlinks_update_hardlinks(struct bch_fs *c,
			       struct nlink_table *links,
			       u64 range_start, u64 range_end)
{
	size_t idx = 0;

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
				POS(0, range_start),
				BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
	if (ret < 0) {
		bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
		return ret;
	}

	return 0;
}

/*
 * Top-level nlink check: runs the three passes above, iterating in ranges
 * when the whole table doesn't fit in memory at once.
 */
int bch2_check_nlinks(struct bch_fs *c)
{
	struct nlink_table links = { 0 };
	u64 this_iter_range_start, next_iter_range_start = 0;
	int ret = 0;

	do {
this_iter_range_start = next_iter_range_start; 3077 next_iter_range_start = U64_MAX; 3078 3079 ret = check_nlinks_find_hardlinks(c, &links, 3080 this_iter_range_start, 3081 &next_iter_range_start); 3082 3083 ret = check_nlinks_walk_dirents(c, &links, 3084 this_iter_range_start, 3085 next_iter_range_start); 3086 if (ret) 3087 break; 3088 3089 ret = check_nlinks_update_hardlinks(c, &links, 3090 this_iter_range_start, 3091 next_iter_range_start); 3092 if (ret) 3093 break; 3094 3095 links.nr = 0; 3096 } while (next_iter_range_start != U64_MAX); 3097 3098 kvfree(links.d); 3099 bch_err_fn(c, ret); 3100 return ret; 3101 } 3102 3103 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, 3104 struct bkey_s_c k) 3105 { 3106 struct bkey_s_c_reflink_p p; 3107 struct bkey_i_reflink_p *u; 3108 3109 if (k.k->type != KEY_TYPE_reflink_p) 3110 return 0; 3111 3112 p = bkey_s_c_to_reflink_p(k); 3113 3114 if (!p.v->front_pad && !p.v->back_pad) 3115 return 0; 3116 3117 u = bch2_trans_kmalloc(trans, sizeof(*u)); 3118 int ret = PTR_ERR_OR_ZERO(u); 3119 if (ret) 3120 return ret; 3121 3122 bkey_reassemble(&u->k_i, k); 3123 u->v.front_pad = 0; 3124 u->v.back_pad = 0; 3125 3126 return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun); 3127 } 3128 3129 int bch2_fix_reflink_p(struct bch_fs *c) 3130 { 3131 if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) 3132 return 0; 3133 3134 int ret = bch2_trans_run(c, 3135 for_each_btree_key_commit(trans, iter, 3136 BTREE_ID_extents, POS_MIN, 3137 BTREE_ITER_intent|BTREE_ITER_prefetch| 3138 BTREE_ITER_all_snapshots, k, 3139 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 3140 fix_reflink_p_key(trans, &iter, k))); 3141 bch_err_fn(c, ret); 3142 return ret; 3143 } 3144 3145 #ifndef NO_BCACHEFS_CHARDEV 3146 3147 struct fsck_thread { 3148 struct thread_with_stdio thr; 3149 struct bch_fs *c; 3150 struct bch_opts opts; 3151 }; 3152 3153 static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr) 3154 { 3155 struct 
	       fsck_thread *thr = container_of(_thr, struct fsck_thread, thr);
	kfree(thr);
}

/*
 * Worker for offline fsck: start the fs (recovery/fsck runs in
 * bch2_fs_start()), report results over the thread's stdio, then stop it.
 * Low return bits encode fsck-style status (1 = errors fixed, 4 = errors
 * remain).
 */
static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio)
{
	struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
	struct bch_fs *c = thr->c;

	int ret = PTR_ERR_OR_ZERO(c);
	if (ret)
		return ret;

	ret = bch2_fs_start(thr->c);
	if (ret)
		goto err;

	if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
		bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name);
		ret |= 1;
	}
	if (test_bit(BCH_FS_error, &c->flags)) {
		bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name);
		ret |= 4;
	}
err:
	bch2_fs_stop(c);
	return ret;
}

static const struct thread_with_stdio_ops bch2_offline_fsck_ops = {
	.exit		= bch2_fsck_thread_exit,
	.fn		= bch2_fsck_offline_thread_fn,
};

/*
 * BCH_IOCTL_FSCK_OFFLINE: copy device paths and mount options from
 * userspace, open the filesystem read-only without starting it, and punt
 * the actual fsck to a kthread talking to userspace over stdio.
 */
long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
{
	struct bch_ioctl_fsck_offline arg;
	struct fsck_thread *thr = NULL;
	darray_const_str devs = {};
	long ret = 0;

	if (copy_from_user(&arg, user_arg, sizeof(arg)))
		return -EFAULT;

	if (arg.flags)
		return -EINVAL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* copy each userspace device-path pointer, then the string itself: */
	for (size_t i = 0; i < arg.nr_devs; i++) {
		u64 dev_u64;
		ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64));
		if (ret)
			goto err;

		char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX);
		ret = PTR_ERR_OR_ZERO(dev_str);
		if (ret)
			goto err;

		ret = darray_push(&devs, dev_str);
		if (ret) {
			kfree(dev_str);
			goto err;
		}
	}

	thr = kzalloc(sizeof(*thr), GFP_KERNEL);
	if (!thr) {
		ret = -ENOMEM;
		goto err;
	}

	thr->opts = bch2_opts_empty();

	if (arg.opts) {
		char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
		ret =   PTR_ERR_OR_ZERO(optstr) ?:
			bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr, false);
		if (!IS_ERR(optstr))
			kfree(optstr);

		if (ret)
			goto err;
	}

	opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
	opt_set(thr->opts, read_only, 1);
	opt_set(thr->opts, ratelimit_errors, 0);

	/* We need request_key() to be called before we punt to kthread: */
	opt_set(thr->opts, nostart, true);

	bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops);

	thr->c = bch2_fs_open(&devs, &thr->opts);

	/* never panic from a userspace-driven fsck; degrade to read-only: */
	if (!IS_ERR(thr->c) &&
	    thr->c->opts.errors == BCH_ON_ERROR_panic)
		thr->c->opts.errors = BCH_ON_ERROR_ro;

	ret = __bch2_run_thread_with_stdio(&thr->thr);
out:
	darray_for_each(devs, i)
		kfree(*i);
	darray_exit(&devs);
	return ret;
err:
	if (thr)
		bch2_fsck_thread_exit(&thr->thr);
	pr_err("ret %s", bch2_err_str(ret));
	goto out;
}

/*
 * Worker for online fsck: redirect fs message output to this thread's
 * stdio, run all online recovery passes, then restore state and drop the
 * refs taken by bch2_ioctl_fsck_online().
 */
static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio)
{
	struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
	struct bch_fs *c = thr->c;

	c->stdio_filter = current;
	c->stdio = &thr->thr.stdio;

	/*
	 * XXX: can we figure out a way to do this without mucking with c->opts?
	 */
	unsigned old_fix_errors = c->opts.fix_errors;
	if (opt_defined(thr->opts, fix_errors))
		c->opts.fix_errors = thr->opts.fix_errors;
	else
		c->opts.fix_errors = FSCK_FIX_ask;

	c->opts.fsck = true;
	set_bit(BCH_FS_in_fsck, &c->flags);

	int ret = bch2_run_online_recovery_passes(c, ~0ULL);

	clear_bit(BCH_FS_in_fsck, &c->flags);
	bch_err_fn(c, ret);

	/* restore redirected output and original fix_errors setting: */
	c->stdio = NULL;
	c->stdio_filter = NULL;
	c->opts.fix_errors = old_fix_errors;

	/* taken in bch2_ioctl_fsck_online(): */
	up(&c->recovery.run_lock);
	bch2_ro_ref_put(c);
	return ret;
}

static const struct thread_with_stdio_ops bch2_online_fsck_ops = {
	.exit		= bch2_fsck_thread_exit,
	.fn		= bch2_fsck_online_thread_fn,
};

/*
 * BCH_IOCTL_FSCK_ONLINE: run fsck on a mounted filesystem. Takes a ro ref
 * and the recovery run_lock (released by the worker on success, or here on
 * failure), parses user-supplied mount options, and spawns the fsck thread.
 */
long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg)
{
	struct fsck_thread *thr = NULL;
	long ret = 0;

	if (arg.flags)
		return -EINVAL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (!bch2_ro_ref_tryget(c))
		return -EROFS;

	/* one fsck/recovery at a time: */
	if (down_trylock(&c->recovery.run_lock)) {
		bch2_ro_ref_put(c);
		return -EAGAIN;
	}

	thr = kzalloc(sizeof(*thr), GFP_KERNEL);
	if (!thr) {
		ret = -ENOMEM;
		goto err;
	}

	thr->c = c;
	thr->opts = bch2_opts_empty();

	if (arg.opts) {
		char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);

		ret =   PTR_ERR_OR_ZERO(optstr) ?:
			bch2_parse_mount_opts(c, &thr->opts, NULL, optstr, false);
		if (!IS_ERR(optstr))
			kfree(optstr);

		if (ret)
			goto err;
	}

	ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops);
err:
	if (ret < 0) {
		bch_err_fn(c, ret);
		if (thr)
			bch2_fsck_thread_exit(&thr->thr);
		up(&c->recovery.run_lock);
		bch2_ro_ref_put(c);
	}
	return ret;
}

#endif /* NO_BCACHEFS_CHARDEV */