// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bcachefs_ioctl.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_update.h"
#include "buckets.h"
#include "darray.h"
#include "dirent.h"
#include "error.h"
#include "fs.h"
#include "fsck.h"
#include "inode.h"
#include "keylist.h"
#include "namei.h"
#include "recovery_passes.h"
#include "snapshot.h"
#include "super.h"
#include "thread_with_file.h"
#include "xattr.h"

#include <linux/bsearch.h>
#include <linux/dcache.h> /* struct qstr */

/*
 * Does dirent @d point back at @inode? For DT_SUBVOL dirents the target is
 * the child subvolume id, otherwise it's the inode number. Returns 0 on
 * match, -BCH_ERR_ENOENT_dirent_doesnt_match_inode otherwise; no logging.
 */
static int dirent_points_to_inode_nowarn(struct bch_fs *c,
					 struct bkey_s_c_dirent d,
					 struct bch_inode_unpacked *inode)
{
	if (d.v->d_type == DT_SUBVOL
	    ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
	    : le64_to_cpu(d.v->d_inum) == inode->bi_inum)
		return 0;
	return bch_err_throw(c, ENOENT_dirent_doesnt_match_inode);
}

/* Format a human-readable dirent <-> inode mismatch report into @out. */
static void dirent_inode_mismatch_msg(struct printbuf *out,
				      struct bch_fs *c,
				      struct bkey_s_c_dirent dirent,
				      struct bch_inode_unpacked *inode)
{
	prt_str(out, "inode points to dirent that does not point back:");
	prt_newline(out);
	bch2_bkey_val_to_text(out, c, dirent.s_c);
	prt_newline(out);
	bch2_inode_unpacked_to_text(out, inode);
}

/*
 * As dirent_points_to_inode_nowarn(), but logs a warning with both keys
 * printed when they don't match.
 */
static int dirent_points_to_inode(struct bch_fs *c,
				  struct bkey_s_c_dirent dirent,
				  struct bch_inode_unpacked *inode)
{
	int ret = dirent_points_to_inode_nowarn(c, dirent, inode);
	if (ret) {
		struct printbuf buf = PRINTBUF;
		dirent_inode_mismatch_msg(&buf, c, dirent, inode);
		bch_warn(c, "%s", buf.buf);
		printbuf_exit(&buf);
	}
	return ret;
}

/*
 * Sum the sectors of all allocated extents of @inum in @snapshot.
 *
 * XXX: this is handling transaction restarts without returning
 * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
 */
static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
				    u32 snapshot)
{
	u64 sectors = 0;

	int ret =
for_each_btree_key_max(trans, iter, BTREE_ID_extents,
				SPOS(inum, 0, snapshot),
				POS(inum, U64_MAX),
				0, k, ({
			if (bkey_extent_is_allocation(k.k))
				sectors += k.k->size;
			0;
		}));

	/* on success, return the accumulated sector count */
	return ret ?: sectors;
}

/* Count DT_DIR dirents of directory @inum in @snapshot (for bi_nlink checks). */
static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
			      u32 snapshot)
{
	u64 subdirs = 0;

	int ret = for_each_btree_key_max(trans, iter, BTREE_ID_dirents,
				SPOS(inum, 0, snapshot),
				POS(inum, U64_MAX),
				0, k, ({
			if (k.k->type == KEY_TYPE_dirent &&
			    bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
				subdirs++;
			0;
		}));

	return ret ?: subdirs;
}

/*
 * Look up @subvol and return its snapshot id and root inode number.
 * Note: the outputs are written before ret is checked - the caller must
 * check the return value before trusting them.
 */
static int subvol_lookup(struct btree_trans *trans, u32 subvol,
			 u32 *snapshot, u64 *inum)
{
	struct bch_subvolume s;
	int ret = bch2_subvolume_get(trans, subvol, false, &s);

	*snapshot = le32_to_cpu(s.snapshot);
	*inum = le64_to_cpu(s.inode);
	return ret;
}

/*
 * Hash-lookup dirent @name in directory @dir at @snapshot; on success
 * returns the target inode number and d_type through @target/@type.
 */
static int lookup_dirent_in_snapshot(struct btree_trans *trans,
			   struct bch_hash_info hash_info,
			   subvol_inum dir, struct qstr *name,
			   u64 *target, unsigned *type, u32 snapshot)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
							 &hash_info, dir, name, 0, snapshot);
	int ret = bkey_err(k);
	if (ret)
		return ret;

	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
	*target = le64_to_cpu(d.v->d_inum);
	*type = d.v->d_type;
	bch2_trans_iter_exit(trans, &iter);
	return 0;
}

/*
 * Find any subvolume associated with a tree of snapshots
 * We can't rely on master_subvol - it might have been deleted.
135 */ 136 static int find_snapshot_tree_subvol(struct btree_trans *trans, 137 u32 tree_id, u32 *subvol) 138 { 139 struct btree_iter iter; 140 struct bkey_s_c k; 141 int ret; 142 143 for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) { 144 if (k.k->type != KEY_TYPE_snapshot) 145 continue; 146 147 struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); 148 if (le32_to_cpu(s.v->tree) != tree_id) 149 continue; 150 151 if (s.v->subvol) { 152 *subvol = le32_to_cpu(s.v->subvol); 153 goto found; 154 } 155 } 156 ret = bch_err_throw(trans->c, ENOENT_no_snapshot_tree_subvol); 157 found: 158 bch2_trans_iter_exit(trans, &iter); 159 return ret; 160 } 161 162 /* Get lost+found, create if it doesn't exist: */ 163 static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, 164 struct bch_inode_unpacked *lostfound, 165 u64 reattaching_inum) 166 { 167 struct bch_fs *c = trans->c; 168 struct qstr lostfound_str = QSTR("lost+found"); 169 struct btree_iter lostfound_iter = {}; 170 u64 inum = 0; 171 unsigned d_type = 0; 172 int ret; 173 174 struct bch_snapshot_tree st; 175 ret = bch2_snapshot_tree_lookup(trans, 176 bch2_snapshot_tree(c, snapshot), &st); 177 if (ret) 178 return ret; 179 180 u32 subvolid; 181 ret = find_snapshot_tree_subvol(trans, 182 bch2_snapshot_tree(c, snapshot), &subvolid); 183 bch_err_msg(c, ret, "finding subvol associated with snapshot tree %u", 184 bch2_snapshot_tree(c, snapshot)); 185 if (ret) 186 return ret; 187 188 struct bch_subvolume subvol; 189 ret = bch2_subvolume_get(trans, subvolid, false, &subvol); 190 bch_err_msg(c, ret, "looking up subvol %u for snapshot %u", subvolid, snapshot); 191 if (ret) 192 return ret; 193 194 if (!subvol.inode) { 195 struct btree_iter iter; 196 struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter, 197 BTREE_ID_subvolumes, POS(0, subvolid), 198 0, subvolume); 199 ret = PTR_ERR_OR_ZERO(subvol); 200 if (ret) 201 return ret; 202 203 subvol->v.inode = 
cpu_to_le64(reattaching_inum);
		bch2_trans_iter_exit(trans, &iter);
	}

	/*
	 * NOTE(review): the inner bkey_i_subvolume pointer above shadows the
	 * outer struct bch_subvolume, so subvol.inode here is still 0 when we
	 * took the !subvol.inode path - confirm root_inum.inum == 0 is
	 * intended in that case.
	 */
	subvol_inum root_inum = {
		.subvol = subvolid,
		.inum = le64_to_cpu(subvol.inode)
	};

	struct bch_inode_unpacked root_inode;
	struct bch_hash_info root_hash_info;
	ret = bch2_inode_find_by_inum_snapshot(trans, root_inum.inum, snapshot, &root_inode, 0);
	bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
		    root_inum.inum, subvolid);
	if (ret)
		return ret;

	root_hash_info = bch2_hash_info_init(c, &root_inode);

	/* does lost+found already exist in this subvolume's root directory? */
	ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
					&lostfound_str, &inum, &d_type, snapshot);
	if (bch2_err_matches(ret, ENOENT))
		goto create_lostfound;

	bch_err_fn(c, ret);
	if (ret)
		return ret;

	if (d_type != DT_DIR) {
		bch_err(c, "error looking up lost+found: not a directory");
		return bch_err_throw(c, ENOENT_not_directory);
	}

	/*
	 * The bch2_check_dirents pass has already run, dangling dirents
	 * shouldn't exist here:
	 */
	ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, lostfound, 0);
	bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
		    inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
	return ret;

create_lostfound:
	/*
	 * we always create lost+found in the root snapshot; we don't want
	 * different branches of the snapshot tree to have different lost+found
	 */
	snapshot = le32_to_cpu(st.root_snapshot);
	/*
	 * XXX: we could have a nicer log message here if we had a nice way to
	 * walk backpointers to print a path
	 */
	struct printbuf path = PRINTBUF;
	ret = bch2_inum_to_path(trans, root_inum, &path);
	if (ret)
		goto err;

	bch_notice(c, "creating %s/lost+found in subvol %llu snapshot %u",
		   path.buf, root_inum.subvol, snapshot);
	printbuf_exit(&path);

	u64 now = bch2_current_time(c);
	u64 cpu =
raw_smp_processor_id();

	/* create the new lost+found inode and the dirent pointing at it */
	bch2_inode_init_early(c, lostfound);
	bch2_inode_init_late(c, lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
	lostfound->bi_dir = root_inode.bi_inum;
	lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot);

	root_inode.bi_nlink++;

	ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
	if (ret)
		goto err;

	bch2_btree_iter_set_snapshot(trans, &lostfound_iter, snapshot);
	ret = bch2_btree_iter_traverse(trans, &lostfound_iter);
	if (ret)
		goto err;

	ret = bch2_dirent_create_snapshot(trans,
			0, root_inode.bi_inum, snapshot, &root_hash_info,
			mode_to_type(lostfound->bi_mode),
			&lostfound_str,
			lostfound->bi_inum,
			&lostfound->bi_dir_offset,
			BTREE_UPDATE_internal_snapshot_node|
			STR_HASH_must_create) ?:
		bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
				       BTREE_UPDATE_internal_snapshot_node);
err:
	bch_err_msg(c, ret, "creating lost+found");
	bch2_trans_iter_exit(trans, &lostfound_iter);
	return ret;
}

/*
 * Should this disconnected inode be reattached into lost+found?
 * The root inode of the root subvolume never is.
 */
static inline bool inode_should_reattach(struct bch_inode_unpacked *inode)
{
	if (inode->bi_inum == BCACHEFS_ROOT_INO &&
	    inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)
		return false;

	/*
	 * Subvolume roots are special: older versions of subvolume roots may be
	 * disconnected, it's only the newest version that matters.
	 *
	 * We only keep a single dirent pointing to a subvolume root, i.e.
	 * older versions of snapshots will not have a different dirent pointing
	 * to the same subvolume root.
	 *
	 * This is because dirents that point to subvolumes are only visible in
	 * the parent subvolume - versioning is not needed - and keeping them
	 * around would break fsck, because when we're crossing subvolumes we
	 * don't have a consistent snapshot ID to check the inode <-> dirent
	 * relationships.
	 *
	 * Thus, a subvolume root that's been renamed after a snapshot will have
	 * a disconnected older version - that's expected.
	 *
	 * Note that taking a snapshot always updates the root inode (to update
	 * the dirent backpointer), so a subvolume root inode with
	 * BCH_INODE_has_child_snapshot is never visible.
	 */
	if (inode->bi_subvol &&
	    (inode->bi_flags & BCH_INODE_has_child_snapshot))
		return false;

	return !bch2_inode_has_backpointer(inode) &&
		!(inode->bi_flags & BCH_INODE_unlinked);
}

/*
 * Whiteout the dirent at @d_pos in @snapshot, but only if the key found at
 * that position actually lives at @d_pos (i.e. not in an ancestor snapshot).
 */
static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents,
					       SPOS(d_pos.inode, d_pos.offset, snapshot),
					       BTREE_ITER_intent|
					       BTREE_ITER_with_updates);
	int ret = bkey_err(k);
	if (ret)
		return ret;

	if (bpos_eq(k.k->p, d_pos)) {
		/*
		 * delete_at() doesn't work because the update path doesn't
		 * internally use BTREE_ITER_with_updates yet
		 */
		struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
		ret = PTR_ERR_OR_ZERO(k);
		if (ret)
			goto err;

		bkey_init(&k->k);
		k->k.type = KEY_TYPE_whiteout;
		k->k.p = iter.pos;
		ret = bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node);
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Reattach a disconnected @inode under lost+found, creating a new dirent
 * ("subvol-%u" for subvolume roots, the inode number otherwise) and fixing
 * up the inode's backpointer fields and child-snapshot versions.
 */
static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
{
	struct bch_fs *c = trans->c;
	struct bch_inode_unpacked lostfound;
	char name_buf[20];
	int ret;

	u32 dirent_snapshot = inode->bi_snapshot;
	if (inode->bi_subvol) {
		/* disconnected subvolume root: reparent the subvolume to the root subvol */
		inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;

		struct btree_iter subvol_iter;
		struct bkey_i_subvolume *subvol =
			bch2_bkey_get_mut_typed(trans, &subvol_iter,
						BTREE_ID_subvolumes, POS(0, inode->bi_subvol),
						0, subvolume);
		ret =
PTR_ERR_OR_ZERO(subvol);
		if (ret)
			return ret;

		subvol->v.fs_path_parent = BCACHEFS_ROOT_SUBVOL;
		bch2_trans_iter_exit(trans, &subvol_iter);

		/* the dirent will live in the parent subvolume's snapshot */
		u64 root_inum;
		ret = subvol_lookup(trans, inode->bi_parent_subvol,
				    &dirent_snapshot, &root_inum);
		if (ret)
			return ret;

		snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
	} else {
		snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
	}

	ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
	if (ret)
		return ret;

	bch_verbose(c, "got lostfound inum %llu", lostfound.bi_inum);

	/* directories bump the parent's link count */
	lostfound.bi_nlink += S_ISDIR(inode->bi_mode);

	/* ensure lost+found inode is also present in inode snapshot */
	if (!inode->bi_subvol) {
		BUG_ON(!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, lostfound.bi_snapshot));
		lostfound.bi_snapshot = inode->bi_snapshot;
	}

	ret = __bch2_fsck_write_inode(trans, &lostfound);
	if (ret)
		return ret;

	struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound);
	struct qstr name = QSTR(name_buf);

	inode->bi_dir = lostfound.bi_inum;

	ret = bch2_dirent_create_snapshot(trans,
			inode->bi_parent_subvol, lostfound.bi_inum,
			dirent_snapshot,
			&dir_hash,
			inode_d_type(inode),
			&name,
			inode->bi_subvol ?: inode->bi_inum,
			&inode->bi_dir_offset,
			BTREE_UPDATE_internal_snapshot_node|
			STR_HASH_must_create);
	if (ret) {
		bch_err_msg(c, ret, "error creating dirent");
		return ret;
	}

	ret = __bch2_fsck_write_inode(trans, inode);
	if (ret)
		return ret;

	{
		CLASS(printbuf, buf)();
		ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum,
						 inode->bi_snapshot, NULL, &buf);
		if (ret)
			return ret;

		bch_info(c, "reattached at %s", buf.buf);
	}

	/*
	 * Fix up inodes in child snapshots: if they should also be reattached
	 * update the backpointer
field, if they should not be we need to emit
	 * whiteouts for the dirent we just created.
	 */
	if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) {
		snapshot_id_list whiteouts_done;
		struct btree_iter iter;
		struct bkey_s_c k;

		darray_init(&whiteouts_done);

		/* walk older versions of this inode, newest snapshot first */
		for_each_btree_key_reverse_norestart(trans, iter,
				BTREE_ID_inodes, SPOS(0, inode->bi_inum, inode->bi_snapshot - 1),
				BTREE_ITER_all_snapshots|BTREE_ITER_intent, k, ret) {
			if (k.k->p.offset != inode->bi_inum)
				break;

			if (!bkey_is_inode(k.k) ||
			    !bch2_snapshot_is_ancestor(c, k.k->p.snapshot, inode->bi_snapshot) ||
			    snapshot_list_has_ancestor(c, &whiteouts_done, k.k->p.snapshot))
				continue;

			struct bch_inode_unpacked child_inode;
			ret = bch2_inode_unpack(k, &child_inode);
			if (ret)
				break;

			if (!inode_should_reattach(&child_inode)) {
				/* hide the new dirent from this child snapshot */
				ret = maybe_delete_dirent(trans,
							  SPOS(lostfound.bi_inum, inode->bi_dir_offset,
							       dirent_snapshot),
							  k.k->p.snapshot);
				if (ret)
					break;

				ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot);
				if (ret)
					break;
			} else {
				/* propagate the new backpointer to the child version */
				iter.snapshot = k.k->p.snapshot;
				child_inode.bi_dir = inode->bi_dir;
				child_inode.bi_dir_offset = inode->bi_dir_offset;

				ret = bch2_inode_write_flags(trans, &iter, &child_inode,
							     BTREE_UPDATE_internal_snapshot_node);
				if (ret)
					break;
			}
		}
		darray_exit(&whiteouts_done);
		bch2_trans_iter_exit(trans, &iter);
	}

	return ret;
}

/* Fetch the dirent at @pos, initializing @iter pointing at it. */
static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
						struct btree_iter *iter,
						struct bpos pos)
{
	return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
}

/*
 * Delete the dirent that @inode's backpointer fields (bi_dir/bi_dir_offset)
 * point at, after verifying it really points back at this inode.
 */
static int remove_backpointer(struct btree_trans *trans,
			      struct bch_inode_unpacked *inode)
{
	if (!bch2_inode_has_backpointer(inode))
		return 0;

	u32 snapshot = inode->bi_snapshot;

	if
 (inode->bi_parent_subvol) {
		/* dirent lives in the parent subvolume's snapshot */
		int ret = bch2_subvolume_get_snapshot(trans, inode->bi_parent_subvol, &snapshot);
		if (ret)
			return ret;
	}

	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c_dirent d = dirent_get_by_pos(trans, &iter,
				     SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot));
	int ret = bkey_err(d) ?:
		dirent_points_to_inode(c, d, inode) ?:
		bch2_fsck_remove_dirent(trans, d.k->p);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Reattach a disconnected subvolume: remove any stale dirent pointing at its
 * root inode, then reattach the root inode under lost+found.
 */
static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s)
{
	struct bch_fs *c = trans->c;

	struct bch_inode_unpacked inode;
	int ret = bch2_inode_find_by_inum_trans(trans,
				(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
				&inode);
	if (ret)
		return ret;

	ret = remove_backpointer(trans, &inode);
	if (!bch2_err_matches(ret, ENOENT))
		bch_err_msg(c, ret, "removing dirent");
	if (ret)
		return ret;

	ret = reattach_inode(trans, &inode);
	bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
	return ret;
}

/*
 * Recreate a missing subvolume keyed by @subvolid on snapshot @snapshotid;
 * @inum is its root inode, or 0 if the root inode must be created too.
 */
static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
{
	struct bch_fs *c = trans->c;

	if (!bch2_snapshot_is_leaf(c, snapshotid)) {
		bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
		return bch_err_throw(c, fsck_repair_unimplemented);
	}

	/*
	 * If inum isn't set, that means we're being called from check_dirents,
	 * not check_inodes - the root of this subvolume doesn't exist or we
	 * would have found it there:
	 */
	if (!inum) {
		struct btree_iter inode_iter = {};
		struct bch_inode_unpacked new_inode;
		u64 cpu = raw_smp_processor_id();

		bch2_inode_init_early(c, &new_inode);
		bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);

		new_inode.bi_subvol = subvolid;

		int ret
 = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
			  bch2_btree_iter_traverse(trans, &inode_iter) ?:
			  bch2_inode_write(trans, &inode_iter, &new_inode);
		bch2_trans_iter_exit(trans, &inode_iter);
		if (ret)
			return ret;

		inum = new_inode.bi_inum;
	}

	bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);

	struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
	int ret = PTR_ERR_OR_ZERO(new_subvol);
	if (ret)
		return ret;

	bkey_subvolume_init(&new_subvol->k_i);
	new_subvol->k.p.offset	= subvolid;
	new_subvol->v.snapshot	= cpu_to_le32(snapshotid);
	new_subvol->v.inode	= cpu_to_le64(inum);
	ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
	if (ret)
		return ret;

	/* point the snapshot back at the new subvolume */
	struct btree_iter iter;
	struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_snapshots, POS(0, snapshotid),
			0, snapshot);
	ret = PTR_ERR_OR_ZERO(s);
	bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
	if (ret)
		return ret;

	u32 snapshot_tree = le32_to_cpu(s->v.tree);

	s->v.subvol = cpu_to_le32(subvolid);
	SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
	bch2_trans_iter_exit(trans, &iter);

	/* give the snapshot tree a master subvol if it lost one */
	struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
			0, snapshot_tree);
	ret = PTR_ERR_OR_ZERO(st);
	bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
	if (ret)
		return ret;

	if (!st->v.master_subvol)
		st->v.master_subvol = cpu_to_le32(subvolid);

	bch2_trans_iter_exit(trans, &iter);
	return 0;
}

/*
 * Recreate a missing inode @inum:@snapshot whose keys were found in @btree:
 * dirents imply a directory, extents imply a regular file whose size is
 * taken from the last extent.
 */
static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum)
{
	struct bch_fs *c = trans->c;
	unsigned i_mode = S_IFREG;
	u64 i_size = 0;

	switch (btree) {
	case BTREE_ID_extents: {
		struct
 btree_iter iter = {};

		/* derive i_size from the highest-offset extent */
		bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
		struct bkey_s_c k = bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum, 0));
		bch2_trans_iter_exit(trans, &iter);
		int ret = bkey_err(k);
		if (ret)
			return ret;

		i_size = k.k->p.offset << 9;
		break;
	}
	case BTREE_ID_dirents:
		i_mode = S_IFDIR;
		break;
	case BTREE_ID_xattrs:
		break;
	default:
		BUG();
	}

	struct bch_inode_unpacked new_inode;
	bch2_inode_init_early(c, &new_inode);
	bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL);
	new_inode.bi_size = i_size;
	new_inode.bi_inum = inum;
	new_inode.bi_snapshot = snapshot;

	return __bch2_fsck_write_inode(trans, &new_inode);
}

static inline void snapshots_seen_exit(struct snapshots_seen *s)
{
	darray_exit(&s->ids);
}

static inline void snapshots_seen_init(struct snapshots_seen *s)
{
	memset(s, 0, sizeof(*s));
}

/* Insert @id into the sorted seen-list, keeping it duplicate-free. */
static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
{
	u32 *i;
	__darray_for_each(s->ids, i) {
		if (*i == id)
			return 0;
		if (*i > id)
			break;
	}

	int ret = darray_insert_item(&s->ids, i - s->ids.data, id);
	if (ret)
		bch_err(c, "error reallocating snapshots_seen table (size %zu)",
			s->ids.size);
	return ret;
}

/*
 * Record that we saw a key at @pos; the seen-list is reset whenever the
 * position (ignoring snapshot) changes.
 */
static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
				 enum btree_id btree_id, struct bpos pos)
{
	if (!bkey_eq(s->pos, pos))
		s->ids.nr = 0;
	s->pos = pos;

	return snapshot_list_add_nodup(c, &s->ids, pos.snapshot);
}

/**
 * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
 * and @ancestor hasn't been overwritten in @seen
 *
 * @c:		filesystem handle
 * @seen:	list of snapshot ids already seen at current position
 * @id:		descendent
 snapshot id
 * @ancestor:	ancestor snapshot id
 *
 * Returns:	whether key in @ancestor snapshot is visible in @id snapshot
 */
static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
				    u32 id, u32 ancestor)
{
	EBUG_ON(id > ancestor);

	if (id == ancestor)
		return true;

	if (!bch2_snapshot_is_ancestor(c, id, ancestor))
		return false;

	/*
	 * We know that @id is a descendant of @ancestor, we're checking if
	 * we've seen a key that overwrote @ancestor - i.e. also a descendent of
	 * @ancestor and with @id as a descendent.
	 *
	 * But we already know that we're scanning IDs between @id and @ancestor
	 * numerically, since snapshot ID lists are kept sorted, so if we find
	 * an id that's an ancestor of @id we're done:
	 */
	darray_for_each_reverse(seen->ids, i)
		if (*i != ancestor && bch2_snapshot_is_ancestor(c, id, *i))
			return false;

	return true;
}

/**
 * ref_visible - given a key with snapshot id @src that points to a key with
 * snapshot id @dst, test whether there is some snapshot in which @dst is
 * visible.
 *
 * @c:		filesystem handle
 * @s:		list of snapshot IDs already seen at @src
 * @src:	snapshot ID of src key
 * @dst:	snapshot ID of dst key
 * Returns:	true if there is some snapshot in which @dst is visible
 *
 * Assumes we're visiting @src keys in natural key order
 */
static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
			u32 src, u32 dst)
{
	return dst <= src
		?
 key_visible_in_snapshot(c, s, dst, src)
		: bch2_snapshot_is_ancestor(c, src, dst);
}

/* As ref_visible(), but with seen-lists for both endpoints; order-insensitive. */
static int ref_visible2(struct bch_fs *c,
			u32 src, struct snapshots_seen *src_seen,
			u32 dst, struct snapshots_seen *dst_seen)
{
	if (dst > src) {
		swap(dst, src);
		swap(dst_seen, src_seen);
	}
	return key_visible_in_snapshot(c, src_seen, dst, src);
}

/* Iterate walker entries whose inode version is visible in @_snapshot: */
#define for_each_visible_inode(_c, _s, _w, _snapshot, _i)				\
	for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr &&	\
	     (_i)->inode.bi_snapshot <= (_snapshot); _i++)				\
		if (key_visible_in_snapshot(_c, _s, _i->inode.bi_snapshot, _snapshot))

struct inode_walker_entry {
	struct bch_inode_unpacked inode;
	bool	whiteout;	/* set when the key at this snapshot wasn't an inode */
	u64	count;		/* accumulated sector/dirent count for this version */
	u64	i_size;
};

/* Cache of all snapshot versions of the inode at last_pos. */
struct inode_walker {
	bool				first_this_inode;
	bool				have_inodes;
	bool				recalculate_sums;
	struct bpos			last_pos;

	DARRAY(struct inode_walker_entry) inodes;
	snapshot_id_list		deletes;
};

static void inode_walker_exit(struct inode_walker *w)
{
	darray_exit(&w->inodes);
	darray_exit(&w->deletes);
}

static struct inode_walker inode_walker_init(void)
{
	return (struct inode_walker) { 0, };
}

/* Append @inode (or a whiteout placeholder) to the walker's version list. */
static int add_inode(struct bch_fs *c, struct inode_walker *w,
		     struct bkey_s_c inode)
{
	int ret = darray_push(&w->inodes, ((struct inode_walker_entry) {
		.whiteout	= !bkey_is_inode(inode.k),
	}));
	if (ret)
		return ret;

	struct inode_walker_entry *n = &darray_last(w->inodes);
	if (!n->whiteout) {
		return bch2_inode_unpack(inode, &n->inode);
	} else {
		n->inode.bi_inum	= inode.k->p.offset;
		n->inode.bi_snapshot	= inode.k->p.snapshot;
		return 0;
	}
}

/* Load every snapshot version of inode @inum into walker @w. */
static int get_inodes_all_snapshots(struct btree_trans *trans,
				    struct inode_walker *w, u64 inum)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	/*
	 * We
 no longer have inodes for w->last_pos; clear this to avoid
	 * screwing up check_i_sectors/check_subdir_count if we take a
	 * transaction restart here:
	 */
	w->have_inodes = false;
	w->recalculate_sums = false;
	w->inodes.nr = 0;

	for_each_btree_key_max_norestart(trans, iter,
			BTREE_ID_inodes, POS(0, inum), SPOS(0, inum, U32_MAX),
			BTREE_ITER_all_snapshots, k, ret) {
		ret = add_inode(c, w, k);
		if (ret)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		return ret;

	w->first_this_inode = true;
	w->have_inodes = true;
	return 0;
}

/*
 * Collect the versions of inode @inum visible from s->pos.snapshot into
 * w->inodes; snapshots where the inode was deleted go into w->deletes.
 */
static int get_visible_inodes(struct btree_trans *trans,
			      struct inode_walker *w,
			      struct snapshots_seen *s,
			      u64 inum)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	w->inodes.nr = 0;
	w->deletes.nr = 0;

	for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot),
					     BTREE_ITER_all_snapshots, k, ret) {
		if (k.k->p.offset != inum)
			break;

		if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot))
			continue;

		/* already saw a deletion in a descendent snapshot */
		if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot))
			continue;

		ret = bkey_is_inode(k.k)
			?
 add_inode(c, w, k)
			: snapshot_list_add(c, &w->deletes, k.k->p.snapshot);
		if (ret)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}

/*
 * Find the walker entry whose snapshot is @k's snapshot or an ancestor of
 * it. If the inode only exists in an ancestor snapshot, repair by creating
 * a version (or whiteout) at @k's snapshot, then restart the transaction.
 */
static struct inode_walker_entry *
lookup_inode_for_snapshot(struct btree_trans *trans, struct inode_walker *w, struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;

	struct inode_walker_entry *i = darray_find_p(w->inodes, i,
		    bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->inode.bi_snapshot));

	if (!i)
		return NULL;

	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (fsck_err_on(k.k->p.snapshot != i->inode.bi_snapshot,
			trans, snapshot_key_missing_inode_snapshot,
			"have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
			"unexpected because we should always update the inode when we update a key in that inode\n"
			"%s",
			w->last_pos.inode, k.k->p.snapshot, i->inode.bi_snapshot,
			(bch2_bkey_val_to_text(&buf, c, k),
			 buf.buf))) {
		if (!i->whiteout) {
			/* copy the ancestor's inode down to @k's snapshot */
			struct bch_inode_unpacked new = i->inode;
			new.bi_snapshot = k.k->p.snapshot;
			ret = __bch2_fsck_write_inode(trans, &new);
		} else {
			/* ancestor was a whiteout: copy the whiteout down instead */
			struct bkey_i whiteout;
			bkey_init(&whiteout.k);
			whiteout.k.type = KEY_TYPE_whiteout;
			whiteout.k.p = SPOS(0, i->inode.bi_inum, k.k->p.snapshot);
			ret = bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
							  &whiteout,
							  BTREE_UPDATE_internal_snapshot_node);
		}

		if (ret)
			goto fsck_err;

		ret = bch2_trans_commit(trans, NULL, NULL, 0);
		if (ret)
			goto fsck_err;

		/* keep the in-memory walker consistent with what we just wrote */
		struct inode_walker_entry new_entry = *i;

		new_entry.inode.bi_snapshot	= k.k->p.snapshot;
		new_entry.count			= 0;
		new_entry.i_size		= 0;

		/* entries are sorted by snapshot id; find the insert position */
		while (i > w->inodes.data && i[-1].inode.bi_snapshot > k.k->p.snapshot)
			--i;

		size_t pos = i - w->inodes.data;
		ret = darray_insert_item(&w->inodes, pos, new_entry);
		if (ret)
			goto fsck_err;

		ret = bch_err_throw(c,
 transaction_restart_nested);
		goto fsck_err;
	}

	printbuf_exit(&buf);
	return i;
fsck_err:
	printbuf_exit(&buf);
	return ERR_PTR(ret);
}

/*
 * Position walker @w at @k's inode (reloading all snapshot versions if the
 * inode number changed) and return the matching version entry.
 */
static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
					     struct inode_walker *w,
					     struct bkey_s_c k)
{
	if (w->last_pos.inode != k.k->p.inode) {
		int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
		if (ret)
			return ERR_PTR(ret);
	}

	w->last_pos = k.k->p;

	return lookup_inode_for_snapshot(trans, w, k);
}

/*
 * Prefer to delete the first one, since that will be the one at the wrong
 * offset:
 * return value: 0 -> delete k1, 1 -> delete k2
 *
 * NOTE(review): the comment above appears to describe a duplicate-hash-key
 * helper rather than this function - confirm placement.
 *
 * After a dirent @new was (re)written, update the bi_dir_offset backpointer
 * of every visible version of its target inode.
 */
int bch2_fsck_update_backpointers(struct btree_trans *trans,
				  struct snapshots_seen *s,
				  const struct bch_hash_desc desc,
				  struct bch_hash_info *hash_info,
				  struct bkey_i *new)
{
	if (new->k.type != KEY_TYPE_dirent)
		return 0;

	struct bkey_i_dirent *d = bkey_i_to_dirent(new);
	struct inode_walker target = inode_walker_init();
	int ret = 0;

	if (d->v.d_type == DT_SUBVOL) {
		bch_err(trans->c, "%s does not support DT_SUBVOL", __func__);
		ret = -BCH_ERR_fsck_repair_unimplemented;
	} else {
		ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum));
		if (ret)
			goto err;

		darray_for_each(target.inodes, i) {
			i->inode.bi_dir_offset = d->k.p.offset;
			ret = __bch2_fsck_write_inode(trans, &i->inode);
			if (ret)
				goto err;
		}
	}
err:
	inode_walker_exit(&target);
	return ret;
}

/*
 * Fetch the dirent @inode's backpointer fields refer to; for subvolume roots
 * the dirent lives in the parent subvolume's snapshot, resolved via
 * subvol_lookup().
 */
static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
					       struct btree_iter *iter,
					       struct bch_inode_unpacked *inode,
					       u32 *snapshot)
{
	if (inode->bi_subvol) {
		u64 inum;
		int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
		if (ret)
			return ((struct bkey_s_c_dirent) { .k =
 ERR_PTR(ret) });
	}

	return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
}

/*
 * Is inode @p on the deleted-inodes list? Returns 1 if the key type is
 * KEY_TYPE_set, 0 if not, negative error code on lookup failure.
 */
static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
	int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Verify that @inode's backpointer fields refer to a dirent that points back
 * at it; on mismatch, clear the backpointer fields and ask the caller to
 * rewrite the inode via *write_inode.
 */
static int check_inode_dirent_inode(struct btree_trans *trans,
				    struct bch_inode_unpacked *inode,
				    bool *write_inode)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;

	u32 inode_snapshot = inode->bi_snapshot;
	struct btree_iter dirent_iter = {};
	struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
	int ret = bkey_err(d);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if ((ret || dirent_points_to_inode_nowarn(c, d, inode)) &&
	    inode->bi_subvol &&
	    (inode->bi_flags & BCH_INODE_has_child_snapshot)) {
		/* Older version of a renamed subvolume root: we won't have a
		 * correct dirent for it. That's expected, see
		 * inode_should_reattach().
		 *
		 * We don't clear the backpointer field when doing the rename
		 * because there might be arbitrarily many versions in older
		 * snapshots.
		 */
		inode->bi_dir = 0;
		inode->bi_dir_offset = 0;
		*write_inode = true;
		goto out;
	}

	if (fsck_err_on(ret,
			trans, inode_points_to_missing_dirent,
			"inode points to missing dirent\n%s",
			(bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) ||
	    fsck_err_on(!ret && dirent_points_to_inode_nowarn(c, d, inode),
			trans, inode_points_to_wrong_dirent,
			"%s",
			(printbuf_reset(&buf),
			 dirent_inode_mismatch_msg(&buf, c, d, inode),
			 buf.buf))) {
		/*
		 * We just clear the backpointer fields for now.
If we find a 1096 * dirent that points to this inode in check_dirents(), we'll 1097 * update it then; then when we get to check_path() if the 1098 * backpointer is still 0 we'll reattach it. 1099 */ 1100 inode->bi_dir = 0; 1101 inode->bi_dir_offset = 0; 1102 *write_inode = true; 1103 } 1104 out: 1105 ret = 0; 1106 fsck_err: 1107 bch2_trans_iter_exit(trans, &dirent_iter); 1108 printbuf_exit(&buf); 1109 bch_err_fn(c, ret); 1110 return ret; 1111 } 1112 1113 static int check_inode(struct btree_trans *trans, 1114 struct btree_iter *iter, 1115 struct bkey_s_c k, 1116 struct bch_inode_unpacked *snapshot_root, 1117 struct snapshots_seen *s) 1118 { 1119 struct bch_fs *c = trans->c; 1120 struct printbuf buf = PRINTBUF; 1121 struct bch_inode_unpacked u; 1122 bool do_update = false; 1123 int ret; 1124 1125 ret = bch2_check_key_has_snapshot(trans, iter, k); 1126 if (ret < 0) 1127 goto err; 1128 if (ret) 1129 return 0; 1130 1131 ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); 1132 if (ret) 1133 goto err; 1134 1135 if (!bkey_is_inode(k.k)) 1136 return 0; 1137 1138 ret = bch2_inode_unpack(k, &u); 1139 if (ret) 1140 goto err; 1141 1142 if (snapshot_root->bi_inum != u.bi_inum) { 1143 ret = bch2_inode_find_snapshot_root(trans, u.bi_inum, snapshot_root); 1144 if (ret) 1145 goto err; 1146 } 1147 1148 if (u.bi_hash_seed != snapshot_root->bi_hash_seed || 1149 INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root)) { 1150 ret = bch2_repair_inode_hash_info(trans, snapshot_root); 1151 BUG_ON(ret == -BCH_ERR_fsck_repair_unimplemented); 1152 if (ret) 1153 goto err; 1154 } 1155 1156 ret = bch2_check_inode_has_case_insensitive(trans, &u, &s->ids, &do_update); 1157 if (ret) 1158 goto err; 1159 1160 if (bch2_inode_has_backpointer(&u)) { 1161 ret = check_inode_dirent_inode(trans, &u, &do_update); 1162 if (ret) 1163 goto err; 1164 } 1165 1166 if (fsck_err_on(bch2_inode_has_backpointer(&u) && 1167 (u.bi_flags & BCH_INODE_unlinked), 1168 trans, inode_unlinked_but_has_dirent, 1169 "inode 
unlinked but has dirent\n%s", 1170 (printbuf_reset(&buf), 1171 bch2_inode_unpacked_to_text(&buf, &u), 1172 buf.buf))) { 1173 u.bi_flags &= ~BCH_INODE_unlinked; 1174 do_update = true; 1175 } 1176 1177 if (S_ISDIR(u.bi_mode) && (u.bi_flags & BCH_INODE_unlinked)) { 1178 /* Check for this early so that check_unreachable_inode() will reattach it */ 1179 1180 ret = bch2_empty_dir_snapshot(trans, k.k->p.offset, 0, k.k->p.snapshot); 1181 if (ret && ret != -BCH_ERR_ENOTEMPTY_dir_not_empty) 1182 goto err; 1183 1184 fsck_err_on(ret, trans, inode_dir_unlinked_but_not_empty, 1185 "dir unlinked but not empty\n%s", 1186 (printbuf_reset(&buf), 1187 bch2_inode_unpacked_to_text(&buf, &u), 1188 buf.buf)); 1189 u.bi_flags &= ~BCH_INODE_unlinked; 1190 do_update = true; 1191 ret = 0; 1192 } 1193 1194 if (fsck_err_on(S_ISDIR(u.bi_mode) && u.bi_size, 1195 trans, inode_dir_has_nonzero_i_size, 1196 "directory %llu:%u with nonzero i_size %lli", 1197 u.bi_inum, u.bi_snapshot, u.bi_size)) { 1198 u.bi_size = 0; 1199 do_update = true; 1200 } 1201 1202 ret = bch2_inode_has_child_snapshots(trans, k.k->p); 1203 if (ret < 0) 1204 goto err; 1205 1206 if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot), 1207 trans, inode_has_child_snapshots_wrong, 1208 "inode has_child_snapshots flag wrong (should be %u)\n%s", 1209 ret, 1210 (printbuf_reset(&buf), 1211 bch2_inode_unpacked_to_text(&buf, &u), 1212 buf.buf))) { 1213 if (ret) 1214 u.bi_flags |= BCH_INODE_has_child_snapshot; 1215 else 1216 u.bi_flags &= ~BCH_INODE_has_child_snapshot; 1217 do_update = true; 1218 } 1219 ret = 0; 1220 1221 if ((u.bi_flags & BCH_INODE_unlinked) && 1222 !(u.bi_flags & BCH_INODE_has_child_snapshot)) { 1223 if (!test_bit(BCH_FS_started, &c->flags)) { 1224 /* 1225 * If we're not in online fsck, don't delete unlinked 1226 * inodes, just make sure they're on the deleted list. 1227 * 1228 * They might be referred to by a logged operation - 1229 * i.e. 
we might have crashed in the middle of a 1230 * truncate on an unlinked but open file - so we want to 1231 * let the delete_dead_inodes kill it after resuming 1232 * logged ops. 1233 */ 1234 ret = check_inode_deleted_list(trans, k.k->p); 1235 if (ret < 0) 1236 goto err_noprint; 1237 1238 fsck_err_on(!ret, 1239 trans, unlinked_inode_not_on_deleted_list, 1240 "inode %llu:%u unlinked, but not on deleted list", 1241 u.bi_inum, k.k->p.snapshot); 1242 1243 ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, k.k->p, 1); 1244 if (ret) 1245 goto err; 1246 } else { 1247 ret = bch2_inode_or_descendents_is_open(trans, k.k->p); 1248 if (ret < 0) 1249 goto err; 1250 1251 if (fsck_err_on(!ret, 1252 trans, inode_unlinked_and_not_open, 1253 "inode %llu:%u unlinked and not open", 1254 u.bi_inum, u.bi_snapshot)) { 1255 ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); 1256 bch_err_msg(c, ret, "in fsck deleting inode"); 1257 goto err_noprint; 1258 } 1259 ret = 0; 1260 } 1261 } 1262 1263 if (fsck_err_on(u.bi_parent_subvol && 1264 (u.bi_subvol == 0 || 1265 u.bi_subvol == BCACHEFS_ROOT_SUBVOL), 1266 trans, inode_bi_parent_nonzero, 1267 "inode %llu:%u has subvol %u but nonzero parent subvol %u", 1268 u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) { 1269 u.bi_parent_subvol = 0; 1270 do_update = true; 1271 } 1272 1273 if (u.bi_subvol) { 1274 struct bch_subvolume s; 1275 1276 ret = bch2_subvolume_get(trans, u.bi_subvol, false, &s); 1277 if (ret && !bch2_err_matches(ret, ENOENT)) 1278 goto err; 1279 1280 if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) { 1281 ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum); 1282 goto do_update; 1283 } 1284 1285 if (fsck_err_on(ret, 1286 trans, inode_bi_subvol_missing, 1287 "inode %llu:%u bi_subvol points to missing subvolume %u", 1288 u.bi_inum, k.k->p.snapshot, u.bi_subvol) || 1289 fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum || 1290 !bch2_snapshot_is_ancestor(c, 
le32_to_cpu(s.snapshot), 1291 k.k->p.snapshot), 1292 trans, inode_bi_subvol_wrong, 1293 "inode %llu:%u points to subvol %u, but subvol points to %llu:%u", 1294 u.bi_inum, k.k->p.snapshot, u.bi_subvol, 1295 le64_to_cpu(s.inode), 1296 le32_to_cpu(s.snapshot))) { 1297 u.bi_subvol = 0; 1298 u.bi_parent_subvol = 0; 1299 do_update = true; 1300 } 1301 } 1302 1303 if (fsck_err_on(u.bi_journal_seq > journal_cur_seq(&c->journal), 1304 trans, inode_journal_seq_in_future, 1305 "inode journal seq in future (currently at %llu)\n%s", 1306 journal_cur_seq(&c->journal), 1307 (printbuf_reset(&buf), 1308 bch2_inode_unpacked_to_text(&buf, &u), 1309 buf.buf))) { 1310 u.bi_journal_seq = journal_cur_seq(&c->journal); 1311 do_update = true; 1312 } 1313 do_update: 1314 if (do_update) { 1315 ret = __bch2_fsck_write_inode(trans, &u); 1316 bch_err_msg(c, ret, "in fsck updating inode"); 1317 if (ret) 1318 goto err_noprint; 1319 } 1320 err: 1321 fsck_err: 1322 bch_err_fn(c, ret); 1323 err_noprint: 1324 printbuf_exit(&buf); 1325 return ret; 1326 } 1327 1328 int bch2_check_inodes(struct bch_fs *c) 1329 { 1330 struct bch_inode_unpacked snapshot_root = {}; 1331 struct snapshots_seen s; 1332 1333 snapshots_seen_init(&s); 1334 1335 int ret = bch2_trans_run(c, 1336 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, 1337 POS_MIN, 1338 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, 1339 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 1340 check_inode(trans, &iter, k, &snapshot_root, &s))); 1341 1342 snapshots_seen_exit(&s); 1343 bch_err_fn(c, ret); 1344 return ret; 1345 } 1346 1347 static int find_oldest_inode_needs_reattach(struct btree_trans *trans, 1348 struct bch_inode_unpacked *inode) 1349 { 1350 struct bch_fs *c = trans->c; 1351 struct btree_iter iter; 1352 struct bkey_s_c k; 1353 int ret = 0; 1354 1355 /* 1356 * We look for inodes to reattach in natural key order, leaves first, 1357 * but we should do the reattach at the oldest version that needs to be 1358 * reattached: 1359 */ 1360 
for_each_btree_key_norestart(trans, iter, 1361 BTREE_ID_inodes, 1362 SPOS(0, inode->bi_inum, inode->bi_snapshot + 1), 1363 BTREE_ITER_all_snapshots, k, ret) { 1364 if (k.k->p.offset != inode->bi_inum) 1365 break; 1366 1367 if (!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, k.k->p.snapshot)) 1368 continue; 1369 1370 if (!bkey_is_inode(k.k)) 1371 break; 1372 1373 struct bch_inode_unpacked parent_inode; 1374 ret = bch2_inode_unpack(k, &parent_inode); 1375 if (ret) 1376 break; 1377 1378 if (!inode_should_reattach(&parent_inode)) 1379 break; 1380 1381 *inode = parent_inode; 1382 } 1383 bch2_trans_iter_exit(trans, &iter); 1384 1385 return ret; 1386 } 1387 1388 static int check_unreachable_inode(struct btree_trans *trans, 1389 struct btree_iter *iter, 1390 struct bkey_s_c k) 1391 { 1392 struct printbuf buf = PRINTBUF; 1393 int ret = 0; 1394 1395 if (!bkey_is_inode(k.k)) 1396 return 0; 1397 1398 struct bch_inode_unpacked inode; 1399 ret = bch2_inode_unpack(k, &inode); 1400 if (ret) 1401 return ret; 1402 1403 if (!inode_should_reattach(&inode)) 1404 return 0; 1405 1406 ret = find_oldest_inode_needs_reattach(trans, &inode); 1407 if (ret) 1408 return ret; 1409 1410 if (fsck_err(trans, inode_unreachable, 1411 "unreachable inode:\n%s", 1412 (bch2_inode_unpacked_to_text(&buf, &inode), 1413 buf.buf))) 1414 ret = reattach_inode(trans, &inode); 1415 fsck_err: 1416 printbuf_exit(&buf); 1417 return ret; 1418 } 1419 1420 /* 1421 * Reattach unreachable (but not unlinked) inodes 1422 * 1423 * Run after check_inodes() and check_dirents(), so we node that inode 1424 * backpointer fields point to valid dirents, and every inode that has a dirent 1425 * that points to it has its backpointer field set - so we're just looking for 1426 * non-unlinked inodes without backpointers: 1427 * 1428 * XXX: this is racy w.r.t. 
hardlink removal in online fsck 1429 */ 1430 int bch2_check_unreachable_inodes(struct bch_fs *c) 1431 { 1432 int ret = bch2_trans_run(c, 1433 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, 1434 POS_MIN, 1435 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, 1436 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 1437 check_unreachable_inode(trans, &iter, k))); 1438 bch_err_fn(c, ret); 1439 return ret; 1440 } 1441 1442 static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode) 1443 { 1444 switch (btree) { 1445 case BTREE_ID_extents: 1446 return S_ISREG(mode) || S_ISLNK(mode); 1447 case BTREE_ID_dirents: 1448 return S_ISDIR(mode); 1449 case BTREE_ID_xattrs: 1450 return true; 1451 default: 1452 BUG(); 1453 } 1454 } 1455 1456 static int check_key_has_inode(struct btree_trans *trans, 1457 struct btree_iter *iter, 1458 struct inode_walker *inode, 1459 struct inode_walker_entry *i, 1460 struct bkey_s_c k) 1461 { 1462 struct bch_fs *c = trans->c; 1463 struct printbuf buf = PRINTBUF; 1464 struct btree_iter iter2 = {}; 1465 int ret = PTR_ERR_OR_ZERO(i); 1466 if (ret) 1467 return ret; 1468 1469 if (k.k->type == KEY_TYPE_whiteout) 1470 goto out; 1471 1472 bool have_inode = i && !i->whiteout; 1473 1474 if (!have_inode && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) 1475 goto reconstruct; 1476 1477 if (have_inode && btree_matches_i_mode(iter->btree_id, i->inode.bi_mode)) 1478 goto out; 1479 1480 prt_printf(&buf, ", "); 1481 1482 bool have_old_inode = false; 1483 darray_for_each(inode->inodes, i2) 1484 if (!i2->whiteout && 1485 bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i2->inode.bi_snapshot) && 1486 btree_matches_i_mode(iter->btree_id, i2->inode.bi_mode)) { 1487 prt_printf(&buf, "but found good inode in older snapshot\n"); 1488 bch2_inode_unpacked_to_text(&buf, &i2->inode); 1489 prt_newline(&buf); 1490 have_old_inode = true; 1491 break; 1492 } 1493 1494 struct bkey_s_c k2; 1495 unsigned nr_keys = 0; 1496 1497 prt_printf(&buf, "found keys:\n"); 1498 1499 
for_each_btree_key_max_norestart(trans, iter2, iter->btree_id, 1500 SPOS(k.k->p.inode, 0, k.k->p.snapshot), 1501 POS(k.k->p.inode, U64_MAX), 1502 0, k2, ret) { 1503 nr_keys++; 1504 if (nr_keys <= 10) { 1505 bch2_bkey_val_to_text(&buf, c, k2); 1506 prt_newline(&buf); 1507 } 1508 if (nr_keys >= 100) 1509 break; 1510 } 1511 1512 if (ret) 1513 goto err; 1514 1515 if (nr_keys > 100) 1516 prt_printf(&buf, "found > %u keys for this missing inode\n", nr_keys); 1517 else if (nr_keys > 10) 1518 prt_printf(&buf, "found %u keys for this missing inode\n", nr_keys); 1519 1520 if (!have_inode) { 1521 if (fsck_err_on(!have_inode, 1522 trans, key_in_missing_inode, 1523 "key in missing inode%s", buf.buf)) { 1524 /* 1525 * Maybe a deletion that raced with data move, or something 1526 * weird like that? But if we know the inode was deleted, or 1527 * it's just a few keys, we can safely delete them. 1528 * 1529 * If it's many keys, we should probably recreate the inode 1530 */ 1531 if (have_old_inode || nr_keys <= 2) 1532 goto delete; 1533 else 1534 goto reconstruct; 1535 } 1536 } else { 1537 /* 1538 * not autofix, this one would be a giant wtf - bit error in the 1539 * inode corrupting i_mode? 
1540 * 1541 * may want to try repairing inode instead of deleting 1542 */ 1543 if (fsck_err_on(!btree_matches_i_mode(iter->btree_id, i->inode.bi_mode), 1544 trans, key_in_wrong_inode_type, 1545 "key for wrong inode mode %o%s", 1546 i->inode.bi_mode, buf.buf)) 1547 goto delete; 1548 } 1549 out: 1550 err: 1551 fsck_err: 1552 bch2_trans_iter_exit(trans, &iter2); 1553 printbuf_exit(&buf); 1554 bch_err_fn(c, ret); 1555 return ret; 1556 delete: 1557 /* 1558 * XXX: print out more info 1559 * count up extents for this inode, check if we have different inode in 1560 * an older snapshot version, perhaps decide if we want to reconstitute 1561 */ 1562 ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node); 1563 goto out; 1564 reconstruct: 1565 ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?: 1566 bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); 1567 if (ret) 1568 goto err; 1569 1570 inode->last_pos.inode--; 1571 ret = bch_err_throw(c, transaction_restart_nested); 1572 goto out; 1573 } 1574 1575 static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w) 1576 { 1577 struct bch_fs *c = trans->c; 1578 int ret = 0; 1579 s64 count2; 1580 1581 darray_for_each(w->inodes, i) { 1582 if (i->inode.bi_sectors == i->count) 1583 continue; 1584 1585 count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->inode.bi_snapshot); 1586 1587 if (w->recalculate_sums) 1588 i->count = count2; 1589 1590 if (i->count != count2) { 1591 bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", 1592 w->last_pos.inode, i->inode.bi_snapshot, i->count, count2); 1593 i->count = count2; 1594 } 1595 1596 if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty), 1597 trans, inode_i_sectors_wrong, 1598 "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", 1599 w->last_pos.inode, i->inode.bi_snapshot, 1600 i->inode.bi_sectors, i->count)) { 1601 
i->inode.bi_sectors = i->count; 1602 ret = bch2_fsck_write_inode(trans, &i->inode); 1603 if (ret) 1604 break; 1605 } 1606 } 1607 fsck_err: 1608 bch_err_fn(c, ret); 1609 return ret; 1610 } 1611 1612 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) 1613 { 1614 u32 restart_count = trans->restart_count; 1615 return check_i_sectors_notnested(trans, w) ?: 1616 trans_was_restarted(trans, restart_count); 1617 } 1618 1619 struct extent_end { 1620 u32 snapshot; 1621 u64 offset; 1622 struct snapshots_seen seen; 1623 }; 1624 1625 struct extent_ends { 1626 struct bpos last_pos; 1627 DARRAY(struct extent_end) e; 1628 }; 1629 1630 static void extent_ends_reset(struct extent_ends *extent_ends) 1631 { 1632 darray_for_each(extent_ends->e, i) 1633 snapshots_seen_exit(&i->seen); 1634 extent_ends->e.nr = 0; 1635 } 1636 1637 static void extent_ends_exit(struct extent_ends *extent_ends) 1638 { 1639 extent_ends_reset(extent_ends); 1640 darray_exit(&extent_ends->e); 1641 } 1642 1643 static void extent_ends_init(struct extent_ends *extent_ends) 1644 { 1645 memset(extent_ends, 0, sizeof(*extent_ends)); 1646 } 1647 1648 static int extent_ends_at(struct bch_fs *c, 1649 struct extent_ends *extent_ends, 1650 struct snapshots_seen *seen, 1651 struct bkey_s_c k) 1652 { 1653 struct extent_end *i, n = (struct extent_end) { 1654 .offset = k.k->p.offset, 1655 .snapshot = k.k->p.snapshot, 1656 .seen = *seen, 1657 }; 1658 1659 n.seen.ids.data = kmemdup(seen->ids.data, 1660 sizeof(seen->ids.data[0]) * seen->ids.size, 1661 GFP_KERNEL); 1662 if (!n.seen.ids.data) 1663 return bch_err_throw(c, ENOMEM_fsck_extent_ends_at); 1664 1665 __darray_for_each(extent_ends->e, i) { 1666 if (i->snapshot == k.k->p.snapshot) { 1667 snapshots_seen_exit(&i->seen); 1668 *i = n; 1669 return 0; 1670 } 1671 1672 if (i->snapshot >= k.k->p.snapshot) 1673 break; 1674 } 1675 1676 return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n); 1677 } 1678 1679 static int 
overlapping_extents_found(struct btree_trans *trans, 1680 enum btree_id btree, 1681 struct bpos pos1, struct snapshots_seen *pos1_seen, 1682 struct bkey pos2, 1683 bool *fixed, 1684 struct extent_end *extent_end) 1685 { 1686 struct bch_fs *c = trans->c; 1687 struct printbuf buf = PRINTBUF; 1688 struct btree_iter iter1, iter2 = {}; 1689 struct bkey_s_c k1, k2; 1690 int ret; 1691 1692 BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2))); 1693 1694 bch2_trans_iter_init(trans, &iter1, btree, pos1, 1695 BTREE_ITER_all_snapshots| 1696 BTREE_ITER_not_extents); 1697 k1 = bch2_btree_iter_peek_max(trans, &iter1, POS(pos1.inode, U64_MAX)); 1698 ret = bkey_err(k1); 1699 if (ret) 1700 goto err; 1701 1702 prt_newline(&buf); 1703 bch2_bkey_val_to_text(&buf, c, k1); 1704 1705 if (!bpos_eq(pos1, k1.k->p)) { 1706 prt_str(&buf, "\nwanted\n "); 1707 bch2_bpos_to_text(&buf, pos1); 1708 prt_str(&buf, "\n"); 1709 bch2_bkey_to_text(&buf, &pos2); 1710 1711 bch_err(c, "%s: error finding first overlapping extent when repairing, got%s", 1712 __func__, buf.buf); 1713 ret = bch_err_throw(c, internal_fsck_err); 1714 goto err; 1715 } 1716 1717 bch2_trans_copy_iter(trans, &iter2, &iter1); 1718 1719 while (1) { 1720 bch2_btree_iter_advance(trans, &iter2); 1721 1722 k2 = bch2_btree_iter_peek_max(trans, &iter2, POS(pos1.inode, U64_MAX)); 1723 ret = bkey_err(k2); 1724 if (ret) 1725 goto err; 1726 1727 if (bpos_ge(k2.k->p, pos2.p)) 1728 break; 1729 } 1730 1731 prt_newline(&buf); 1732 bch2_bkey_val_to_text(&buf, c, k2); 1733 1734 if (bpos_gt(k2.k->p, pos2.p) || 1735 pos2.size != k2.k->size) { 1736 bch_err(c, "%s: error finding seconding overlapping extent when repairing%s", 1737 __func__, buf.buf); 1738 ret = bch_err_throw(c, internal_fsck_err); 1739 goto err; 1740 } 1741 1742 prt_printf(&buf, "\noverwriting %s extent", 1743 pos1.snapshot >= pos2.p.snapshot ? 
"first" : "second"); 1744 1745 if (fsck_err(trans, extent_overlapping, 1746 "overlapping extents%s", buf.buf)) { 1747 struct btree_iter *old_iter = &iter1; 1748 struct disk_reservation res = { 0 }; 1749 1750 if (pos1.snapshot < pos2.p.snapshot) { 1751 old_iter = &iter2; 1752 swap(k1, k2); 1753 } 1754 1755 trans->extra_disk_res += bch2_bkey_sectors_compressed(k2); 1756 1757 ret = bch2_trans_update_extent_overwrite(trans, old_iter, 1758 BTREE_UPDATE_internal_snapshot_node, 1759 k1, k2) ?: 1760 bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc); 1761 bch2_disk_reservation_put(c, &res); 1762 1763 bch_info(c, "repair ret %s", bch2_err_str(ret)); 1764 1765 if (ret) 1766 goto err; 1767 1768 *fixed = true; 1769 1770 if (pos1.snapshot == pos2.p.snapshot) { 1771 /* 1772 * We overwrote the first extent, and did the overwrite 1773 * in the same snapshot: 1774 */ 1775 extent_end->offset = bkey_start_offset(&pos2); 1776 } else if (pos1.snapshot > pos2.p.snapshot) { 1777 /* 1778 * We overwrote the first extent in pos2's snapshot: 1779 */ 1780 ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot); 1781 } else { 1782 /* 1783 * We overwrote the second extent - restart 1784 * check_extent() from the top: 1785 */ 1786 ret = bch_err_throw(c, transaction_restart_nested); 1787 } 1788 } 1789 fsck_err: 1790 err: 1791 bch2_trans_iter_exit(trans, &iter2); 1792 bch2_trans_iter_exit(trans, &iter1); 1793 printbuf_exit(&buf); 1794 return ret; 1795 } 1796 1797 static int check_overlapping_extents(struct btree_trans *trans, 1798 struct snapshots_seen *seen, 1799 struct extent_ends *extent_ends, 1800 struct bkey_s_c k, 1801 struct btree_iter *iter, 1802 bool *fixed) 1803 { 1804 struct bch_fs *c = trans->c; 1805 int ret = 0; 1806 1807 /* transaction restart, running again */ 1808 if (bpos_eq(extent_ends->last_pos, k.k->p)) 1809 return 0; 1810 1811 if (extent_ends->last_pos.inode != k.k->p.inode) 1812 extent_ends_reset(extent_ends); 1813 1814 darray_for_each(extent_ends->e, i) 
{ 1815 if (i->offset <= bkey_start_offset(k.k)) 1816 continue; 1817 1818 if (!ref_visible2(c, 1819 k.k->p.snapshot, seen, 1820 i->snapshot, &i->seen)) 1821 continue; 1822 1823 ret = overlapping_extents_found(trans, iter->btree_id, 1824 SPOS(iter->pos.inode, 1825 i->offset, 1826 i->snapshot), 1827 &i->seen, 1828 *k.k, fixed, i); 1829 if (ret) 1830 goto err; 1831 } 1832 1833 extent_ends->last_pos = k.k->p; 1834 err: 1835 return ret; 1836 } 1837 1838 static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter, 1839 struct bkey_s_c k) 1840 { 1841 struct bch_fs *c = trans->c; 1842 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 1843 struct bch_extent_crc_unpacked crc; 1844 const union bch_extent_entry *i; 1845 unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9; 1846 1847 bkey_for_each_crc(k.k, ptrs, crc, i) 1848 if (crc_is_encoded(crc) && 1849 crc.uncompressed_size > encoded_extent_max_sectors) { 1850 struct printbuf buf = PRINTBUF; 1851 1852 bch2_bkey_val_to_text(&buf, c, k); 1853 bch_err(c, "overbig encoded extent, please report this:\n %s", buf.buf); 1854 printbuf_exit(&buf); 1855 } 1856 1857 return 0; 1858 } 1859 1860 static int check_extent(struct btree_trans *trans, struct btree_iter *iter, 1861 struct bkey_s_c k, 1862 struct inode_walker *inode, 1863 struct snapshots_seen *s, 1864 struct extent_ends *extent_ends, 1865 struct disk_reservation *res) 1866 { 1867 struct bch_fs *c = trans->c; 1868 struct printbuf buf = PRINTBUF; 1869 int ret = 0; 1870 1871 ret = bch2_check_key_has_snapshot(trans, iter, k); 1872 if (ret) { 1873 ret = ret < 0 ? 
ret : 0; 1874 goto out; 1875 } 1876 1877 if (inode->last_pos.inode != k.k->p.inode && inode->have_inodes) { 1878 ret = check_i_sectors(trans, inode); 1879 if (ret) 1880 goto err; 1881 } 1882 1883 ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); 1884 if (ret) 1885 goto err; 1886 1887 struct inode_walker_entry *extent_i = walk_inode(trans, inode, k); 1888 ret = PTR_ERR_OR_ZERO(extent_i); 1889 if (ret) 1890 goto err; 1891 1892 ret = check_key_has_inode(trans, iter, inode, extent_i, k); 1893 if (ret) 1894 goto err; 1895 1896 if (k.k->type != KEY_TYPE_whiteout) { 1897 ret = check_overlapping_extents(trans, s, extent_ends, k, iter, 1898 &inode->recalculate_sums); 1899 if (ret) 1900 goto err; 1901 1902 /* 1903 * Check inodes in reverse order, from oldest snapshots to 1904 * newest, starting from the inode that matches this extent's 1905 * snapshot. If we didn't have one, iterate over all inodes: 1906 */ 1907 for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); 1908 inode->inodes.data && i >= inode->inodes.data; 1909 --i) { 1910 if (i->inode.bi_snapshot > k.k->p.snapshot || 1911 !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot)) 1912 continue; 1913 1914 u64 last_block = round_up(i->inode.bi_size, block_bytes(c)) >> 9; 1915 1916 if (fsck_err_on(k.k->p.offset > last_block && 1917 !bkey_extent_is_reservation(k), 1918 trans, extent_past_end_of_inode, 1919 "extent type past end of inode %llu:%u, i_size %llu\n%s", 1920 i->inode.bi_inum, i->inode.bi_snapshot, i->inode.bi_size, 1921 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { 1922 struct bkey_i *whiteout = bch2_trans_kmalloc(trans, sizeof(*whiteout)); 1923 ret = PTR_ERR_OR_ZERO(whiteout); 1924 if (ret) 1925 goto err; 1926 1927 bkey_init(&whiteout->k); 1928 whiteout->k.p = SPOS(k.k->p.inode, 1929 last_block, 1930 i->inode.bi_snapshot); 1931 bch2_key_resize(&whiteout->k, 1932 min(KEY_SIZE_MAX & (~0 << c->block_bits), 1933 U64_MAX - whiteout->k.p.offset)); 1934 1935 1936 /* 
1937 * Need a normal (not BTREE_ITER_all_snapshots) 1938 * iterator, if we're deleting in a different 1939 * snapshot and need to emit a whiteout 1940 */ 1941 struct btree_iter iter2; 1942 bch2_trans_iter_init(trans, &iter2, BTREE_ID_extents, 1943 bkey_start_pos(&whiteout->k), 1944 BTREE_ITER_intent); 1945 ret = bch2_btree_iter_traverse(trans, &iter2) ?: 1946 bch2_trans_update(trans, &iter2, whiteout, 1947 BTREE_UPDATE_internal_snapshot_node); 1948 bch2_trans_iter_exit(trans, &iter2); 1949 if (ret) 1950 goto err; 1951 1952 iter->k.type = KEY_TYPE_whiteout; 1953 break; 1954 } 1955 } 1956 } 1957 1958 ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc); 1959 if (ret) 1960 goto err; 1961 1962 if (bkey_extent_is_allocation(k.k)) { 1963 for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); 1964 inode->inodes.data && i >= inode->inodes.data; 1965 --i) { 1966 if (i->whiteout || 1967 i->inode.bi_snapshot > k.k->p.snapshot || 1968 !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot)) 1969 continue; 1970 1971 i->count += k.k->size; 1972 } 1973 } 1974 1975 if (k.k->type != KEY_TYPE_whiteout) { 1976 ret = extent_ends_at(c, extent_ends, s, k); 1977 if (ret) 1978 goto err; 1979 } 1980 out: 1981 err: 1982 fsck_err: 1983 printbuf_exit(&buf); 1984 bch_err_fn(c, ret); 1985 return ret; 1986 } 1987 1988 /* 1989 * Walk extents: verify that extents have a corresponding S_ISREG inode, and 1990 * that i_size an i_sectors are consistent 1991 */ 1992 int bch2_check_extents(struct bch_fs *c) 1993 { 1994 struct inode_walker w = inode_walker_init(); 1995 struct snapshots_seen s; 1996 struct extent_ends extent_ends; 1997 struct disk_reservation res = { 0 }; 1998 1999 snapshots_seen_init(&s); 2000 extent_ends_init(&extent_ends); 2001 2002 int ret = bch2_trans_run(c, 2003 for_each_btree_key(trans, iter, BTREE_ID_extents, 2004 POS(BCACHEFS_ROOT_INO, 0), 2005 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ 2006 
bch2_disk_reservation_put(c, &res); 2007 check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?: 2008 check_extent_overbig(trans, &iter, k); 2009 })) ?: 2010 check_i_sectors_notnested(trans, &w)); 2011 2012 bch2_disk_reservation_put(c, &res); 2013 extent_ends_exit(&extent_ends); 2014 inode_walker_exit(&w); 2015 snapshots_seen_exit(&s); 2016 2017 bch_err_fn(c, ret); 2018 return ret; 2019 } 2020 2021 int bch2_check_indirect_extents(struct bch_fs *c) 2022 { 2023 struct disk_reservation res = { 0 }; 2024 2025 int ret = bch2_trans_run(c, 2026 for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, 2027 POS_MIN, 2028 BTREE_ITER_prefetch, k, 2029 &res, NULL, 2030 BCH_TRANS_COMMIT_no_enospc, ({ 2031 bch2_disk_reservation_put(c, &res); 2032 check_extent_overbig(trans, &iter, k); 2033 }))); 2034 2035 bch2_disk_reservation_put(c, &res); 2036 bch_err_fn(c, ret); 2037 return ret; 2038 } 2039 2040 static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w) 2041 { 2042 struct bch_fs *c = trans->c; 2043 int ret = 0; 2044 s64 count2; 2045 2046 darray_for_each(w->inodes, i) { 2047 if (i->inode.bi_nlink == i->count) 2048 continue; 2049 2050 count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->inode.bi_snapshot); 2051 if (count2 < 0) 2052 return count2; 2053 2054 if (i->count != count2) { 2055 bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu", 2056 w->last_pos.inode, i->inode.bi_snapshot, i->count, count2); 2057 i->count = count2; 2058 if (i->inode.bi_nlink == i->count) 2059 continue; 2060 } 2061 2062 if (i->inode.bi_nlink != i->count) { 2063 CLASS(printbuf, buf)(); 2064 2065 lockrestart_do(trans, 2066 bch2_inum_snapshot_to_path(trans, w->last_pos.inode, 2067 i->inode.bi_snapshot, NULL, &buf)); 2068 2069 if (fsck_err_on(i->inode.bi_nlink != i->count, 2070 trans, inode_dir_wrong_nlink, 2071 "directory with wrong i_nlink: got %u, should be %llu\n%s", 2072 i->inode.bi_nlink, i->count, buf.buf)) { 
2073 i->inode.bi_nlink = i->count; 2074 ret = bch2_fsck_write_inode(trans, &i->inode); 2075 if (ret) 2076 break; 2077 } 2078 } 2079 } 2080 fsck_err: 2081 bch_err_fn(c, ret); 2082 return ret; 2083 } 2084 2085 static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_walker *w) 2086 { 2087 u32 restart_count = trans->restart_count; 2088 return check_subdir_count_notnested(trans, w) ?: 2089 trans_was_restarted(trans, restart_count); 2090 } 2091 2092 /* find a subvolume that's a descendent of @snapshot: */ 2093 static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid) 2094 { 2095 struct btree_iter iter; 2096 struct bkey_s_c k; 2097 int ret; 2098 2099 for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) { 2100 if (k.k->type != KEY_TYPE_subvolume) 2101 continue; 2102 2103 struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); 2104 if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) { 2105 bch2_trans_iter_exit(trans, &iter); 2106 *subvolid = k.k->p.offset; 2107 goto found; 2108 } 2109 } 2110 if (!ret) 2111 ret = -ENOENT; 2112 found: 2113 bch2_trans_iter_exit(trans, &iter); 2114 return ret; 2115 } 2116 2117 noinline_for_stack 2118 static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter, 2119 struct bkey_s_c_dirent d) 2120 { 2121 struct bch_fs *c = trans->c; 2122 struct btree_iter subvol_iter = {}; 2123 struct bch_inode_unpacked subvol_root; 2124 u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol); 2125 u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); 2126 u32 parent_snapshot; 2127 u32 new_parent_subvol = 0; 2128 u64 parent_inum; 2129 struct printbuf buf = PRINTBUF; 2130 int ret = 0; 2131 2132 ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum); 2133 if (ret && !bch2_err_matches(ret, ENOENT)) 2134 return ret; 2135 2136 if (ret || 2137 (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) { 
2138 int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol); 2139 if (ret2 && !bch2_err_matches(ret, ENOENT)) 2140 return ret2; 2141 } 2142 2143 if (ret && 2144 !new_parent_subvol && 2145 (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) { 2146 /* 2147 * Couldn't find a subvol for dirent's snapshot - but we lost 2148 * subvols, so we need to reconstruct: 2149 */ 2150 ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0); 2151 if (ret) 2152 return ret; 2153 2154 parent_snapshot = d.k->p.snapshot; 2155 } 2156 2157 if (fsck_err_on(ret, 2158 trans, dirent_to_missing_parent_subvol, 2159 "dirent parent_subvol points to missing subvolume\n%s", 2160 (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) || 2161 fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot), 2162 trans, dirent_not_visible_in_parent_subvol, 2163 "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s", 2164 parent_snapshot, 2165 (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { 2166 if (!new_parent_subvol) { 2167 bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot); 2168 return bch_err_throw(c, fsck_repair_unimplemented); 2169 } 2170 2171 struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent); 2172 ret = PTR_ERR_OR_ZERO(new_dirent); 2173 if (ret) 2174 goto err; 2175 2176 new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol); 2177 } 2178 2179 struct bkey_s_c_subvolume s = 2180 bch2_bkey_get_iter_typed(trans, &subvol_iter, 2181 BTREE_ID_subvolumes, POS(0, target_subvol), 2182 0, subvolume); 2183 ret = bkey_err(s.s_c); 2184 if (ret && !bch2_err_matches(ret, ENOENT)) 2185 goto err; 2186 2187 if (ret) { 2188 if (fsck_err(trans, dirent_to_missing_subvol, 2189 "dirent points to missing subvolume\n%s", 2190 (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) 2191 return bch2_fsck_remove_dirent(trans, d.k->p); 2192 ret = 0; 2193 goto out; 2194 } 2195 2196 if 
(le32_to_cpu(s.v->fs_path_parent) != parent_subvol) { 2197 printbuf_reset(&buf); 2198 2199 prt_printf(&buf, "subvol with wrong fs_path_parent, should be be %u\n", 2200 parent_subvol); 2201 2202 ret = bch2_inum_to_path(trans, (subvol_inum) { s.k->p.offset, 2203 le64_to_cpu(s.v->inode) }, &buf); 2204 if (ret) 2205 goto err; 2206 prt_newline(&buf); 2207 bch2_bkey_val_to_text(&buf, c, s.s_c); 2208 2209 if (fsck_err(trans, subvol_fs_path_parent_wrong, "%s", buf.buf)) { 2210 struct bkey_i_subvolume *n = 2211 bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume); 2212 ret = PTR_ERR_OR_ZERO(n); 2213 if (ret) 2214 goto err; 2215 2216 n->v.fs_path_parent = cpu_to_le32(parent_subvol); 2217 } 2218 } 2219 2220 u64 target_inum = le64_to_cpu(s.v->inode); 2221 u32 target_snapshot = le32_to_cpu(s.v->snapshot); 2222 2223 ret = bch2_inode_find_by_inum_snapshot(trans, target_inum, target_snapshot, 2224 &subvol_root, 0); 2225 if (ret && !bch2_err_matches(ret, ENOENT)) 2226 goto err; 2227 2228 if (ret) { 2229 bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum); 2230 ret = bch_err_throw(c, fsck_repair_unimplemented); 2231 goto err; 2232 } 2233 2234 if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol, 2235 trans, inode_bi_parent_wrong, 2236 "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u", 2237 target_inum, 2238 subvol_root.bi_parent_subvol, parent_subvol)) { 2239 subvol_root.bi_parent_subvol = parent_subvol; 2240 subvol_root.bi_snapshot = le32_to_cpu(s.v->snapshot); 2241 ret = __bch2_fsck_write_inode(trans, &subvol_root); 2242 if (ret) 2243 goto err; 2244 } 2245 2246 ret = bch2_check_dirent_target(trans, iter, d, &subvol_root, true); 2247 if (ret) 2248 goto err; 2249 out: 2250 err: 2251 fsck_err: 2252 bch2_trans_iter_exit(trans, &subvol_iter); 2253 printbuf_exit(&buf); 2254 return ret; 2255 } 2256 2257 static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, 2258 struct bkey_s_c k, 
			struct bch_hash_info *hash_info,
			struct inode_walker *dir,
			struct inode_walker *target,
			struct snapshots_seen *s,
			bool *need_second_pass)
{
	struct bch_fs *c = trans->c;
	struct inode_walker_entry *i;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret) {
		/* positive return means key was deleted/skipped, not an error */
		ret = ret < 0 ? ret : 0;
		goto out;
	}

	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
	if (ret)
		goto err;

	if (k.k->type == KEY_TYPE_whiteout)
		goto out;

	/* Crossed into a new directory: flush subdir count for the previous one */
	if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) {
		ret = check_subdir_dirents_count(trans, dir);
		if (ret)
			goto err;
	}

	i = walk_inode(trans, dir, k);
	ret = PTR_ERR_OR_ZERO(i);
	if (ret < 0)
		goto err;

	ret = check_key_has_inode(trans, iter, dir, i, k);
	if (ret)
		goto err;

	if (!i || i->whiteout)
		goto out;

	/* Hash info is per-directory; recompute on first key of each inode */
	if (dir->first_this_inode)
		*hash_info = bch2_hash_info_init(c, &i->inode);
	dir->first_this_inode = false;

#ifdef CONFIG_UNICODE
	hash_info->cf_encoding = bch2_inode_casefold(c, &i->inode) ? c->cf_encoding : NULL;
#endif

	ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info,
				      iter, k, need_second_pass);
	if (ret < 0)
		goto err;
	if (ret) {
		/* dirent has been deleted */
		ret = 0;
		goto out;
	}

	if (k.k->type != KEY_TYPE_dirent)
		goto out;

	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);

	/* check casefold */
	if (fsck_err_on(d.v->d_casefold != !!hash_info->cf_encoding,
			trans, dirent_casefold_mismatch,
			"dirent casefold does not match dir casefold\n%s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k),
			 buf.buf))) {
		/*
		 * Rebuild the dirent with the directory's casefold setting:
		 * delete the old key and insert a freshly-hashed replacement.
		 */
		subvol_inum dir_inum = { .subvol = d.v->d_type == DT_SUBVOL
			? le32_to_cpu(d.v->d_parent_subvol)
			: 0,
		};
		u64 target = d.v->d_type == DT_SUBVOL
			? le32_to_cpu(d.v->d_child_subvol)
			: le64_to_cpu(d.v->d_inum);
		struct qstr name = bch2_dirent_get_name(d);

		struct bkey_i_dirent *new_d =
			bch2_dirent_create_key(trans, hash_info, dir_inum,
					       d.v->d_type, &name, NULL, target);
		ret = PTR_ERR_OR_ZERO(new_d);
		if (ret)
			goto out;

		/* keep the original position's inode and snapshot */
		new_d->k.p.inode	= d.k->p.inode;
		new_d->k.p.snapshot	= d.k->p.snapshot;

		struct btree_iter dup_iter = {};
		ret = bch2_hash_delete_at(trans,
				bch2_dirent_hash_desc, hash_info, iter,
				BTREE_UPDATE_internal_snapshot_node) ?:
		      bch2_str_hash_repair_key(trans, s,
				&bch2_dirent_hash_desc, hash_info,
				iter, bkey_i_to_s_c(&new_d->k_i),
				&dup_iter, bkey_s_c_null,
				need_second_pass);
		goto out;
	}

	if (d.v->d_type == DT_SUBVOL) {
		ret = check_dirent_to_subvol(trans, iter, d);
		if (ret)
			goto err;
	} else {
		ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
		if (ret)
			goto err;

		if (fsck_err_on(!target->inodes.nr,
				trans, dirent_to_missing_inode,
				"dirent points to missing inode:\n%s",
				(printbuf_reset(&buf),
				 bch2_bkey_val_to_text(&buf, c, k),
				 buf.buf))) {
			ret = bch2_fsck_remove_dirent(trans, d.k->p);
			if (ret)
				goto err;
		}

		/* verify backpointers on every visible target inode */
		darray_for_each(target->inodes, i) {
			ret = bch2_check_dirent_target(trans, iter, d, &i->inode, true);
			if (ret)
				goto err;
		}

		/*
		 * If the target inode was overwritten in a snapshot we haven't
		 * seen a whiteout for, the dirent is stale in that snapshot:
		 * delete it there.
		 */
		darray_for_each(target->deletes, i)
			if (fsck_err_on(!snapshot_list_has_id(&s->ids, *i),
					trans, dirent_to_overwritten_inode,
					"dirent points to inode overwritten in snapshot %u:\n%s",
					*i,
					(printbuf_reset(&buf),
					 bch2_bkey_val_to_text(&buf, c, k),
					 buf.buf))) {
				struct btree_iter delete_iter;
				bch2_trans_iter_init(trans, &delete_iter,
						     BTREE_ID_dirents,
						     SPOS(k.k->p.inode,
k.k->p.offset, *i), 2400 BTREE_ITER_intent); 2401 ret = bch2_btree_iter_traverse(trans, &delete_iter) ?: 2402 bch2_hash_delete_at(trans, bch2_dirent_hash_desc, 2403 hash_info, 2404 &delete_iter, 2405 BTREE_UPDATE_internal_snapshot_node); 2406 bch2_trans_iter_exit(trans, &delete_iter); 2407 if (ret) 2408 goto err; 2409 2410 } 2411 } 2412 2413 ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); 2414 if (ret) 2415 goto err; 2416 2417 for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) { 2418 if (d.v->d_type == DT_DIR) 2419 i->count++; 2420 i->i_size += bkey_bytes(d.k); 2421 } 2422 out: 2423 err: 2424 fsck_err: 2425 printbuf_exit(&buf); 2426 return ret; 2427 } 2428 2429 /* 2430 * Walk dirents: verify that they all have a corresponding S_ISDIR inode, 2431 * validate d_type 2432 */ 2433 int bch2_check_dirents(struct bch_fs *c) 2434 { 2435 struct inode_walker dir = inode_walker_init(); 2436 struct inode_walker target = inode_walker_init(); 2437 struct snapshots_seen s; 2438 struct bch_hash_info hash_info; 2439 bool need_second_pass = false, did_second_pass = false; 2440 int ret; 2441 2442 snapshots_seen_init(&s); 2443 again: 2444 ret = bch2_trans_run(c, 2445 for_each_btree_key_commit(trans, iter, BTREE_ID_dirents, 2446 POS(BCACHEFS_ROOT_INO, 0), 2447 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, 2448 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 2449 check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s, 2450 &need_second_pass)) ?: 2451 check_subdir_count_notnested(trans, &dir)); 2452 2453 if (!ret && need_second_pass && !did_second_pass) { 2454 bch_info(c, "check_dirents requires second pass"); 2455 swap(did_second_pass, need_second_pass); 2456 goto again; 2457 } 2458 2459 if (!ret && need_second_pass) { 2460 bch_err(c, "dirents not repairing"); 2461 ret = -EINVAL; 2462 } 2463 2464 snapshots_seen_exit(&s); 2465 inode_walker_exit(&dir); 2466 inode_walker_exit(&target); 2467 bch_err_fn(c, ret); 2468 return ret; 2469 } 2470 2471 static int 
check_xattr(struct btree_trans *trans, struct btree_iter *iter, 2472 struct bkey_s_c k, 2473 struct bch_hash_info *hash_info, 2474 struct inode_walker *inode) 2475 { 2476 struct bch_fs *c = trans->c; 2477 2478 int ret = bch2_check_key_has_snapshot(trans, iter, k); 2479 if (ret < 0) 2480 return ret; 2481 if (ret) 2482 return 0; 2483 2484 struct inode_walker_entry *i = walk_inode(trans, inode, k); 2485 ret = PTR_ERR_OR_ZERO(i); 2486 if (ret) 2487 return ret; 2488 2489 ret = check_key_has_inode(trans, iter, inode, i, k); 2490 if (ret) 2491 return ret; 2492 2493 if (!i || i->whiteout) 2494 return 0; 2495 2496 if (inode->first_this_inode) 2497 *hash_info = bch2_hash_info_init(c, &i->inode); 2498 inode->first_this_inode = false; 2499 2500 bool need_second_pass = false; 2501 return bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, 2502 iter, k, &need_second_pass); 2503 } 2504 2505 /* 2506 * Walk xattrs: verify that they all have a corresponding inode 2507 */ 2508 int bch2_check_xattrs(struct bch_fs *c) 2509 { 2510 struct inode_walker inode = inode_walker_init(); 2511 struct bch_hash_info hash_info; 2512 int ret = 0; 2513 2514 ret = bch2_trans_run(c, 2515 for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, 2516 POS(BCACHEFS_ROOT_INO, 0), 2517 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, 2518 k, 2519 NULL, NULL, 2520 BCH_TRANS_COMMIT_no_enospc, 2521 check_xattr(trans, &iter, k, &hash_info, &inode))); 2522 2523 inode_walker_exit(&inode); 2524 bch_err_fn(c, ret); 2525 return ret; 2526 } 2527 2528 static int check_root_trans(struct btree_trans *trans) 2529 { 2530 struct bch_fs *c = trans->c; 2531 struct bch_inode_unpacked root_inode; 2532 u32 snapshot; 2533 u64 inum; 2534 int ret; 2535 2536 ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum); 2537 if (ret && !bch2_err_matches(ret, ENOENT)) 2538 return ret; 2539 2540 if (mustfix_fsck_err_on(ret, trans, root_subvol_missing, 2541 "root subvol missing")) { 2542 struct bkey_i_subvolume 
*root_subvol = 2543 bch2_trans_kmalloc(trans, sizeof(*root_subvol)); 2544 ret = PTR_ERR_OR_ZERO(root_subvol); 2545 if (ret) 2546 goto err; 2547 2548 snapshot = U32_MAX; 2549 inum = BCACHEFS_ROOT_INO; 2550 2551 bkey_subvolume_init(&root_subvol->k_i); 2552 root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL; 2553 root_subvol->v.flags = 0; 2554 root_subvol->v.snapshot = cpu_to_le32(snapshot); 2555 root_subvol->v.inode = cpu_to_le64(inum); 2556 ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0); 2557 bch_err_msg(c, ret, "writing root subvol"); 2558 if (ret) 2559 goto err; 2560 } 2561 2562 ret = bch2_inode_find_by_inum_snapshot(trans, BCACHEFS_ROOT_INO, snapshot, 2563 &root_inode, 0); 2564 if (ret && !bch2_err_matches(ret, ENOENT)) 2565 return ret; 2566 2567 if (mustfix_fsck_err_on(ret, 2568 trans, root_dir_missing, 2569 "root directory missing") || 2570 mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), 2571 trans, root_inode_not_dir, 2572 "root inode not a directory")) { 2573 bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 2574 0, NULL); 2575 root_inode.bi_inum = inum; 2576 root_inode.bi_snapshot = snapshot; 2577 2578 ret = __bch2_fsck_write_inode(trans, &root_inode); 2579 bch_err_msg(c, ret, "writing root inode"); 2580 } 2581 err: 2582 fsck_err: 2583 return ret; 2584 } 2585 2586 /* Get root directory, create if it doesn't exist: */ 2587 int bch2_check_root(struct bch_fs *c) 2588 { 2589 int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 2590 check_root_trans(trans)); 2591 bch_err_fn(c, ret); 2592 return ret; 2593 } 2594 2595 static bool darray_u32_has(darray_u32 *d, u32 v) 2596 { 2597 darray_for_each(*d, i) 2598 if (*i == v) 2599 return true; 2600 return false; 2601 } 2602 2603 static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) 2604 { 2605 struct bch_fs *c = trans->c; 2606 struct btree_iter parent_iter = {}; 2607 darray_u32 subvol_path = {}; 2608 struct printbuf buf = 
PRINTBUF; 2609 int ret = 0; 2610 2611 if (k.k->type != KEY_TYPE_subvolume) 2612 return 0; 2613 2614 subvol_inum start = { 2615 .subvol = k.k->p.offset, 2616 .inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode), 2617 }; 2618 2619 while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) { 2620 ret = darray_push(&subvol_path, k.k->p.offset); 2621 if (ret) 2622 goto err; 2623 2624 struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); 2625 2626 struct bch_inode_unpacked subvol_root; 2627 ret = bch2_inode_find_by_inum_trans(trans, 2628 (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) }, 2629 &subvol_root); 2630 if (ret) 2631 break; 2632 2633 u32 parent = le32_to_cpu(s.v->fs_path_parent); 2634 2635 if (darray_u32_has(&subvol_path, parent)) { 2636 printbuf_reset(&buf); 2637 prt_printf(&buf, "subvolume loop: "); 2638 2639 ret = bch2_inum_to_path(trans, start, &buf); 2640 if (ret) 2641 goto err; 2642 2643 if (fsck_err(trans, subvol_loop, "%s", buf.buf)) 2644 ret = reattach_subvol(trans, s); 2645 break; 2646 } 2647 2648 bch2_trans_iter_exit(trans, &parent_iter); 2649 bch2_trans_iter_init(trans, &parent_iter, 2650 BTREE_ID_subvolumes, POS(0, parent), 0); 2651 k = bch2_btree_iter_peek_slot(trans, &parent_iter); 2652 ret = bkey_err(k); 2653 if (ret) 2654 goto err; 2655 2656 if (fsck_err_on(k.k->type != KEY_TYPE_subvolume, 2657 trans, subvol_unreachable, 2658 "unreachable subvolume %s", 2659 (printbuf_reset(&buf), 2660 bch2_bkey_val_to_text(&buf, c, s.s_c), 2661 buf.buf))) { 2662 ret = reattach_subvol(trans, s); 2663 break; 2664 } 2665 } 2666 fsck_err: 2667 err: 2668 printbuf_exit(&buf); 2669 darray_exit(&subvol_path); 2670 bch2_trans_iter_exit(trans, &parent_iter); 2671 return ret; 2672 } 2673 2674 int bch2_check_subvolume_structure(struct bch_fs *c) 2675 { 2676 int ret = bch2_trans_run(c, 2677 for_each_btree_key_commit(trans, iter, 2678 BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, 2679 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 2680 check_subvol_path(trans, &iter, k))); 
2681 bch_err_fn(c, ret); 2682 return ret; 2683 } 2684 2685 static int bch2_bi_depth_renumber_one(struct btree_trans *trans, 2686 u64 inum, u32 snapshot, 2687 u32 new_depth) 2688 { 2689 struct btree_iter iter; 2690 struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, 2691 SPOS(0, inum, snapshot), 0); 2692 2693 struct bch_inode_unpacked inode; 2694 int ret = bkey_err(k) ?: 2695 !bkey_is_inode(k.k) ? -BCH_ERR_ENOENT_inode 2696 : bch2_inode_unpack(k, &inode); 2697 if (ret) 2698 goto err; 2699 2700 if (inode.bi_depth != new_depth) { 2701 inode.bi_depth = new_depth; 2702 ret = __bch2_fsck_write_inode(trans, &inode) ?: 2703 bch2_trans_commit(trans, NULL, NULL, 0); 2704 } 2705 err: 2706 bch2_trans_iter_exit(trans, &iter); 2707 return ret; 2708 } 2709 2710 static int bch2_bi_depth_renumber(struct btree_trans *trans, darray_u64 *path, 2711 u32 snapshot, u32 new_bi_depth) 2712 { 2713 u32 restart_count = trans->restart_count; 2714 int ret = 0; 2715 2716 darray_for_each_reverse(*path, i) { 2717 ret = nested_lockrestart_do(trans, 2718 bch2_bi_depth_renumber_one(trans, *i, snapshot, new_bi_depth)); 2719 bch_err_fn(trans->c, ret); 2720 if (ret) 2721 break; 2722 2723 new_bi_depth++; 2724 } 2725 2726 return ret ?: trans_was_restarted(trans, restart_count); 2727 } 2728 2729 static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) 2730 { 2731 struct bch_fs *c = trans->c; 2732 struct btree_iter inode_iter = {}; 2733 darray_u64 path = {}; 2734 struct printbuf buf = PRINTBUF; 2735 u32 snapshot = inode_k.k->p.snapshot; 2736 bool redo_bi_depth = false; 2737 u32 min_bi_depth = U32_MAX; 2738 int ret = 0; 2739 2740 struct bpos start = inode_k.k->p; 2741 2742 struct bch_inode_unpacked inode; 2743 ret = bch2_inode_unpack(inode_k, &inode); 2744 if (ret) 2745 return ret; 2746 2747 /* 2748 * If we're running full fsck, check_dirents() will have already ran, 2749 * and we shouldn't see any missing backpointers here - otherwise that's 2750 * handled separately, 
by check_unreachable_inodes 2751 */ 2752 while (!inode.bi_subvol && 2753 bch2_inode_has_backpointer(&inode)) { 2754 struct btree_iter dirent_iter; 2755 struct bkey_s_c_dirent d; 2756 2757 d = dirent_get_by_pos(trans, &dirent_iter, 2758 SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot)); 2759 ret = bkey_err(d.s_c); 2760 if (ret && !bch2_err_matches(ret, ENOENT)) 2761 goto out; 2762 2763 if (!ret && (ret = dirent_points_to_inode(c, d, &inode))) 2764 bch2_trans_iter_exit(trans, &dirent_iter); 2765 2766 if (bch2_err_matches(ret, ENOENT)) { 2767 printbuf_reset(&buf); 2768 bch2_bkey_val_to_text(&buf, c, inode_k); 2769 bch_err(c, "unreachable inode in check_directory_structure: %s\n%s", 2770 bch2_err_str(ret), buf.buf); 2771 goto out; 2772 } 2773 2774 bch2_trans_iter_exit(trans, &dirent_iter); 2775 2776 ret = darray_push(&path, inode.bi_inum); 2777 if (ret) 2778 return ret; 2779 2780 bch2_trans_iter_exit(trans, &inode_iter); 2781 inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, 2782 SPOS(0, inode.bi_dir, snapshot), 0); 2783 2784 struct bch_inode_unpacked parent_inode; 2785 ret = bkey_err(inode_k) ?: 2786 !bkey_is_inode(inode_k.k) ? 
-BCH_ERR_ENOENT_inode 2787 : bch2_inode_unpack(inode_k, &parent_inode); 2788 if (ret) { 2789 /* Should have been caught in dirents pass */ 2790 bch_err_msg(c, ret, "error looking up parent directory"); 2791 goto out; 2792 } 2793 2794 min_bi_depth = parent_inode.bi_depth; 2795 2796 if (parent_inode.bi_depth < inode.bi_depth && 2797 min_bi_depth < U16_MAX) 2798 break; 2799 2800 inode = parent_inode; 2801 redo_bi_depth = true; 2802 2803 if (darray_find(path, inode.bi_inum)) { 2804 printbuf_reset(&buf); 2805 prt_printf(&buf, "directory structure loop in snapshot %u: ", 2806 snapshot); 2807 2808 ret = bch2_inum_snapshot_to_path(trans, start.offset, start.snapshot, NULL, &buf); 2809 if (ret) 2810 goto out; 2811 2812 if (c->opts.verbose) { 2813 prt_newline(&buf); 2814 darray_for_each(path, i) 2815 prt_printf(&buf, "%llu ", *i); 2816 } 2817 2818 if (fsck_err(trans, dir_loop, "%s", buf.buf)) { 2819 ret = remove_backpointer(trans, &inode); 2820 bch_err_msg(c, ret, "removing dirent"); 2821 if (ret) 2822 break; 2823 2824 ret = reattach_inode(trans, &inode); 2825 bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); 2826 } 2827 2828 goto out; 2829 } 2830 } 2831 2832 if (inode.bi_subvol) 2833 min_bi_depth = 0; 2834 2835 if (redo_bi_depth) 2836 ret = bch2_bi_depth_renumber(trans, &path, snapshot, min_bi_depth); 2837 out: 2838 fsck_err: 2839 bch2_trans_iter_exit(trans, &inode_iter); 2840 darray_exit(&path); 2841 printbuf_exit(&buf); 2842 bch_err_fn(c, ret); 2843 return ret; 2844 } 2845 2846 /* 2847 * Check for loops in the directory structure: all other connectivity issues 2848 * have been fixed by prior passes 2849 */ 2850 int bch2_check_directory_structure(struct bch_fs *c) 2851 { 2852 int ret = bch2_trans_run(c, 2853 for_each_btree_key_reverse_commit(trans, iter, BTREE_ID_inodes, POS_MIN, 2854 BTREE_ITER_intent| 2855 BTREE_ITER_prefetch| 2856 BTREE_ITER_all_snapshots, k, 2857 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ 2858 if (!S_ISDIR(bkey_inode_mode(k))) 2859 
continue; 2860 2861 if (bch2_inode_flags(k) & BCH_INODE_unlinked) 2862 continue; 2863 2864 check_path_loop(trans, k); 2865 }))); 2866 2867 bch_err_fn(c, ret); 2868 return ret; 2869 } 2870 2871 struct nlink_table { 2872 size_t nr; 2873 size_t size; 2874 2875 struct nlink { 2876 u64 inum; 2877 u32 snapshot; 2878 u32 count; 2879 } *d; 2880 }; 2881 2882 static int add_nlink(struct bch_fs *c, struct nlink_table *t, 2883 u64 inum, u32 snapshot) 2884 { 2885 if (t->nr == t->size) { 2886 size_t new_size = max_t(size_t, 128UL, t->size * 2); 2887 void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL); 2888 2889 if (!d) { 2890 bch_err(c, "fsck: error allocating memory for nlink_table, size %zu", 2891 new_size); 2892 return bch_err_throw(c, ENOMEM_fsck_add_nlink); 2893 } 2894 2895 if (t->d) 2896 memcpy(d, t->d, t->size * sizeof(t->d[0])); 2897 kvfree(t->d); 2898 2899 t->d = d; 2900 t->size = new_size; 2901 } 2902 2903 2904 t->d[t->nr++] = (struct nlink) { 2905 .inum = inum, 2906 .snapshot = snapshot, 2907 }; 2908 2909 return 0; 2910 } 2911 2912 static int nlink_cmp(const void *_l, const void *_r) 2913 { 2914 const struct nlink *l = _l; 2915 const struct nlink *r = _r; 2916 2917 return cmp_int(l->inum, r->inum); 2918 } 2919 2920 static void inc_link(struct bch_fs *c, struct snapshots_seen *s, 2921 struct nlink_table *links, 2922 u64 range_start, u64 range_end, u64 inum, u32 snapshot) 2923 { 2924 struct nlink *link, key = { 2925 .inum = inum, .snapshot = U32_MAX, 2926 }; 2927 2928 if (inum < range_start || inum >= range_end) 2929 return; 2930 2931 link = __inline_bsearch(&key, links->d, links->nr, 2932 sizeof(links->d[0]), nlink_cmp); 2933 if (!link) 2934 return; 2935 2936 while (link > links->d && link[0].inum == link[-1].inum) 2937 --link; 2938 2939 for (; link < links->d + links->nr && link->inum == inum; link++) 2940 if (ref_visible(c, s, snapshot, link->snapshot)) { 2941 link->count++; 2942 if (link->snapshot >= snapshot) 2943 break; 2944 } 2945 } 2946 2947 
noinline_for_stack 2948 static int check_nlinks_find_hardlinks(struct bch_fs *c, 2949 struct nlink_table *t, 2950 u64 start, u64 *end) 2951 { 2952 int ret = bch2_trans_run(c, 2953 for_each_btree_key(trans, iter, BTREE_ID_inodes, 2954 POS(0, start), 2955 BTREE_ITER_intent| 2956 BTREE_ITER_prefetch| 2957 BTREE_ITER_all_snapshots, k, ({ 2958 if (!bkey_is_inode(k.k)) 2959 continue; 2960 2961 /* Should never fail, checked by bch2_inode_invalid: */ 2962 struct bch_inode_unpacked u; 2963 _ret3 = bch2_inode_unpack(k, &u); 2964 if (_ret3) 2965 break; 2966 2967 /* 2968 * Backpointer and directory structure checks are sufficient for 2969 * directories, since they can't have hardlinks: 2970 */ 2971 if (S_ISDIR(u.bi_mode)) 2972 continue; 2973 2974 /* 2975 * Previous passes ensured that bi_nlink is nonzero if 2976 * it had multiple hardlinks: 2977 */ 2978 if (!u.bi_nlink) 2979 continue; 2980 2981 ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot); 2982 if (ret) { 2983 *end = k.k->p.offset; 2984 ret = 0; 2985 break; 2986 } 2987 0; 2988 }))); 2989 2990 bch_err_fn(c, ret); 2991 return ret; 2992 } 2993 2994 noinline_for_stack 2995 static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links, 2996 u64 range_start, u64 range_end) 2997 { 2998 struct snapshots_seen s; 2999 3000 snapshots_seen_init(&s); 3001 3002 int ret = bch2_trans_run(c, 3003 for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN, 3004 BTREE_ITER_intent| 3005 BTREE_ITER_prefetch| 3006 BTREE_ITER_all_snapshots, k, ({ 3007 ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p); 3008 if (ret) 3009 break; 3010 3011 if (k.k->type == KEY_TYPE_dirent) { 3012 struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); 3013 3014 if (d.v->d_type != DT_DIR && 3015 d.v->d_type != DT_SUBVOL) 3016 inc_link(c, &s, links, range_start, range_end, 3017 le64_to_cpu(d.v->d_inum), d.k->p.snapshot); 3018 } 3019 0; 3020 }))); 3021 3022 snapshots_seen_exit(&s); 3023 3024 bch_err_fn(c, ret); 3025 return ret; 3026 } 3027 
3028 static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter, 3029 struct bkey_s_c k, 3030 struct nlink_table *links, 3031 size_t *idx, u64 range_end) 3032 { 3033 struct bch_inode_unpacked u; 3034 struct nlink *link = &links->d[*idx]; 3035 int ret = 0; 3036 3037 if (k.k->p.offset >= range_end) 3038 return 1; 3039 3040 if (!bkey_is_inode(k.k)) 3041 return 0; 3042 3043 ret = bch2_inode_unpack(k, &u); 3044 if (ret) 3045 return ret; 3046 3047 if (S_ISDIR(u.bi_mode)) 3048 return 0; 3049 3050 if (!u.bi_nlink) 3051 return 0; 3052 3053 while ((cmp_int(link->inum, k.k->p.offset) ?: 3054 cmp_int(link->snapshot, k.k->p.snapshot)) < 0) { 3055 BUG_ON(*idx == links->nr); 3056 link = &links->d[++*idx]; 3057 } 3058 3059 if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, 3060 trans, inode_wrong_nlink, 3061 "inode %llu type %s has wrong i_nlink (%u, should be %u)", 3062 u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], 3063 bch2_inode_nlink_get(&u), link->count)) { 3064 bch2_inode_nlink_set(&u, link->count); 3065 ret = __bch2_fsck_write_inode(trans, &u); 3066 } 3067 fsck_err: 3068 return ret; 3069 } 3070 3071 noinline_for_stack 3072 static int check_nlinks_update_hardlinks(struct bch_fs *c, 3073 struct nlink_table *links, 3074 u64 range_start, u64 range_end) 3075 { 3076 size_t idx = 0; 3077 3078 int ret = bch2_trans_run(c, 3079 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, 3080 POS(0, range_start), 3081 BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, 3082 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 3083 check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end))); 3084 if (ret < 0) { 3085 bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret)); 3086 return ret; 3087 } 3088 3089 return 0; 3090 } 3091 3092 int bch2_check_nlinks(struct bch_fs *c) 3093 { 3094 struct nlink_table links = { 0 }; 3095 u64 this_iter_range_start, next_iter_range_start = 0; 3096 int ret = 0; 3097 3098 do { 3099 
this_iter_range_start = next_iter_range_start; 3100 next_iter_range_start = U64_MAX; 3101 3102 ret = check_nlinks_find_hardlinks(c, &links, 3103 this_iter_range_start, 3104 &next_iter_range_start); 3105 3106 ret = check_nlinks_walk_dirents(c, &links, 3107 this_iter_range_start, 3108 next_iter_range_start); 3109 if (ret) 3110 break; 3111 3112 ret = check_nlinks_update_hardlinks(c, &links, 3113 this_iter_range_start, 3114 next_iter_range_start); 3115 if (ret) 3116 break; 3117 3118 links.nr = 0; 3119 } while (next_iter_range_start != U64_MAX); 3120 3121 kvfree(links.d); 3122 bch_err_fn(c, ret); 3123 return ret; 3124 } 3125 3126 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, 3127 struct bkey_s_c k) 3128 { 3129 struct bkey_s_c_reflink_p p; 3130 struct bkey_i_reflink_p *u; 3131 3132 if (k.k->type != KEY_TYPE_reflink_p) 3133 return 0; 3134 3135 p = bkey_s_c_to_reflink_p(k); 3136 3137 if (!p.v->front_pad && !p.v->back_pad) 3138 return 0; 3139 3140 u = bch2_trans_kmalloc(trans, sizeof(*u)); 3141 int ret = PTR_ERR_OR_ZERO(u); 3142 if (ret) 3143 return ret; 3144 3145 bkey_reassemble(&u->k_i, k); 3146 u->v.front_pad = 0; 3147 u->v.back_pad = 0; 3148 3149 return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun); 3150 } 3151 3152 int bch2_fix_reflink_p(struct bch_fs *c) 3153 { 3154 if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) 3155 return 0; 3156 3157 int ret = bch2_trans_run(c, 3158 for_each_btree_key_commit(trans, iter, 3159 BTREE_ID_extents, POS_MIN, 3160 BTREE_ITER_intent|BTREE_ITER_prefetch| 3161 BTREE_ITER_all_snapshots, k, 3162 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 3163 fix_reflink_p_key(trans, &iter, k))); 3164 bch_err_fn(c, ret); 3165 return ret; 3166 } 3167 3168 #ifndef NO_BCACHEFS_CHARDEV 3169 3170 struct fsck_thread { 3171 struct thread_with_stdio thr; 3172 struct bch_fs *c; 3173 struct bch_opts opts; 3174 }; 3175 3176 static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr) 3177 { 3178 struct 
fsck_thread *thr = container_of(_thr, struct fsck_thread, thr); 3179 kfree(thr); 3180 } 3181 3182 static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) 3183 { 3184 struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); 3185 struct bch_fs *c = thr->c; 3186 3187 int ret = PTR_ERR_OR_ZERO(c); 3188 if (ret) 3189 return ret; 3190 3191 ret = bch2_fs_start(thr->c); 3192 if (ret) 3193 goto err; 3194 3195 if (test_bit(BCH_FS_errors_fixed, &c->flags)) { 3196 bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name); 3197 ret |= 1; 3198 } 3199 if (test_bit(BCH_FS_error, &c->flags)) { 3200 bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name); 3201 ret |= 4; 3202 } 3203 err: 3204 bch2_fs_stop(c); 3205 return ret; 3206 } 3207 3208 static const struct thread_with_stdio_ops bch2_offline_fsck_ops = { 3209 .exit = bch2_fsck_thread_exit, 3210 .fn = bch2_fsck_offline_thread_fn, 3211 }; 3212 3213 long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) 3214 { 3215 struct bch_ioctl_fsck_offline arg; 3216 struct fsck_thread *thr = NULL; 3217 darray_const_str devs = {}; 3218 long ret = 0; 3219 3220 if (copy_from_user(&arg, user_arg, sizeof(arg))) 3221 return -EFAULT; 3222 3223 if (arg.flags) 3224 return -EINVAL; 3225 3226 if (!capable(CAP_SYS_ADMIN)) 3227 return -EPERM; 3228 3229 for (size_t i = 0; i < arg.nr_devs; i++) { 3230 u64 dev_u64; 3231 ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64)); 3232 if (ret) 3233 goto err; 3234 3235 char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX); 3236 ret = PTR_ERR_OR_ZERO(dev_str); 3237 if (ret) 3238 goto err; 3239 3240 ret = darray_push(&devs, dev_str); 3241 if (ret) { 3242 kfree(dev_str); 3243 goto err; 3244 } 3245 } 3246 3247 thr = kzalloc(sizeof(*thr), GFP_KERNEL); 3248 if (!thr) { 3249 ret = -ENOMEM; 3250 goto err; 3251 } 3252 3253 thr->opts = bch2_opts_empty(); 3254 3255 if (arg.opts) { 
3256 char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); 3257 ret = PTR_ERR_OR_ZERO(optstr) ?: 3258 bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr, false); 3259 if (!IS_ERR(optstr)) 3260 kfree(optstr); 3261 3262 if (ret) 3263 goto err; 3264 } 3265 3266 opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); 3267 opt_set(thr->opts, read_only, 1); 3268 opt_set(thr->opts, ratelimit_errors, 0); 3269 3270 /* We need request_key() to be called before we punt to kthread: */ 3271 opt_set(thr->opts, nostart, true); 3272 3273 bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops); 3274 3275 thr->c = bch2_fs_open(&devs, &thr->opts); 3276 3277 if (!IS_ERR(thr->c) && 3278 thr->c->opts.errors == BCH_ON_ERROR_panic) 3279 thr->c->opts.errors = BCH_ON_ERROR_ro; 3280 3281 ret = __bch2_run_thread_with_stdio(&thr->thr); 3282 out: 3283 darray_for_each(devs, i) 3284 kfree(*i); 3285 darray_exit(&devs); 3286 return ret; 3287 err: 3288 if (thr) 3289 bch2_fsck_thread_exit(&thr->thr); 3290 pr_err("ret %s", bch2_err_str(ret)); 3291 goto out; 3292 } 3293 3294 static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) 3295 { 3296 struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); 3297 struct bch_fs *c = thr->c; 3298 3299 c->stdio_filter = current; 3300 c->stdio = &thr->thr.stdio; 3301 3302 /* 3303 * XXX: can we figure out a way to do this without mucking with c->opts? 
3304 */ 3305 unsigned old_fix_errors = c->opts.fix_errors; 3306 if (opt_defined(thr->opts, fix_errors)) 3307 c->opts.fix_errors = thr->opts.fix_errors; 3308 else 3309 c->opts.fix_errors = FSCK_FIX_ask; 3310 3311 c->opts.fsck = true; 3312 set_bit(BCH_FS_in_fsck, &c->flags); 3313 3314 int ret = bch2_run_online_recovery_passes(c, ~0ULL); 3315 3316 clear_bit(BCH_FS_in_fsck, &c->flags); 3317 bch_err_fn(c, ret); 3318 3319 c->stdio = NULL; 3320 c->stdio_filter = NULL; 3321 c->opts.fix_errors = old_fix_errors; 3322 3323 up(&c->recovery.run_lock); 3324 bch2_ro_ref_put(c); 3325 return ret; 3326 } 3327 3328 static const struct thread_with_stdio_ops bch2_online_fsck_ops = { 3329 .exit = bch2_fsck_thread_exit, 3330 .fn = bch2_fsck_online_thread_fn, 3331 }; 3332 3333 long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg) 3334 { 3335 struct fsck_thread *thr = NULL; 3336 long ret = 0; 3337 3338 if (arg.flags) 3339 return -EINVAL; 3340 3341 if (!capable(CAP_SYS_ADMIN)) 3342 return -EPERM; 3343 3344 if (!bch2_ro_ref_tryget(c)) 3345 return -EROFS; 3346 3347 if (down_trylock(&c->recovery.run_lock)) { 3348 bch2_ro_ref_put(c); 3349 return -EAGAIN; 3350 } 3351 3352 thr = kzalloc(sizeof(*thr), GFP_KERNEL); 3353 if (!thr) { 3354 ret = -ENOMEM; 3355 goto err; 3356 } 3357 3358 thr->c = c; 3359 thr->opts = bch2_opts_empty(); 3360 3361 if (arg.opts) { 3362 char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); 3363 3364 ret = PTR_ERR_OR_ZERO(optstr) ?: 3365 bch2_parse_mount_opts(c, &thr->opts, NULL, optstr, false); 3366 if (!IS_ERR(optstr)) 3367 kfree(optstr); 3368 3369 if (ret) 3370 goto err; 3371 } 3372 3373 ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops); 3374 err: 3375 if (ret < 0) { 3376 bch_err_fn(c, ret); 3377 if (thr) 3378 bch2_fsck_thread_exit(&thr->thr); 3379 up(&c->recovery.run_lock); 3380 bch2_ro_ref_put(c); 3381 } 3382 return ret; 3383 } 3384 3385 #endif /* NO_BCACHEFS_CHARDEV */ 3386