// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "backpointers.h"
#include "bkey_buf.h"
#include "alloc_background.h"
#include "btree_gc.h"
#include "btree_journal_iter.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "buckets.h"
#include "dirent.h"
#include "ec.h"
#include "errcode.h"
#include "error.h"
#include "fs-common.h"
#include "fsck.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "lru.h"
#include "logged_ops.h"
#include "move.h"
#include "quota.h"
#include "rebalance.h"
#include "recovery.h"
#include "replicas.h"
#include "sb-clean.h"
#include "sb-downgrade.h"
#include "snapshot.h"
#include "subvolume.h"
#include "super-io.h"

#include <linux/sort.h>
#include <linux/stat.h>

#define QSTR(n) { { { .len = strlen(n) } }, .name = n }

static bool btree_id_is_alloc(enum btree_id id)
{
	switch (id) {
	case BTREE_ID_alloc:
	case BTREE_ID_backpointers:
	case BTREE_ID_need_discard:
	case BTREE_ID_freespace:
	case BTREE_ID_bucket_gens:
		return true;
	default:
		return false;
	}
}

/* for -o reconstruct_alloc: */
static void do_reconstruct_alloc(struct bch_fs *c)
{
	bch2_journal_log_msg(c, "dropping alloc info");
	bch_info(c, "dropping and reconstructing all alloc info");

	mutex_lock(&c->sb_lock);
	struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);

	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required);
	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_info, ext->recovery_passes_required);
	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_lrus, ext->recovery_passes_required);
	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_extents_to_backpointers, ext->recovery_passes_required);
	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_to_lru_refs, ext->recovery_passes_required);

	__set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_alloc_key, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_stale_dirty_ptr, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_freespace_key_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_bucket_gens_key_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent);
	c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);

	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));

	struct journal_keys *keys = &c->journal_keys;
	size_t src, dst;

	move_gap(keys, keys->nr);

	for (src = 0, dst = 0; src < keys->nr; src++)
		if (!btree_id_is_alloc(keys->data[src].btree_id))
			keys->data[dst++] = keys->data[src];
	keys->nr = keys->gap = dst;
}

/*
 * Btree node pointers have a field to stack a pointer to the in memory btree
 * node; we need to zero out this field when reading in btree nodes, or when
 * reading in keys from the journal:
 */
static void zero_out_btree_mem_ptr(struct journal_keys *keys)
{
	darray_for_each(*keys, i)
		if (i->k->k.type == KEY_TYPE_btree_ptr_v2)
			bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0;
}

/* journal replay: */

static void replay_now_at(struct journal *j, u64 seq)
{
	BUG_ON(seq < j->replay_journal_seq);

	seq = min(seq, j->replay_journal_seq_end);

	while (j->replay_journal_seq < seq)
		bch2_journal_pin_put(j, j->replay_journal_seq++);
}

static int bch2_journal_replay_key(struct btree_trans *trans,
				   struct journal_key *k)
{
	struct btree_iter iter;
	unsigned iter_flags =
		BTREE_ITER_INTENT|
		BTREE_ITER_NOT_EXTENTS;
	unsigned update_flags = BTREE_TRIGGER_NORUN;
	int ret;

	if (k->overwritten)
		return 0;

	trans->journal_res.seq = k->journal_seq;

	/*
	 * BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to
	 * keep the key cache coherent with the underlying btree. Nothing
	 * besides the allocator is doing updates yet so we don't need key cache
	 * coherency for non-alloc btrees, and key cache fills for snapshots
	 * btrees use BTREE_ITER_FILTER_SNAPSHOTS, which isn't available until
	 * the snapshots recovery pass runs.
	 */
	if (!k->level && k->btree_id == BTREE_ID_alloc)
		iter_flags |= BTREE_ITER_CACHED;
	else
		update_flags |= BTREE_UPDATE_KEY_CACHE_RECLAIM;

	bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
				  BTREE_MAX_DEPTH, k->level,
				  iter_flags);
	ret = bch2_btree_iter_traverse(&iter);
	if (ret)
		goto out;

	struct btree_path *path = btree_iter_path(trans, &iter);
	if (unlikely(!btree_path_node(path, k->level))) {
		bch2_trans_iter_exit(trans, &iter);
		bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
					  BTREE_MAX_DEPTH, 0, iter_flags);
		ret = bch2_btree_iter_traverse(&iter) ?:
			bch2_btree_increase_depth(trans, iter.path, 0) ?:
			-BCH_ERR_transaction_restart_nested;
		goto out;
	}

	/* Must be checked with btree locked: */
	if (k->overwritten)
		goto out;

	ret = bch2_trans_update(trans, &iter, k->k, update_flags);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static int journal_sort_seq_cmp(const void *_l, const void *_r)
{
	const struct journal_key *l = *((const struct journal_key **)_l);
	const struct journal_key *r = *((const struct journal_key **)_r);

	return cmp_int(l->journal_seq, r->journal_seq);
}

static int bch2_journal_replay(struct bch_fs *c)
{
	struct journal_keys *keys = &c->journal_keys;
	DARRAY(struct journal_key *) keys_sorted = { 0 };
	struct journal *j = &c->journal;
	u64 start_seq = c->journal_replay_seq_start;
	u64 end_seq = c->journal_replay_seq_end;
	struct btree_trans *trans = bch2_trans_get(c);
	int ret = 0;

	if (keys->nr) {
		ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)",
					   keys->nr, start_seq, end_seq);
		if (ret)
			goto err;
	}

	BUG_ON(!atomic_read(&keys->ref));

	move_gap(keys, keys->nr);

	/*
	 * First, attempt to replay keys in sorted order. This is more
	 * efficient - better locality of btree access - but some might fail if
	 * that would cause a journal deadlock.
	 */
	darray_for_each(*keys, k) {
		cond_resched();

		/* Skip fastpath if we're low on space in the journal */
		ret = c->journal.watermark ? -1 :
			commit_do(trans, NULL, NULL,
				  BCH_TRANS_COMMIT_no_enospc|
				  BCH_TRANS_COMMIT_journal_reclaim|
				  (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0),
				  bch2_journal_replay_key(trans, k));
		BUG_ON(!ret && !k->overwritten);
		if (ret) {
			ret = darray_push(&keys_sorted, k);
			if (ret)
				goto err;
		}
	}

	/*
	 * Now, replay any remaining keys in the order in which they appear in
	 * the journal, unpinning those journal entries as we go:
	 */
	sort(keys_sorted.data, keys_sorted.nr,
	     sizeof(keys_sorted.data[0]),
	     journal_sort_seq_cmp, NULL);

	darray_for_each(keys_sorted, kp) {
		cond_resched();

		struct journal_key *k = *kp;

		replay_now_at(j, k->journal_seq);

		ret = commit_do(trans, NULL, NULL,
				BCH_TRANS_COMMIT_no_enospc|
				(!k->allocated
				 ? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim
				 : 0),
				bch2_journal_replay_key(trans, k));
		bch_err_msg(c, ret, "while replaying key at btree %s level %u:",
			    bch2_btree_id_str(k->btree_id), k->level);
		if (ret)
			goto err;

		BUG_ON(!k->overwritten);
	}

	/*
	 * We need to put our btree_trans before calling flush_all_pins(), since
	 * that will use a btree_trans internally
	 */
	bch2_trans_put(trans);
	trans = NULL;

	if (!c->opts.keep_journal)
		bch2_journal_keys_put_initial(c);

	replay_now_at(j, j->replay_journal_seq_end);
	j->replay_journal_seq = 0;

	bch2_journal_set_replay_done(j);

	if (keys->nr)
		bch2_journal_log_msg(c, "journal replay finished");
err:
	if (trans)
		bch2_trans_put(trans);
	darray_exit(&keys_sorted);
	bch_err_fn(c, ret);
	return ret;
}

/* journal replay early: */

static int journal_replay_entry_early(struct bch_fs *c,
				      struct jset_entry *entry)
{
	int ret = 0;

	switch (entry->type) {
	case BCH_JSET_ENTRY_btree_root: {
		struct btree_root *r;

		while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) {
			ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL });
			if (ret)
				return ret;
		}

		r = bch2_btree_id_root(c, entry->btree_id);

		if (entry->u64s) {
			r->level = entry->level;
			bkey_copy(&r->key, (struct bkey_i *) entry->start);
			r->error = 0;
		} else {
			r->error = -BCH_ERR_btree_node_read_error;
		}
		r->alive = true;
		break;
	}
	case BCH_JSET_ENTRY_usage: {
		struct jset_entry_usage *u =
			container_of(entry, struct jset_entry_usage, entry);

		switch (entry->btree_id) {
		case BCH_FS_USAGE_reserved:
			if (entry->level < BCH_REPLICAS_MAX)
				c->usage_base->persistent_reserved[entry->level] =
					le64_to_cpu(u->v);
			break;
		case BCH_FS_USAGE_inodes:
			c->usage_base->b.nr_inodes = le64_to_cpu(u->v);
			break;
		case BCH_FS_USAGE_key_version:
			atomic64_set(&c->key_version,
				     le64_to_cpu(u->v));
			break;
		}

		break;
	}
	case BCH_JSET_ENTRY_data_usage: {
		struct jset_entry_data_usage *u =
			container_of(entry, struct jset_entry_data_usage, entry);

		ret = bch2_replicas_set_usage(c, &u->r,
					      le64_to_cpu(u->v));
		break;
	}
	case BCH_JSET_ENTRY_dev_usage: {
		struct jset_entry_dev_usage *u =
			container_of(entry, struct jset_entry_dev_usage, entry);
		struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev));
		unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);

		for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) {
			ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets);
			ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors);
			ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented);
		}

		break;
	}
	case BCH_JSET_ENTRY_blacklist: {
		struct jset_entry_blacklist *bl_entry =
			container_of(entry, struct jset_entry_blacklist, entry);

		ret = bch2_journal_seq_blacklist_add(c,
				le64_to_cpu(bl_entry->seq),
				le64_to_cpu(bl_entry->seq) + 1);
		break;
	}
	case BCH_JSET_ENTRY_blacklist_v2: {
		struct jset_entry_blacklist_v2 *bl_entry =
			container_of(entry, struct jset_entry_blacklist_v2, entry);

		ret = bch2_journal_seq_blacklist_add(c,
				le64_to_cpu(bl_entry->start),
				le64_to_cpu(bl_entry->end) + 1);
		break;
	}
	case BCH_JSET_ENTRY_clock: {
		struct jset_entry_clock *clock =
			container_of(entry, struct jset_entry_clock, entry);

		atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time));
	}
	}

	return ret;
}

static int journal_replay_early(struct bch_fs *c,
				struct bch_sb_field_clean *clean)
{
	if (clean) {
		for (struct jset_entry *entry = clean->start;
		     entry != vstruct_end(&clean->field);
		     entry = vstruct_next(entry)) {
			int ret = journal_replay_entry_early(c, entry);
			if (ret)
				return ret;
		}
	} else {
		struct genradix_iter iter;
		struct journal_replay *i, **_i;

		genradix_for_each(&c->journal_entries, iter, _i) {
			i = *_i;

			if (journal_replay_ignore(i))
				continue;

			vstruct_for_each(&i->j, entry) {
				int ret = journal_replay_entry_early(c, entry);
				if (ret)
					return ret;
			}
		}
	}

	bch2_fs_usage_initialize(c);

	return 0;
}

/* sb clean section: */

static int read_btree_roots(struct bch_fs *c)
{
	unsigned i;
	int ret = 0;

	for (i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (!r->alive)
			continue;

		if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc)
			continue;

		if (r->error) {
			__fsck_err(c,
				   btree_id_is_alloc(i)
				   ? FSCK_CAN_IGNORE : 0,
				   btree_root_bkey_invalid,
				   "invalid btree root %s",
				   bch2_btree_id_str(i));
			if (i == BTREE_ID_alloc)
				c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
		}

		ret = bch2_btree_root_read(c, i, &r->key, r->level);
		if (ret) {
			fsck_err(c,
				 btree_root_read_error,
				 "error reading btree root %s",
				 bch2_btree_id_str(i));
			if (btree_id_is_alloc(i))
				c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
			ret = 0;
		}
	}

	for (i = 0; i < BTREE_ID_NR; i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (!r->b) {
			r->alive = false;
			r->level = 0;
			bch2_btree_root_alloc(c, i);
		}
	}
fsck_err:
	return ret;
}

static int bch2_initialize_subvolumes(struct bch_fs *c)
{
	struct bkey_i_snapshot_tree root_tree;
	struct bkey_i_snapshot root_snapshot;
	struct bkey_i_subvolume root_volume;
	int ret;

	bkey_snapshot_tree_init(&root_tree.k_i);
	root_tree.k.p.offset = 1;
	root_tree.v.master_subvol = cpu_to_le32(1);
	root_tree.v.root_snapshot = cpu_to_le32(U32_MAX);

	bkey_snapshot_init(&root_snapshot.k_i);
	root_snapshot.k.p.offset = U32_MAX;
	root_snapshot.v.flags = 0;
	root_snapshot.v.parent = 0;
	root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL);
	root_snapshot.v.tree = cpu_to_le32(1);
	SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true);

	bkey_subvolume_init(&root_volume.k_i);
	root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL;
	root_volume.v.flags = 0;
	root_volume.v.snapshot = cpu_to_le32(U32_MAX);
	root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO);

	ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?:
		bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?:
		bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0);
	bch_err_fn(c, ret);
	return ret;
}

static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bch_inode_unpacked inode;
	int ret;

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
			       SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0);
	ret = bkey_err(k);
	if (ret)
		return ret;

	if (!bkey_is_inode(k.k)) {
		bch_err(trans->c, "root inode not found");
		ret = -BCH_ERR_ENOENT_inode;
		goto err;
	}

	ret = bch2_inode_unpack(k, &inode);
	BUG_ON(ret);

	inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;

	ret = bch2_inode_write(trans, &iter, &inode);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/* set bi_subvol on root inode */
noinline_for_stack
static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
{
	int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
				__bch2_fs_upgrade_for_subvolumes(trans));
	bch_err_fn(c, ret);
	return ret;
}

const char * const bch2_recovery_passes[] = {
#define x(_fn, ...)	#_fn,
	BCH_RECOVERY_PASSES()
#undef x
	NULL
};

static int bch2_check_allocations(struct bch_fs *c)
{
	return bch2_gc(c, true, c->opts.norecovery);
}

static int bch2_set_may_go_rw(struct bch_fs *c)
{
	struct journal_keys *keys = &c->journal_keys;

	/*
	 * After we go RW, the journal keys buffer can't be modified (except for
	 * setting journal_key->overwritten), since it will be accessed by
	 * multiple threads:
	 */
	move_gap(keys, keys->nr);

	set_bit(BCH_FS_may_go_rw, &c->flags);

	if (keys->nr || c->opts.fsck || !c->sb.clean)
		return bch2_fs_read_write_early(c);
	return 0;
}

struct recovery_pass_fn {
	int (*fn)(struct bch_fs *);
	unsigned when;
};

static struct recovery_pass_fn recovery_pass_fns[] = {
#define x(_fn, _id, _when)	{ .fn = bch2_##_fn, .when = _when },
	BCH_RECOVERY_PASSES()
#undef x
};

u64 bch2_recovery_passes_to_stable(u64 v)
{
	static const u8 map[] = {
#define x(n, id, ...)	[BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n,
	BCH_RECOVERY_PASSES()
#undef x
	};

	u64 ret = 0;
	for (unsigned i = 0; i < ARRAY_SIZE(map); i++)
		if (v & BIT_ULL(i))
			ret |= BIT_ULL(map[i]);
	return ret;
}

u64 bch2_recovery_passes_from_stable(u64 v)
{
	static const u8 map[] = {
#define x(n, id, ...)	[BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n,
	BCH_RECOVERY_PASSES()
#undef x
	};

	u64 ret = 0;
	for (unsigned i = 0; i < ARRAY_SIZE(map); i++)
		if (v & BIT_ULL(i))
			ret |= BIT_ULL(map[i]);
	return ret;
}

static bool check_version_upgrade(struct bch_fs *c)
{
	unsigned latest_version = bcachefs_metadata_version_current;
	unsigned latest_compatible = min(latest_version,
					 bch2_latest_compatible_version(c->sb.version));
	unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
	unsigned new_version = 0;

	if (old_version < bcachefs_metadata_required_upgrade_below) {
		if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible ||
		    latest_compatible < bcachefs_metadata_required_upgrade_below)
			new_version = latest_version;
		else
			new_version = latest_compatible;
	} else {
		switch (c->opts.version_upgrade) {
		case BCH_VERSION_UPGRADE_compatible:
			new_version = latest_compatible;
			break;
		case BCH_VERSION_UPGRADE_incompatible:
			new_version = latest_version;
			break;
		case BCH_VERSION_UPGRADE_none:
			new_version = min(old_version, latest_version);
			break;
		}
	}

	if (new_version > old_version) {
		struct printbuf buf = PRINTBUF;

		if (old_version < bcachefs_metadata_required_upgrade_below)
			prt_str(&buf, "Version upgrade required:\n");

		if (old_version != c->sb.version) {
			prt_str(&buf, "Version upgrade from ");
			bch2_version_to_text(&buf, c->sb.version_upgrade_complete);
			prt_str(&buf, " to ");
			bch2_version_to_text(&buf, c->sb.version);
			prt_str(&buf, " incomplete\n");
		}

		prt_printf(&buf, "Doing %s version upgrade from ",
			   BCH_VERSION_MAJOR(old_version) != BCH_VERSION_MAJOR(new_version)
			   ? "incompatible" : "compatible");
		bch2_version_to_text(&buf, old_version);
		prt_str(&buf, " to ");
		bch2_version_to_text(&buf, new_version);
		prt_newline(&buf);

		struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
		__le64 passes = ext->recovery_passes_required[0];
		bch2_sb_set_upgrade(c, old_version, new_version);
		passes = ext->recovery_passes_required[0] & ~passes;

		if (passes) {
			prt_str(&buf, " running recovery passes: ");
			prt_bitflags(&buf, bch2_recovery_passes,
				     bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
		}

		bch_info(c, "%s", buf.buf);

		bch2_sb_upgrade(c, new_version);

		printbuf_exit(&buf);
		return true;
	}

	return false;
}

u64 bch2_fsck_recovery_passes(void)
{
	u64 ret = 0;

	for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++)
		if (recovery_pass_fns[i].when & PASS_FSCK)
			ret |= BIT_ULL(i);
	return ret;
}

static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
{
	struct recovery_pass_fn *p = recovery_pass_fns + pass;

	if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read)
		return false;
	if (c->recovery_passes_explicit & BIT_ULL(pass))
		return true;
	if ((p->when & PASS_FSCK) && c->opts.fsck)
		return true;
	if ((p->when & PASS_UNCLEAN) && !c->sb.clean)
		return true;
	if (p->when & PASS_ALWAYS)
		return true;
	return false;
}

static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
{
	struct recovery_pass_fn *p = recovery_pass_fns + pass;
	int ret;

	if (!(p->when & PASS_SILENT))
		bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."),
			   bch2_recovery_passes[pass]);
	ret = p->fn(c);
	if (ret)
		return ret;
	if (!(p->when & PASS_SILENT))
		bch2_print(c, KERN_CONT " done\n");

	return 0;
}

static int bch2_run_recovery_passes(struct bch_fs *c)
{
	int ret = 0;

	while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) {
		if (should_run_recovery_pass(c, c->curr_recovery_pass)) {
			unsigned pass = c->curr_recovery_pass;

			ret = bch2_run_recovery_pass(c, c->curr_recovery_pass);
			if (bch2_err_matches(ret, BCH_ERR_restart_recovery) ||
			    (ret && c->curr_recovery_pass < pass))
				continue;
			if (ret)
				break;

			c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass);
		}
		c->curr_recovery_pass++;
		c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass);
	}

	return ret;
}

int bch2_run_online_recovery_passes(struct bch_fs *c)
{
	int ret = 0;

	for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) {
		struct recovery_pass_fn *p = recovery_pass_fns + i;

		if (!(p->when & PASS_ONLINE))
			continue;

		ret = bch2_run_recovery_pass(c, i);
		if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) {
			i = c->curr_recovery_pass;
			continue;
		}
		if (ret)
			break;
	}

	return ret;
}

int bch2_fs_recovery(struct bch_fs *c)
{
	struct bch_sb_field_clean *clean = NULL;
	struct jset *last_journal_entry = NULL;
	u64 last_seq = 0, blacklist_seq, journal_seq;
	int ret = 0;

	if (c->sb.clean) {
		clean = bch2_read_superblock_clean(c);
		ret = PTR_ERR_OR_ZERO(clean);
		if (ret)
			goto err;

		bch_info(c, "recovering from clean shutdown, journal seq %llu",
			 le64_to_cpu(clean->journal_seq));
	} else {
		bch_info(c, "recovering from unclean shutdown");
	}

	if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) {
		bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported");
		ret = -EINVAL;
		goto err;
	}

	if (!c->sb.clean &&
	    !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) {
		bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix");
		ret = -EINVAL;
		goto err;
	}

	if (c->opts.fsck && c->opts.norecovery) {
		bch_err(c, "cannot select both norecovery and fsck");
		ret = -EINVAL;
		goto err;
	}

	if (!c->opts.nochanges) {
		mutex_lock(&c->sb_lock);
		bool write_sb = false;

		struct bch_sb_field_ext *ext =
			bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64));
		if (!ext) {
			ret = -BCH_ERR_ENOSPC_sb;
			mutex_unlock(&c->sb_lock);
			goto err;
		}

		if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) {
			ext->recovery_passes_required[0] |=
				cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology)));
			write_sb = true;
		}

		u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
		if (sb_passes) {
			struct printbuf buf = PRINTBUF;
			prt_str(&buf, "superblock requires following recovery passes to be run:\n  ");
			prt_bitflags(&buf, bch2_recovery_passes, sb_passes);
			bch_info(c, "%s", buf.buf);
			printbuf_exit(&buf);
		}

		if (bch2_check_version_downgrade(c)) {
			struct printbuf buf = PRINTBUF;

			prt_str(&buf, "Version downgrade required:");

			__le64 passes = ext->recovery_passes_required[0];
			bch2_sb_set_downgrade(c,
					BCH_VERSION_MINOR(bcachefs_metadata_version_current),
					BCH_VERSION_MINOR(c->sb.version));
			passes = ext->recovery_passes_required[0] & ~passes;
			if (passes) {
				prt_str(&buf, "\n  running recovery passes: ");
				prt_bitflags(&buf, bch2_recovery_passes,
					     bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
			}

			bch_info(c, "%s", buf.buf);
			printbuf_exit(&buf);
			write_sb = true;
		}

		if (check_version_upgrade(c))
			write_sb = true;

		if (write_sb)
			bch2_write_super(c);

		c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
		mutex_unlock(&c->sb_lock);
	}

	if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
		c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);

	if (c->opts.fsck)
		set_bit(BCH_FS_fsck_running, &c->flags);

	ret = bch2_blacklist_table_initialize(c);
	if (ret) {
		bch_err(c, "error initializing blacklist table");
		goto err;
	}

	if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
		struct genradix_iter iter;
		struct journal_replay **i;

		bch_verbose(c, "starting journal read");
		ret = bch2_journal_read(c, &last_seq, &blacklist_seq, &journal_seq);
		if (ret)
			goto err;

		/*
		 * note: cmd_list_journal needs the blacklist table fully up to date so
		 * it can asterisk ignored journal entries:
		 */
		if (c->opts.read_journal_only)
			goto out;

		genradix_for_each_reverse(&c->journal_entries, iter, i)
			if (!journal_replay_ignore(*i)) {
				last_journal_entry = &(*i)->j;
				break;
			}

		if (mustfix_fsck_err_on(c->sb.clean &&
					last_journal_entry &&
					!journal_entry_empty(last_journal_entry), c,
				clean_but_journal_not_empty,
				"filesystem marked clean but journal not empty")) {
			c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
			SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
			c->sb.clean = false;
		}

		if (!last_journal_entry) {
			fsck_err_on(!c->sb.clean, c,
				    dirty_but_no_journal_entries,
				    "no journal entries found");
			if (clean)
				goto use_clean;

			genradix_for_each_reverse(&c->journal_entries, iter, i)
				if (*i) {
					last_journal_entry = &(*i)->j;
					(*i)->ignore_blacklisted = false;
					(*i)->ignore_not_dirty = false;
					/*
					 * This was probably a NO_FLUSH entry,
					 * so last_seq was garbage - but we know
					 * we're only using a single journal
					 * entry, set it here:
					 */
					(*i)->j.last_seq = (*i)->j.seq;
					break;
				}
		}

		ret = bch2_journal_keys_sort(c);
		if (ret)
			goto err;

		if (c->sb.clean && last_journal_entry) {
			ret = bch2_verify_superblock_clean(c, &clean,
							   last_journal_entry);
			if (ret)
				goto err;
		}
	} else {
use_clean:
		if (!clean) {
			bch_err(c, "no superblock clean section found");
			ret = -BCH_ERR_fsck_repair_impossible;
			goto err;
		}
		blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1;
	}

	c->journal_replay_seq_start = last_seq;
	c->journal_replay_seq_end = blacklist_seq - 1;

	if (c->opts.reconstruct_alloc)
		do_reconstruct_alloc(c);

	zero_out_btree_mem_ptr(&c->journal_keys);

	ret = journal_replay_early(c, clean);
	if (ret)
		goto err;

	/*
	 * After an unclean shutdown, skip the next few journal sequence
	 * numbers as they may have been referenced by btree writes that
	 * happened before their corresponding journal writes - those btree
	 * writes need to be ignored, by skipping and blacklisting the next few
	 * journal sequence numbers:
	 */
	if (!c->sb.clean)
		journal_seq += 8;

	if (blacklist_seq != journal_seq) {
		ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu",
					   blacklist_seq, journal_seq) ?:
			bch2_journal_seq_blacklist_add(c,
					blacklist_seq, journal_seq);
		if (ret) {
			bch_err_msg(c, ret, "error creating new journal seq blacklist entry");
			goto err;
		}
	}

	ret = bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu",
				   journal_seq, last_seq, blacklist_seq - 1) ?:
		bch2_fs_journal_start(&c->journal, journal_seq);
	if (ret)
		goto err;

	/*
	 * Skip past versions that might have possibly been used (as nonces),
	 * but hadn't had their pointers written:
	 */
	if (c->sb.encryption_type && !c->sb.clean)
		atomic64_add(1 << 16, &c->key_version);

	ret = read_btree_roots(c);
	if (ret)
		goto err;

	ret = bch2_run_recovery_passes(c);
	if (ret)
		goto err;

	clear_bit(BCH_FS_fsck_running, &c->flags);

	/* If we fixed errors, verify that fs is actually clean now: */
	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
	    test_bit(BCH_FS_errors_fixed, &c->flags) &&
	    !test_bit(BCH_FS_errors_not_fixed, &c->flags) &&
	    !test_bit(BCH_FS_error, &c->flags)) {
		bch2_flush_fsck_errs(c);

		bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean");
		clear_bit(BCH_FS_errors_fixed, &c->flags);

		c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;

		ret = bch2_run_recovery_passes(c);
		if (ret)
			goto err;

		if (test_bit(BCH_FS_errors_fixed, &c->flags) ||
		    test_bit(BCH_FS_errors_not_fixed, &c->flags)) {
			bch_err(c, "Second fsck run was not clean");
			set_bit(BCH_FS_errors_not_fixed, &c->flags);
		}

		set_bit(BCH_FS_errors_fixed, &c->flags);
	}

	if (enabled_qtypes(c)) {
		bch_verbose(c, "reading quotas");
		ret = bch2_fs_quota_read(c);
		if (ret)
			goto err;
		bch_verbose(c, "quotas done");
	}

	mutex_lock(&c->sb_lock);
	bool write_sb = false;

	if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) {
		SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, le16_to_cpu(c->disk_sb.sb->version));
		write_sb = true;
	}

	if (!test_bit(BCH_FS_error, &c->flags) &&
	    !(c->disk_sb.sb->compat[0] & cpu_to_le64(1ULL << BCH_COMPAT_alloc_info))) {
		c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
		write_sb = true;
	}

	if (!test_bit(BCH_FS_error, &c->flags)) {
		struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
		if (ext &&
		    (!bch2_is_zero(ext->recovery_passes_required, sizeof(ext->recovery_passes_required)) ||
		     !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent)))) {
			memset(ext->recovery_passes_required, 0, sizeof(ext->recovery_passes_required));
			memset(ext->errors_silent, 0, sizeof(ext->errors_silent));
			write_sb = true;
		}
	}

	if (c->opts.fsck &&
	    !test_bit(BCH_FS_error, &c->flags) &&
	    !test_bit(BCH_FS_errors_not_fixed, &c->flags)) {
		SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0);
		SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0);
		write_sb = true;
	}

	if (write_sb)
		bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
	    c->sb.version_min < bcachefs_metadata_version_btree_ptr_sectors_written) {
		struct bch_move_stats stats;

		bch2_move_stats_init(&stats, "recovery");

		struct printbuf buf = PRINTBUF;
		bch2_version_to_text(&buf, c->sb.version_min);
		bch_info(c, "scanning for old btree nodes: min_version %s", buf.buf);
		printbuf_exit(&buf);

		ret = bch2_fs_read_write_early(c) ?:
			bch2_scan_old_btree_nodes(c, &stats);
		if (ret)
			goto err;
		bch_info(c, "scanning for old btree nodes done");
	}

	if (c->journal_seq_blacklist_table &&
	    c->journal_seq_blacklist_table->nr > 128)
		queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work);

	ret = 0;
out:
	bch2_flush_fsck_errs(c);

	if (!c->opts.keep_journal &&
	    test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
		bch2_journal_keys_put_initial(c);
	kfree(clean);

	if (!ret &&
	    test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) &&
	    !c->opts.nochanges) {
		bch2_fs_read_write_early(c);
		bch2_delete_dead_snapshots_async(c);
	}

	bch_err_fn(c, ret);
	return ret;
err:
fsck_err:
	bch2_fs_emergency_read_only(c);
	goto out;
}

int bch2_fs_initialize(struct bch_fs *c)
{
	struct bch_inode_unpacked root_inode, lostfound_inode;
	struct bkey_inode_buf packed_inode;
	struct qstr lostfound = QSTR("lost+found");
	int ret;

	bch_notice(c, "initializing new filesystem");

	mutex_lock(&c->sb_lock);
	c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
	c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);

	bch2_check_version_downgrade(c);

	if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) {
		bch2_sb_upgrade(c, bcachefs_metadata_version_current);
		SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
		bch2_write_super(c);
	}
	mutex_unlock(&c->sb_lock);

	c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns);
	set_bit(BCH_FS_may_go_rw, &c->flags);

	for (unsigned i = 0; i < BTREE_ID_NR; i++)
		bch2_btree_root_alloc(c, i);

	for_each_member_device(c, ca)
		bch2_dev_usage_init(ca);

	ret = bch2_fs_journal_alloc(c);
	if (ret)
		goto err;

	/*
	 * journal_res_get() will crash if called before this has
	 * set up the journal.pin FIFO and journal.cur pointer:
	 */
	bch2_fs_journal_start(&c->journal, 1);
	bch2_journal_set_replay_done(&c->journal);

	ret = bch2_fs_read_write_early(c);
	if (ret)
		goto err;

	/*
	 * Write out the superblock and journal buckets, now that we can do
	 * btree updates
	 */
	bch_verbose(c, "marking superblocks");
	ret = bch2_trans_mark_dev_sbs(c);
	bch_err_msg(c, ret, "marking superblocks");
	if (ret)
		goto err;

	for_each_online_member(c, ca)
		ca->new_fs_bucket_idx = 0;

	ret = bch2_fs_freespace_init(c);
	if (ret)
		goto err;

	ret = bch2_initialize_subvolumes(c);
	if (ret)
		goto err;

	bch_verbose(c, "reading snapshots table");
	ret = bch2_snapshots_read(c);
	if (ret)
		goto err;
	bch_verbose(c, "reading snapshots done");

	bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 0, NULL);
	root_inode.bi_inum = BCACHEFS_ROOT_INO;
	root_inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
	bch2_inode_pack(&packed_inode, &root_inode);
	packed_inode.inode.k.p.snapshot = U32_MAX;

	ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0);
	bch_err_msg(c, ret, "creating root directory");
	if (ret)
		goto err;

	bch2_inode_init_early(c, &lostfound_inode);

	ret = bch2_trans_do(c, NULL, NULL, 0,
		bch2_create_trans(trans,
				  BCACHEFS_ROOT_SUBVOL_INUM,
				  &root_inode, &lostfound_inode,
				  &lostfound,
				  0, 0, S_IFDIR|0700, 0,
				  NULL, NULL, (subvol_inum) { 0 }, 0));
	bch_err_msg(c, ret, "creating lost+found");
	if (ret)
		goto err;

	c->recovery_pass_done = ARRAY_SIZE(recovery_pass_fns) - 1;

	if (enabled_qtypes(c)) {
		ret = bch2_fs_quota_read(c);
		if (ret)
			goto err;
	}

	ret = bch2_journal_flush(&c->journal);
	bch_err_msg(c, ret, "writing first journal entry");
	if (ret)
		goto err;

	mutex_lock(&c->sb_lock);
	SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
	SET_BCH_SB_CLEAN(c->disk_sb.sb, false);

	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	return 0;
err:
	bch_err_fn(c, ret);
	return ret;
}