// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc_background.h"
#include "bkey_buf.h"
#include "btree_journal_iter.h"
#include "btree_node_scan.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "buckets.h"
#include "dirent.h"
#include "disk_accounting.h"
#include "errcode.h"
#include "error.h"
#include "fs-common.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "logged_ops.h"
#include "move.h"
#include "quota.h"
#include "rebalance.h"
#include "recovery.h"
#include "recovery_passes.h"
#include "replicas.h"
#include "sb-clean.h"
#include "sb-downgrade.h"
#include "snapshot.h"
#include "super-io.h"

#include <linux/sort.h>
#include <linux/stat.h>

#define QSTR(n) { { { .len = strlen(n) } }, .name = n }

void bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree)
{
	if (btree >= BTREE_ID_NR_MAX)
		return;

	u64 b = BIT_ULL(btree);

	if (!(c->sb.btrees_lost_data & b)) {
		bch_err(c, "flagging btree %s lost data", bch2_btree_id_str(btree));

		mutex_lock(&c->sb_lock);
		bch2_sb_field_get(c->disk_sb.sb, ext)->btrees_lost_data |= cpu_to_le64(b);
		bch2_write_super(c);
		mutex_unlock(&c->sb_lock);
	}
}

/* for -o reconstruct_alloc: */
static void bch2_reconstruct_alloc(struct bch_fs *c)
{
	bch2_journal_log_msg(c, "dropping alloc info");
	bch_info(c, "dropping and reconstructing all alloc info");

	mutex_lock(&c->sb_lock);
	struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);

	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required);
	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_info, ext->recovery_passes_required);
	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_lrus, ext->recovery_passes_required);
	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_extents_to_backpointers, ext->recovery_passes_required);
	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_to_lru_refs, ext->recovery_passes_required);

	__set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_alloc_key, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_stale_dirty_ptr, ext->errors_silent);

	__set_bit_le64(BCH_FSCK_ERR_dev_usage_buckets_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_dev_usage_sectors_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_dev_usage_fragmented_wrong, ext->errors_silent);

	__set_bit_le64(BCH_FSCK_ERR_fs_usage_btree_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_fs_usage_cached_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_fs_usage_replicas_wrong, ext->errors_silent);

	__set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_freespace_key_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_bucket_gens_key_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent);
	c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);

	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));

	bch2_shoot_down_journal_keys(c, BTREE_ID_alloc,
				     0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
	bch2_shoot_down_journal_keys(c, BTREE_ID_backpointers,
				     0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
	bch2_shoot_down_journal_keys(c, BTREE_ID_need_discard,
				     0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
	bch2_shoot_down_journal_keys(c, BTREE_ID_freespace,
				     0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
	bch2_shoot_down_journal_keys(c, BTREE_ID_bucket_gens,
				     0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
}
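
/*
 * A minimal sketch of what a bitmap helper like __set_bit_le64() above does -
 * the real definition lives elsewhere in the bcachefs tree, and the name
 * below is illustrative only. The destination is treated as an array of
 * little-endian 64-bit words, so the pass/error bits set above have a stable
 * on-disk layout regardless of host endianness:
 *
 *	static inline void set_bit_le64_sketch(size_t bit, __le64 *addr)
 *	{
 *		addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64));
 *	}
 */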

/*
 * Btree node pointers have a field to stash a pointer to the in memory btree
 * node; we need to zero out this field when reading in btree nodes, or when
 * reading in keys from the journal:
 */
static void zero_out_btree_mem_ptr(struct journal_keys *keys)
{
	darray_for_each(*keys, i)
		if (i->k->k.type == KEY_TYPE_btree_ptr_v2)
			bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0;
}

/* journal replay: */

static void replay_now_at(struct journal *j, u64 seq)
{
	BUG_ON(seq < j->replay_journal_seq);

	seq = min(seq, j->replay_journal_seq_end);

	while (j->replay_journal_seq < seq)
		bch2_journal_pin_put(j, j->replay_journal_seq++);
}

static int bch2_journal_replay_accounting_key(struct btree_trans *trans,
					      struct journal_key *k)
{
	struct btree_iter iter;
	bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
				  BTREE_MAX_DEPTH, k->level,
				  BTREE_ITER_intent);
	int ret = bch2_btree_iter_traverse(&iter);
	if (ret)
		goto out;

	struct bkey u;
	struct bkey_s_c old = bch2_btree_path_peek_slot(btree_iter_path(trans, &iter), &u);

	/* Has this delta already been applied to the btree? */
	if (bversion_cmp(old.k->bversion, k->k->k.bversion) >= 0) {
		ret = 0;
		goto out;
	}

	struct bkey_i *new = k->k;
	if (old.k->type == KEY_TYPE_accounting) {
		new = bch2_bkey_make_mut_noupdate(trans, bkey_i_to_s_c(k->k));
		ret = PTR_ERR_OR_ZERO(new);
		if (ret)
			goto out;

		bch2_accounting_accumulate(bkey_i_to_accounting(new),
					   bkey_s_c_to_accounting(old));
	}

	trans->journal_res.seq = k->journal_seq;

	ret = bch2_trans_update(trans, &iter, new, BTREE_TRIGGER_norun);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
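
/*
 * Note on the bversion check in bch2_journal_replay_accounting_key():
 * accounting keys carry deltas, not absolute values, so replay has to be
 * idempotent - a delta whose version is already reflected in the btree key
 * has been applied and gets skipped.
 */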

static int bch2_journal_replay_key(struct btree_trans *trans,
				   struct journal_key *k)
{
	struct btree_iter iter;
	unsigned iter_flags =
		BTREE_ITER_intent|
		BTREE_ITER_not_extents;
	unsigned update_flags = BTREE_TRIGGER_norun;
	int ret;

	if (k->overwritten)
		return 0;

	trans->journal_res.seq = k->journal_seq;

	/*
	 * BTREE_UPDATE_key_cache_reclaim disables key cache lookup/update to
	 * keep the key cache coherent with the underlying btree. Nothing
	 * besides the allocator is doing updates yet so we don't need key cache
	 * coherency for non-alloc btrees, and key cache fills for snapshots
	 * btrees use BTREE_ITER_filter_snapshots, which isn't available until
	 * the snapshots recovery pass runs.
	 */
	if (!k->level && k->btree_id == BTREE_ID_alloc)
		iter_flags |= BTREE_ITER_cached;
	else
		update_flags |= BTREE_UPDATE_key_cache_reclaim;

	bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
				  BTREE_MAX_DEPTH, k->level,
				  iter_flags);
	ret = bch2_btree_iter_traverse(&iter);
	if (ret)
		goto out;

	struct btree_path *path = btree_iter_path(trans, &iter);
	if (unlikely(!btree_path_node(path, k->level))) {
		bch2_trans_iter_exit(trans, &iter);
		bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
					  BTREE_MAX_DEPTH, 0, iter_flags);
		ret =   bch2_btree_iter_traverse(&iter) ?:
			bch2_btree_increase_depth(trans, iter.path, 0) ?:
			-BCH_ERR_transaction_restart_nested;
		goto out;
	}

	/* Must be checked with btree locked: */
	if (k->overwritten)
		goto out;

	if (k->k->k.type == KEY_TYPE_accounting) {
		ret = bch2_trans_update_buffered(trans, BTREE_ID_accounting, k->k);
		goto out;
	}

	ret = bch2_trans_update(trans, &iter, k->k, update_flags);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static int journal_sort_seq_cmp(const void *_l, const void *_r)
{
	const struct journal_key *l = *((const struct journal_key **)_l);
	const struct journal_key *r = *((const struct journal_key **)_r);

	/*
	 * Map 0 to U64_MAX, so that keys with journal_seq == 0 come last
	 *
	 * journal_seq == 0 means that the key comes from early repair, and
	 * should be inserted last so as to avoid overflowing the journal
	 */
	return cmp_int(l->journal_seq - 1, r->journal_seq - 1);
}
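
/*
 * A worked example of the "- 1" above, assuming journal_seq is a u64:
 * unsigned arithmetic wraps, so a repair key (journal_seq == 0) compares
 * greater than every key that actually came from the journal:
 *
 *	0ULL - 1 == U64_MAX, 42ULL - 1 == 41,
 *	so cmp_int(U64_MAX, 41) > 0 and the repair key sorts last.
 */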

int bch2_journal_replay(struct bch_fs *c)
{
	struct journal_keys *keys = &c->journal_keys;
	DARRAY(struct journal_key *) keys_sorted = { 0 };
	struct journal *j = &c->journal;
	u64 start_seq	= c->journal_replay_seq_start;
	u64 end_seq	= c->journal_replay_seq_end;
	struct btree_trans *trans = NULL;
	bool immediate_flush = false;
	int ret = 0;

	if (keys->nr) {
		ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)",
					   keys->nr, start_seq, end_seq);
		if (ret)
			goto err;
	}

	BUG_ON(!atomic_read(&keys->ref));

	move_gap(keys, keys->nr);
	trans = bch2_trans_get(c);

	/*
	 * Replay accounting keys first: we can't allow the write buffer to
	 * flush accounting keys until we're done
	 */
	darray_for_each(*keys, k) {
		if (!(k->k->k.type == KEY_TYPE_accounting && !k->allocated))
			continue;

		cond_resched();

		ret = commit_do(trans, NULL, NULL,
				BCH_TRANS_COMMIT_no_enospc|
				BCH_TRANS_COMMIT_journal_reclaim|
				BCH_TRANS_COMMIT_skip_accounting_apply|
				BCH_TRANS_COMMIT_no_journal_res|
				BCH_WATERMARK_reclaim,
			     bch2_journal_replay_accounting_key(trans, k));
		if (bch2_fs_fatal_err_on(ret, c, "error replaying accounting; %s", bch2_err_str(ret)))
			goto err;

		k->overwritten = true;
	}

	set_bit(BCH_FS_accounting_replay_done, &c->flags);

	/*
	 * First, attempt to replay keys in sorted order. This is more
	 * efficient - better locality of btree access - but some might fail if
	 * that would cause a journal deadlock.
	 */
	darray_for_each(*keys, k) {
		cond_resched();

		/*
		 * k->allocated means the key wasn't read in from the journal,
		 * rather it was from early repair code
		 */
		if (k->allocated)
			immediate_flush = true;

		/* Skip fastpath if we're low on space in the journal */
		ret = c->journal.watermark ? -1 :
			commit_do(trans, NULL, NULL,
				  BCH_TRANS_COMMIT_no_enospc|
				  BCH_TRANS_COMMIT_journal_reclaim|
				  BCH_TRANS_COMMIT_skip_accounting_apply|
				  (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0),
			     bch2_journal_replay_key(trans, k));
		BUG_ON(!ret && !k->overwritten && k->k->k.type != KEY_TYPE_accounting);
		if (ret) {
			ret = darray_push(&keys_sorted, k);
			if (ret)
				goto err;
		}
	}

	bch2_trans_unlock_long(trans);
	/*
	 * Now, replay any remaining keys in the order in which they appear in
	 * the journal, unpinning those journal entries as we go:
	 */
	sort(keys_sorted.data, keys_sorted.nr,
	     sizeof(keys_sorted.data[0]),
	     journal_sort_seq_cmp, NULL);

	darray_for_each(keys_sorted, kp) {
		cond_resched();

		struct journal_key *k = *kp;

		if (k->journal_seq)
			replay_now_at(j, k->journal_seq);
		else
			replay_now_at(j, j->replay_journal_seq_end);

		ret = commit_do(trans, NULL, NULL,
				BCH_TRANS_COMMIT_no_enospc|
				BCH_TRANS_COMMIT_skip_accounting_apply|
				(!k->allocated
				 ? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim
				 : 0),
			     bch2_journal_replay_key(trans, k));
		bch_err_msg(c, ret, "while replaying key at btree %s level %u:",
			    bch2_btree_id_str(k->btree_id), k->level);
		if (ret)
			goto err;

		BUG_ON(k->btree_id != BTREE_ID_accounting && !k->overwritten);
	}

	/*
	 * We need to put our btree_trans before calling flush_all_pins(), since
	 * that will use a btree_trans internally
	 */
	bch2_trans_put(trans);
	trans = NULL;

	if (!c->opts.retain_recovery_info &&
	    c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay)
		bch2_journal_keys_put_initial(c);

	replay_now_at(j, j->replay_journal_seq_end);
	j->replay_journal_seq = 0;

	bch2_journal_set_replay_done(j);

	/* if we did any repair, flush it immediately */
	if (immediate_flush) {
		bch2_journal_flush_all_pins(&c->journal);
		ret = bch2_journal_meta(&c->journal);
	}

	if (keys->nr)
		bch2_journal_log_msg(c, "journal replay finished");
err:
	if (trans)
		bch2_trans_put(trans);
	darray_exit(&keys_sorted);
	bch_err_fn(c, ret);
	return ret;
}

/* journal replay early: */

static int journal_replay_entry_early(struct bch_fs *c,
				      struct jset_entry *entry)
{
	int ret = 0;

	switch (entry->type) {
	case BCH_JSET_ENTRY_btree_root: {
		struct btree_root *r;

		if (fsck_err_on(entry->btree_id >= BTREE_ID_NR_MAX,
				c, invalid_btree_id,
				"invalid btree id %u (max %u)",
				entry->btree_id, BTREE_ID_NR_MAX))
			return 0;

		while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) {
			ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL });
			if (ret)
				return ret;
		}

		r = bch2_btree_id_root(c, entry->btree_id);

		if (entry->u64s) {
			r->level = entry->level;
			bkey_copy(&r->key, (struct bkey_i *) entry->start);
			r->error = 0;
		} else {
			r->error = -BCH_ERR_btree_node_read_error;
		}
		r->alive = true;
		break;
	}
	case BCH_JSET_ENTRY_usage: {
		struct jset_entry_usage *u =
			container_of(entry, struct jset_entry_usage, entry);

		switch (entry->btree_id) {
		case BCH_FS_USAGE_key_version:
			atomic64_set(&c->key_version, le64_to_cpu(u->v));
			break;
		}
		break;
	}
	case BCH_JSET_ENTRY_blacklist: {
		struct jset_entry_blacklist *bl_entry =
			container_of(entry, struct jset_entry_blacklist, entry);

		ret = bch2_journal_seq_blacklist_add(c,
				le64_to_cpu(bl_entry->seq),
				le64_to_cpu(bl_entry->seq) + 1);
		break;
	}
	case BCH_JSET_ENTRY_blacklist_v2: {
		struct jset_entry_blacklist_v2 *bl_entry =
			container_of(entry, struct jset_entry_blacklist_v2, entry);

		ret = bch2_journal_seq_blacklist_add(c,
				le64_to_cpu(bl_entry->start),
				le64_to_cpu(bl_entry->end) + 1);
		break;
	}
	case BCH_JSET_ENTRY_clock: {
		struct jset_entry_clock *clock =
			container_of(entry, struct jset_entry_clock, entry);

		atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time));
	}
	}
fsck_err:
	return ret;
}

static int journal_replay_early(struct bch_fs *c,
				struct bch_sb_field_clean *clean)
{
	if (clean) {
		for (struct jset_entry *entry = clean->start;
		     entry != vstruct_end(&clean->field);
		     entry = vstruct_next(entry)) {
			int ret = journal_replay_entry_early(c, entry);
			if (ret)
				return ret;
		}
	} else {
		struct genradix_iter iter;
		struct journal_replay *i, **_i;

		genradix_for_each(&c->journal_entries, iter, _i) {
			i = *_i;

			if (journal_replay_ignore(i))
				continue;

			vstruct_for_each(&i->j, entry) {
				int ret = journal_replay_entry_early(c, entry);
				if (ret)
					return ret;
			}
		}
	}

	return 0;
}

/* sb clean section: */

static int read_btree_roots(struct bch_fs *c)
{
	int ret = 0;

	for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (!r->alive)
			continue;

		if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc)
			continue;

		if (mustfix_fsck_err_on((ret = r->error),
					c, btree_root_bkey_invalid,
					"invalid btree root %s",
					bch2_btree_id_str(i)) ||
		    mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)),
					c, btree_root_read_error,
					"error reading btree root %s l=%u: %s",
					bch2_btree_id_str(i), r->level, bch2_err_str(ret))) {
			if (btree_id_is_alloc(i)) {
				c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations);
				c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info);
				c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus);
				c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers);
				c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs);
				c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
				r->error = 0;
			} else if (!(c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) {
				bch_info(c, "will run btree node scan");
				c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes);
				c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
			}

			ret = 0;
			bch2_btree_lost_data(c, i);
		}
	}

	for (unsigned i = 0; i < BTREE_ID_NR; i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (!r->b && !r->error) {
			r->alive = false;
			r->level = 0;
			bch2_btree_root_alloc_fake(c, i, 0);
		}
	}
fsck_err:
	return ret;
}
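
/*
 * Decide whether to bump the on-disk metadata version, and how far, based on
 * the version_upgrade option and on whether the current version is below the
 * required-upgrade floor; returns true if the superblock was modified and
 * needs to be written out by the caller.
 */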
static bool check_version_upgrade(struct bch_fs *c)
{
	unsigned latest_version	= bcachefs_metadata_version_current;
	unsigned latest_compatible = min(latest_version,
					 bch2_latest_compatible_version(c->sb.version));
	unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
	unsigned new_version = 0;

	if (old_version < bcachefs_metadata_required_upgrade_below) {
		if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible ||
		    latest_compatible < bcachefs_metadata_required_upgrade_below)
			new_version = latest_version;
		else
			new_version = latest_compatible;
	} else {
		switch (c->opts.version_upgrade) {
		case BCH_VERSION_UPGRADE_compatible:
			new_version = latest_compatible;
			break;
		case BCH_VERSION_UPGRADE_incompatible:
			new_version = latest_version;
			break;
		case BCH_VERSION_UPGRADE_none:
			new_version = min(old_version, latest_version);
			break;
		}
	}

	if (new_version > old_version) {
		struct printbuf buf = PRINTBUF;

		if (old_version < bcachefs_metadata_required_upgrade_below)
			prt_str(&buf, "Version upgrade required:\n");

		if (old_version != c->sb.version) {
			prt_str(&buf, "Version upgrade from ");
			bch2_version_to_text(&buf, c->sb.version_upgrade_complete);
			prt_str(&buf, " to ");
			bch2_version_to_text(&buf, c->sb.version);
			prt_str(&buf, " incomplete\n");
		}

		prt_printf(&buf, "Doing %s version upgrade from ",
			   BCH_VERSION_MAJOR(old_version) != BCH_VERSION_MAJOR(new_version)
			   ? "incompatible" : "compatible");
		bch2_version_to_text(&buf, old_version);
		prt_str(&buf, " to ");
		bch2_version_to_text(&buf, new_version);
		prt_newline(&buf);

		struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
		__le64 passes = ext->recovery_passes_required[0];
		bch2_sb_set_upgrade(c, old_version, new_version);
		passes = ext->recovery_passes_required[0] & ~passes;

		if (passes) {
			prt_str(&buf, "  running recovery passes: ");
			prt_bitflags(&buf, bch2_recovery_passes,
				     bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
		}

		bch_info(c, "%s", buf.buf);

		bch2_sb_upgrade(c, new_version);

		printbuf_exit(&buf);
		return true;
	}

	return false;
}
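
/*
 * A worked example with made-up version numbers: say version_upgrade_complete
 * is 1.3, bcachefs_metadata_required_upgrade_below is 1.4, this kernel is at
 * 1.7, and 1.6 is the newest version it can write compatibly. Because 1.3 is
 * below the floor, an upgrade is forced: to 1.6 normally, or to 1.7 if
 * version_upgrade=incompatible (or if even 1.6 would still be below the
 * floor). Above the floor, the option alone decides: compatible picks 1.6,
 * incompatible picks 1.7, none stays put.
 */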

int bch2_fs_recovery(struct bch_fs *c)
{
	struct bch_sb_field_clean *clean = NULL;
	struct jset *last_journal_entry = NULL;
	u64 last_seq = 0, blacklist_seq, journal_seq;
	int ret = 0;

	if (c->sb.clean) {
		clean = bch2_read_superblock_clean(c);
		ret = PTR_ERR_OR_ZERO(clean);
		if (ret)
			goto err;

		bch_info(c, "recovering from clean shutdown, journal seq %llu",
			 le64_to_cpu(clean->journal_seq));
	} else {
		bch_info(c, "recovering from unclean shutdown");
	}

	if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) {
		bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported");
		ret = -EINVAL;
		goto err;
	}

	if (!c->sb.clean &&
	    !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) {
		bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix");
		ret = -EINVAL;
		goto err;
	}

	if (c->opts.norecovery)
		c->opts.recovery_pass_last = BCH_RECOVERY_PASS_journal_replay - 1;

	mutex_lock(&c->sb_lock);
	struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
	bool write_sb = false;

	if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) {
		ext->recovery_passes_required[0] |=
			cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology)));
		write_sb = true;
	}

	u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
	if (sb_passes) {
		struct printbuf buf = PRINTBUF;
		prt_str(&buf, "superblock requires following recovery passes to be run:\n  ");
		prt_bitflags(&buf, bch2_recovery_passes, sb_passes);
		bch_info(c, "%s", buf.buf);
		printbuf_exit(&buf);
	}

	if (bch2_check_version_downgrade(c)) {
		struct printbuf buf = PRINTBUF;

		prt_str(&buf, "Version downgrade required:");

		__le64 passes = ext->recovery_passes_required[0];
		bch2_sb_set_downgrade(c,
				      BCH_VERSION_MINOR(bcachefs_metadata_version_current),
				      BCH_VERSION_MINOR(c->sb.version));
		passes = ext->recovery_passes_required[0] & ~passes;
		if (passes) {
			prt_str(&buf, "\n  running recovery passes: ");
			prt_bitflags(&buf, bch2_recovery_passes,
				     bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
		}

		bch_info(c, "%s", buf.buf);
		printbuf_exit(&buf);
		write_sb = true;
	}

	if (check_version_upgrade(c))
		write_sb = true;

	c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));

	if (write_sb)
		bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
		c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);

	if (c->opts.fsck)
		set_bit(BCH_FS_fsck_running, &c->flags);
	if (c->sb.clean)
		set_bit(BCH_FS_clean_recovery, &c->flags);

	ret = bch2_blacklist_table_initialize(c);
	if (ret) {
		bch_err(c, "error initializing blacklist table");
		goto err;
	}

	bch2_journal_pos_from_member_info_resume(c);

	if (!c->sb.clean || c->opts.retain_recovery_info) {
		struct genradix_iter iter;
		struct journal_replay **i;

		bch_verbose(c, "starting journal read");
		ret = bch2_journal_read(c, &last_seq, &blacklist_seq, &journal_seq);
		if (ret)
			goto err;

		/*
		 * note: cmd_list_journal needs the blacklist table fully up to date so
		 * it can asterisk ignored journal entries:
		 */
		if (c->opts.read_journal_only)
			goto out;

		genradix_for_each_reverse(&c->journal_entries, iter, i)
			if (!journal_replay_ignore(*i)) {
				last_journal_entry = &(*i)->j;
				break;
			}

		if (mustfix_fsck_err_on(c->sb.clean &&
					last_journal_entry &&
					!journal_entry_empty(last_journal_entry), c,
					clean_but_journal_not_empty,
					"filesystem marked clean but journal not empty")) {
			c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
			SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
			c->sb.clean = false;
		}

		if (!last_journal_entry) {
			fsck_err_on(!c->sb.clean, c,
				    dirty_but_no_journal_entries,
				    "no journal entries found");
			if (clean)
				goto use_clean;

			genradix_for_each_reverse(&c->journal_entries, iter, i)
				if (*i) {
					last_journal_entry = &(*i)->j;
					(*i)->ignore_blacklisted = false;
					(*i)->ignore_not_dirty = false;
					/*
					 * This was probably a NO_FLUSH entry,
					 * so last_seq was garbage - but we know
					 * we're only using a single journal
					 * entry, set it here:
					 */
					(*i)->j.last_seq = (*i)->j.seq;
					break;
				}
		}

		ret = bch2_journal_keys_sort(c);
		if (ret)
			goto err;

		if (c->sb.clean && last_journal_entry) {
			ret = bch2_verify_superblock_clean(c, &clean,
							   last_journal_entry);
			if (ret)
				goto err;
		}
	} else {
use_clean:
		if (!clean) {
			bch_err(c, "no superblock clean section found");
			ret = -BCH_ERR_fsck_repair_impossible;
			goto err;
		}
		blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1;
	}

	c->journal_replay_seq_start	= last_seq;
	c->journal_replay_seq_end	= blacklist_seq - 1;

	if (c->opts.reconstruct_alloc)
		bch2_reconstruct_alloc(c);

	zero_out_btree_mem_ptr(&c->journal_keys);

	ret = journal_replay_early(c, clean);
	if (ret)
		goto err;

	/*
	 * After an unclean shutdown, skip the next few journal sequence
	 * numbers as they may have been referenced by btree writes that
	 * happened before their corresponding journal writes - those btree
	 * writes need to be ignored, by skipping and blacklisting the next few
	 * journal sequence numbers:
	 */
	if (!c->sb.clean)
		journal_seq += 8;

	if (blacklist_seq != journal_seq) {
		ret =   bch2_journal_log_msg(c, "blacklisting entries %llu-%llu",
					     blacklist_seq, journal_seq) ?:
			bch2_journal_seq_blacklist_add(c,
						       blacklist_seq, journal_seq);
		if (ret) {
			bch_err_msg(c, ret, "error creating new journal seq blacklist entry");
			goto err;
		}
	}
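
	/*
	 * A worked example with made-up sequence numbers: suppose
	 * bch2_journal_read() returned blacklist_seq == journal_seq == 101.
	 * After an unclean shutdown journal_seq is bumped to 109, seqs 101-108
	 * get blacklisted (btree node writes may reference them even though
	 * their journal entries never made it to disk), and the journal
	 * resumes at seq 109.
	 */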

	ret =   bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu",
				     journal_seq, last_seq, blacklist_seq - 1) ?:
		bch2_fs_journal_start(&c->journal, journal_seq);
	if (ret)
		goto err;

	/*
	 * Skip past versions that might have possibly been used (as nonces),
	 * but hadn't had their pointers written:
	 */
	if (c->sb.encryption_type && !c->sb.clean)
		atomic64_add(1 << 16, &c->key_version);

	ret = read_btree_roots(c);
	if (ret)
		goto err;

	set_bit(BCH_FS_btree_running, &c->flags);

	ret = bch2_sb_set_upgrade_extra(c);

	ret = bch2_run_recovery_passes(c);
	if (ret)
		goto err;

	clear_bit(BCH_FS_fsck_running, &c->flags);

	/* in case we don't run journal replay, i.e. norecovery mode */
	set_bit(BCH_FS_accounting_replay_done, &c->flags);

	/* fsync if we fixed errors */
	if (test_bit(BCH_FS_errors_fixed, &c->flags) &&
	    bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) {
		bch2_journal_flush_all_pins(&c->journal);
		bch2_journal_meta(&c->journal);
		bch2_write_ref_put(c, BCH_WRITE_REF_fsync);
	}

	/* If we fixed errors, verify that fs is actually clean now: */
	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
	    test_bit(BCH_FS_errors_fixed, &c->flags) &&
	    !test_bit(BCH_FS_errors_not_fixed, &c->flags) &&
	    !test_bit(BCH_FS_error, &c->flags)) {
		bch2_flush_fsck_errs(c);

		bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean");
		clear_bit(BCH_FS_errors_fixed, &c->flags);

		c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;

		ret = bch2_run_recovery_passes(c);
		if (ret)
			goto err;

		if (test_bit(BCH_FS_errors_fixed, &c->flags) ||
		    test_bit(BCH_FS_errors_not_fixed, &c->flags)) {
			bch_err(c, "Second fsck run was not clean");
			set_bit(BCH_FS_errors_not_fixed, &c->flags);
		}

		set_bit(BCH_FS_errors_fixed, &c->flags);
	}

	if (enabled_qtypes(c)) {
		bch_verbose(c, "reading quotas");
		ret = bch2_fs_quota_read(c);
		if (ret)
			goto err;
		bch_verbose(c, "quotas done");
	}

	mutex_lock(&c->sb_lock);
	ext = bch2_sb_field_get(c->disk_sb.sb, ext);
	write_sb = false;

	if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) {
		SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, le16_to_cpu(c->disk_sb.sb->version));
		write_sb = true;
	}

	if (!test_bit(BCH_FS_error, &c->flags) &&
	    !(c->disk_sb.sb->compat[0] & cpu_to_le64(1ULL << BCH_COMPAT_alloc_info))) {
		c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
		write_sb = true;
	}

	if (!test_bit(BCH_FS_error, &c->flags) &&
	    !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent))) {
		memset(ext->errors_silent, 0, sizeof(ext->errors_silent));
		write_sb = true;
	}

	if (c->opts.fsck &&
	    !test_bit(BCH_FS_error, &c->flags) &&
	    c->recovery_pass_done == BCH_RECOVERY_PASS_NR - 1 &&
	    ext->btrees_lost_data) {
		ext->btrees_lost_data = 0;
		write_sb = true;
	}

	if (c->opts.fsck &&
	    !test_bit(BCH_FS_error, &c->flags) &&
	    !test_bit(BCH_FS_errors_not_fixed, &c->flags)) {
		SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0);
		SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0);
		write_sb = true;
	}

	if (bch2_blacklist_entries_gc(c))
		write_sb = true;

	if (write_sb)
		bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
	    c->sb.version_min < bcachefs_metadata_version_btree_ptr_sectors_written) {
		struct bch_move_stats stats;

		bch2_move_stats_init(&stats, "recovery");

		struct printbuf buf = PRINTBUF;
		bch2_version_to_text(&buf, c->sb.version_min);
		bch_info(c, "scanning for old btree nodes: min_version %s", buf.buf);
		printbuf_exit(&buf);

		ret =   bch2_fs_read_write_early(c) ?:
			bch2_scan_old_btree_nodes(c, &stats);
		if (ret)
			goto err;
		bch_info(c, "scanning for old btree nodes done");
	}

	ret = 0;
out:
	bch2_flush_fsck_errs(c);

	if (!c->opts.retain_recovery_info) {
		bch2_journal_keys_put_initial(c);
		bch2_find_btree_nodes_exit(&c->found_btree_nodes);
	}

	if (!IS_ERR(clean))
		kfree(clean);

	if (!ret &&
	    test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) &&
	    !c->opts.nochanges) {
		bch2_fs_read_write_early(c);
		bch2_delete_dead_snapshots_async(c);
	}

	bch_err_fn(c, ret);
	return ret;
err:
fsck_err:
	bch2_fs_emergency_read_only(c);
	goto out;
}

int bch2_fs_initialize(struct bch_fs *c)
{
	struct bch_inode_unpacked root_inode, lostfound_inode;
	struct bkey_inode_buf packed_inode;
	struct qstr lostfound = QSTR("lost+found");
	int ret;

	bch_notice(c, "initializing new filesystem");
	set_bit(BCH_FS_new_fs, &c->flags);

	mutex_lock(&c->sb_lock);
	c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
	c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);

	bch2_check_version_downgrade(c);

	if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) {
		bch2_sb_upgrade(c, bcachefs_metadata_version_current);
		SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
		bch2_write_super(c);
	}
	mutex_unlock(&c->sb_lock);

	c->curr_recovery_pass = BCH_RECOVERY_PASS_NR;
	set_bit(BCH_FS_btree_running, &c->flags);
	set_bit(BCH_FS_may_go_rw, &c->flags);

	for (unsigned i = 0; i < BTREE_ID_NR; i++)
		bch2_btree_root_alloc_fake(c, i, 0);

	ret = bch2_fs_journal_alloc(c);
	if (ret)
		goto err;

	/*
	 * journal_res_get() will crash if called before this has
	 * set up the journal.pin FIFO and journal.cur pointer:
	 */
	bch2_fs_journal_start(&c->journal, 1);
	set_bit(BCH_FS_accounting_replay_done, &c->flags);
	bch2_journal_set_replay_done(&c->journal);

	ret = bch2_fs_read_write_early(c);
	if (ret)
		goto err;

	for_each_member_device(c, ca) {
		ret = bch2_dev_usage_init(ca, false);
		if (ret) {
			bch2_dev_put(ca);
			goto err;
		}
	}

	/*
	 * Write out the superblock and journal buckets, now that we can do
	 * btree updates
	 */
	bch_verbose(c, "marking superblocks");
	ret = bch2_trans_mark_dev_sbs(c);
	bch_err_msg(c, ret, "marking superblocks");
	if (ret)
		goto err;

	for_each_online_member(c, ca)
		ca->new_fs_bucket_idx = 0;

	ret = bch2_fs_freespace_init(c);
	if (ret)
		goto err;

	ret = bch2_initialize_subvolumes(c);
	if (ret)
		goto err;

	bch_verbose(c, "reading snapshots table");
	ret = bch2_snapshots_read(c);
	if (ret)
		goto err;
	bch_verbose(c, "reading snapshots done");

	bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 0, NULL);
	root_inode.bi_inum	= BCACHEFS_ROOT_INO;
	root_inode.bi_subvol	= BCACHEFS_ROOT_SUBVOL;
	bch2_inode_pack(&packed_inode, &root_inode);
	packed_inode.inode.k.p.snapshot = U32_MAX;

	ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0, 0);
	bch_err_msg(c, ret, "creating root directory");
	if (ret)
		goto err;

	bch2_inode_init_early(c, &lostfound_inode);

	ret = bch2_trans_do(c, NULL, NULL, 0,
		bch2_create_trans(trans,
				  BCACHEFS_ROOT_SUBVOL_INUM,
				  &root_inode, &lostfound_inode,
				  &lostfound,
				  0, 0, S_IFDIR|0700, 0,
				  NULL, NULL, (subvol_inum) { 0 }, 0));
	bch_err_msg(c, ret, "creating lost+found");
	if (ret)
		goto err;

	c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1;

	if (enabled_qtypes(c)) {
		ret = bch2_fs_quota_read(c);
		if (ret)
			goto err;
	}

	ret = bch2_journal_flush(&c->journal);
	bch_err_msg(c, ret, "writing first journal entry");
	if (ret)
		goto err;

	mutex_lock(&c->sb_lock);
	SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
	SET_BCH_SB_CLEAN(c->disk_sb.sb, false);

	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	return 0;
err:
	bch_err_fn(c, ret);
	return ret;
}