1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "bcachefs.h" 4 #include "alloc_background.h" 5 #include "bkey_buf.h" 6 #include "btree_journal_iter.h" 7 #include "btree_node_scan.h" 8 #include "btree_update.h" 9 #include "btree_update_interior.h" 10 #include "btree_io.h" 11 #include "buckets.h" 12 #include "dirent.h" 13 #include "disk_accounting.h" 14 #include "errcode.h" 15 #include "error.h" 16 #include "fs-common.h" 17 #include "journal_io.h" 18 #include "journal_reclaim.h" 19 #include "journal_seq_blacklist.h" 20 #include "logged_ops.h" 21 #include "move.h" 22 #include "quota.h" 23 #include "rebalance.h" 24 #include "recovery.h" 25 #include "recovery_passes.h" 26 #include "replicas.h" 27 #include "sb-clean.h" 28 #include "sb-downgrade.h" 29 #include "snapshot.h" 30 #include "super-io.h" 31 32 #include <linux/sort.h> 33 #include <linux/stat.h> 34 35 #define QSTR(n) { { { .len = strlen(n) } }, .name = n } 36 37 int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) 38 { 39 u64 b = BIT_ULL(btree); 40 int ret = 0; 41 42 mutex_lock(&c->sb_lock); 43 struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); 44 45 if (!(c->sb.btrees_lost_data & b)) { 46 struct printbuf buf = PRINTBUF; 47 bch2_btree_id_to_text(&buf, btree); 48 bch_err(c, "flagging btree %s lost data", buf.buf); 49 printbuf_exit(&buf); 50 ext->btrees_lost_data |= cpu_to_le64(b); 51 } 52 53 /* Once we have runtime self healing for topology errors we won't need this: */ 54 ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_topology) ?: ret; 55 56 /* Btree node accounting will be off: */ 57 __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent); 58 ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; 59 60 #ifdef CONFIG_BCACHEFS_DEBUG 61 /* 62 * These are much more minor, and don't need to be corrected right away, 63 * but in debug mode we want the next fsck run to be clean: 64 */ 65 ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_lrus) ?: ret; 66 ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_backpointers_to_extents) ?: ret; 67 #endif 68 69 switch (btree) { 70 case BTREE_ID_alloc: 71 ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; 72 73 __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); 74 __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); 75 __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); 76 __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent); 77 __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent); 78 __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); 79 goto out; 80 case BTREE_ID_backpointers: 81 ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_btree_backpointers) ?: ret; 82 ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_extents_to_backpointers) ?: ret; 83 goto out; 84 case BTREE_ID_need_discard: 85 ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; 86 goto out; 87 case BTREE_ID_freespace: 88 ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; 89 goto out; 90 case BTREE_ID_bucket_gens: 91 ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; 92 goto out; 93 case BTREE_ID_lru: 94 ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; 95 goto out; 96 case BTREE_ID_accounting: 97 ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; 98 goto out; 99 default: 100 ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret; 101 goto out; 102 } 103 out: 104 bch2_write_super(c); 105 mutex_unlock(&c->sb_lock); 106 107 return ret; 108 } 109 110 static void kill_btree(struct bch_fs *c, enum btree_id btree) 111 { 112 bch2_btree_id_root(c, btree)->alive = false; 113 bch2_shoot_down_journal_keys(c, btree, 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); 114 } 115 116 /* for -o reconstruct_alloc: */ 117 static void bch2_reconstruct_alloc(struct bch_fs *c) 118 { 119 bch2_journal_log_msg(c, "dropping alloc info"); 120 bch_info(c, "dropping and reconstructing all alloc info"); 121 122 mutex_lock(&c->sb_lock); 123 struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); 124 125 __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required); 126 __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_info, ext->recovery_passes_required); 127 __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_lrus, ext->recovery_passes_required); 128 __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_extents_to_backpointers, ext->recovery_passes_required); 129 __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_to_lru_refs, ext->recovery_passes_required); 130 131 __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_alloc_key, ext->errors_silent); 132 __set_bit_le64(BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, ext->errors_silent); 133 __set_bit_le64(BCH_FSCK_ERR_stale_dirty_ptr, ext->errors_silent); 134 135 __set_bit_le64(BCH_FSCK_ERR_dev_usage_buckets_wrong, ext->errors_silent); 136 __set_bit_le64(BCH_FSCK_ERR_dev_usage_sectors_wrong, ext->errors_silent); 137 __set_bit_le64(BCH_FSCK_ERR_dev_usage_fragmented_wrong, ext->errors_silent); 138 139 __set_bit_le64(BCH_FSCK_ERR_fs_usage_btree_wrong, ext->errors_silent); 140 __set_bit_le64(BCH_FSCK_ERR_fs_usage_cached_wrong, ext->errors_silent); 141 __set_bit_le64(BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, ext->errors_silent); 142 __set_bit_le64(BCH_FSCK_ERR_fs_usage_replicas_wrong, ext->errors_silent); 143 144 __set_bit_le64(BCH_FSCK_ERR_alloc_key_to_missing_lru_entry, ext->errors_silent); 145 146 __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); 147 __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); 148 __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); 149 __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent); 150 __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent); 151 __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); 152 __set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong, ext->errors_silent); 153 __set_bit_le64(BCH_FSCK_ERR_freespace_key_wrong, ext->errors_silent); 154 __set_bit_le64(BCH_FSCK_ERR_bucket_gens_key_wrong, ext->errors_silent); 155 __set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing, ext->errors_silent); 156 __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer, ext->errors_silent); 157 __set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent); 158 __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent); 159 c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); 160 161 c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); 162 163 bch2_write_super(c); 164 mutex_unlock(&c->sb_lock); 165 166 for (unsigned i = 0; i < btree_id_nr_alive(c); i++) 167 if (btree_id_is_alloc(i)) 168 kill_btree(c, i); 169 } 170 171 /* 172 * Btree node pointers have a field to stack a pointer to the in memory btree 173 * node; we need to zero out this field when reading in btree nodes, or when 174 * reading in keys from the journal: 175 */ 176 static void zero_out_btree_mem_ptr(struct journal_keys *keys) 177 { 178 darray_for_each(*keys, i) 179 if (i->k->k.type == KEY_TYPE_btree_ptr_v2) 180 bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0; 181 } 182 183 /* journal replay: */ 184 185 static void replay_now_at(struct journal *j, u64 seq) 186 { 187 BUG_ON(seq < j->replay_journal_seq); 188 189 seq = min(seq, j->replay_journal_seq_end); 190 191 while (j->replay_journal_seq < seq) 192 bch2_journal_pin_put(j, j->replay_journal_seq++); 193 } 194 195 static int bch2_journal_replay_accounting_key(struct btree_trans *trans, 196 struct journal_key *k) 197 { 198 struct btree_iter iter; 199 bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, 200 BTREE_MAX_DEPTH, k->level, 201 BTREE_ITER_intent); 202 int ret = bch2_btree_iter_traverse(&iter); 203 if (ret) 204 goto out; 205 206 struct bkey u; 207 struct bkey_s_c old = bch2_btree_path_peek_slot(btree_iter_path(trans, &iter), &u); 208 209 /* Has this delta already been applied to the btree? */ 210 if (bversion_cmp(old.k->bversion, k->k->k.bversion) >= 0) { 211 ret = 0; 212 goto out; 213 } 214 215 struct bkey_i *new = k->k; 216 if (old.k->type == KEY_TYPE_accounting) { 217 new = bch2_bkey_make_mut_noupdate(trans, bkey_i_to_s_c(k->k)); 218 ret = PTR_ERR_OR_ZERO(new); 219 if (ret) 220 goto out; 221 222 bch2_accounting_accumulate(bkey_i_to_accounting(new), 223 bkey_s_c_to_accounting(old)); 224 } 225 226 trans->journal_res.seq = k->journal_seq; 227 228 ret = bch2_trans_update(trans, &iter, new, BTREE_TRIGGER_norun); 229 out: 230 bch2_trans_iter_exit(trans, &iter); 231 return ret; 232 } 233 234 static int bch2_journal_replay_key(struct btree_trans *trans, 235 struct journal_key *k) 236 { 237 struct btree_iter iter; 238 unsigned iter_flags = 239 BTREE_ITER_intent| 240 BTREE_ITER_not_extents; 241 unsigned update_flags = BTREE_TRIGGER_norun; 242 int ret; 243 244 if (k->overwritten) 245 return 0; 246 247 trans->journal_res.seq = k->journal_seq; 248 249 /* 250 * BTREE_UPDATE_key_cache_reclaim disables key cache lookup/update to 251 * keep the key cache coherent with the underlying btree. Nothing 252 * besides the allocator is doing updates yet so we don't need key cache 253 * coherency for non-alloc btrees, and key cache fills for snapshots 254 * btrees use BTREE_ITER_filter_snapshots, which isn't available until 255 * the snapshots recovery pass runs. 256 */ 257 if (!k->level && k->btree_id == BTREE_ID_alloc) 258 iter_flags |= BTREE_ITER_cached; 259 else 260 update_flags |= BTREE_UPDATE_key_cache_reclaim; 261 262 bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, 263 BTREE_MAX_DEPTH, k->level, 264 iter_flags); 265 ret = bch2_btree_iter_traverse(&iter); 266 if (ret) 267 goto out; 268 269 struct btree_path *path = btree_iter_path(trans, &iter); 270 if (unlikely(!btree_path_node(path, k->level))) { 271 bch2_trans_iter_exit(trans, &iter); 272 bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, 273 BTREE_MAX_DEPTH, 0, iter_flags); 274 ret = bch2_btree_iter_traverse(&iter) ?: 275 bch2_btree_increase_depth(trans, iter.path, 0) ?: 276 -BCH_ERR_transaction_restart_nested; 277 goto out; 278 } 279 280 /* Must be checked with btree locked: */ 281 if (k->overwritten) 282 goto out; 283 284 if (k->k->k.type == KEY_TYPE_accounting) { 285 ret = bch2_trans_update_buffered(trans, BTREE_ID_accounting, k->k); 286 goto out; 287 } 288 289 ret = bch2_trans_update(trans, &iter, k->k, update_flags); 290 out: 291 bch2_trans_iter_exit(trans, &iter); 292 return ret; 293 } 294 295 static int journal_sort_seq_cmp(const void *_l, const void *_r) 296 { 297 const struct journal_key *l = *((const struct journal_key **)_l); 298 const struct journal_key *r = *((const struct journal_key **)_r); 299 300 /* 301 * Map 0 to U64_MAX, so that keys with journal_seq === 0 come last 302 * 303 * journal_seq == 0 means that the key comes from early repair, and 304 * should be inserted last so as to avoid overflowing the journal 305 */ 306 return cmp_int(l->journal_seq - 1, r->journal_seq - 1); 307 } 308 309 int bch2_journal_replay(struct bch_fs *c) 310 { 311 struct journal_keys *keys = &c->journal_keys; 312 DARRAY(struct journal_key *) keys_sorted = { 0 }; 313 struct journal *j = &c->journal; 314 u64 start_seq = c->journal_replay_seq_start; 315 u64 end_seq = c->journal_replay_seq_start; 316 struct btree_trans *trans = NULL; 317 bool immediate_flush = false; 318 int ret = 0; 319 320 if (keys->nr) { 321 ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)", 322 keys->nr, start_seq, end_seq); 323 if (ret) 324 goto err; 325 } 326 327 BUG_ON(!atomic_read(&keys->ref)); 328 329 move_gap(keys, keys->nr); 330 trans = bch2_trans_get(c); 331 332 /* 333 * Replay accounting keys first: we can't allow the write buffer to 334 * flush accounting keys until we're done 335 */ 336 darray_for_each(*keys, k) { 337 if (!(k->k->k.type == KEY_TYPE_accounting && !k->allocated)) 338 continue; 339 340 cond_resched(); 341 342 ret = commit_do(trans, NULL, NULL, 343 BCH_TRANS_COMMIT_no_enospc| 344 BCH_TRANS_COMMIT_journal_reclaim| 345 BCH_TRANS_COMMIT_skip_accounting_apply| 346 BCH_TRANS_COMMIT_no_journal_res| 347 BCH_WATERMARK_reclaim, 348 bch2_journal_replay_accounting_key(trans, k)); 349 if (bch2_fs_fatal_err_on(ret, c, "error replaying accounting; %s", bch2_err_str(ret))) 350 goto err; 351 352 k->overwritten = true; 353 } 354 355 set_bit(BCH_FS_accounting_replay_done, &c->flags); 356 357 /* 358 * First, attempt to replay keys in sorted order. This is more 359 * efficient - better locality of btree access - but some might fail if 360 * that would cause a journal deadlock. 361 */ 362 darray_for_each(*keys, k) { 363 cond_resched(); 364 365 /* 366 * k->allocated means the key wasn't read in from the journal, 367 * rather it was from early repair code 368 */ 369 if (k->allocated) 370 immediate_flush = true; 371 372 /* Skip fastpath if we're low on space in the journal */ 373 ret = c->journal.watermark ? -1 : 374 commit_do(trans, NULL, NULL, 375 BCH_TRANS_COMMIT_no_enospc| 376 BCH_TRANS_COMMIT_journal_reclaim| 377 BCH_TRANS_COMMIT_skip_accounting_apply| 378 (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0), 379 bch2_journal_replay_key(trans, k)); 380 BUG_ON(!ret && !k->overwritten && k->k->k.type != KEY_TYPE_accounting); 381 if (ret) { 382 ret = darray_push(&keys_sorted, k); 383 if (ret) 384 goto err; 385 } 386 } 387 388 bch2_trans_unlock_long(trans); 389 /* 390 * Now, replay any remaining keys in the order in which they appear in 391 * the journal, unpinning those journal entries as we go: 392 */ 393 sort(keys_sorted.data, keys_sorted.nr, 394 sizeof(keys_sorted.data[0]), 395 journal_sort_seq_cmp, NULL); 396 397 darray_for_each(keys_sorted, kp) { 398 cond_resched(); 399 400 struct journal_key *k = *kp; 401 402 if (k->journal_seq) 403 replay_now_at(j, k->journal_seq); 404 else 405 replay_now_at(j, j->replay_journal_seq_end); 406 407 ret = commit_do(trans, NULL, NULL, 408 BCH_TRANS_COMMIT_no_enospc| 409 BCH_TRANS_COMMIT_skip_accounting_apply| 410 (!k->allocated 411 ? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim 412 : 0), 413 bch2_journal_replay_key(trans, k)); 414 if (ret) { 415 struct printbuf buf = PRINTBUF; 416 bch2_btree_id_level_to_text(&buf, k->btree_id, k->level); 417 bch_err_msg(c, ret, "while replaying key at %s:", buf.buf); 418 printbuf_exit(&buf); 419 goto err; 420 } 421 422 BUG_ON(k->btree_id != BTREE_ID_accounting && !k->overwritten); 423 } 424 425 /* 426 * We need to put our btree_trans before calling flush_all_pins(), since 427 * that will use a btree_trans internally 428 */ 429 bch2_trans_put(trans); 430 trans = NULL; 431 432 if (!c->opts.retain_recovery_info && 433 c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) 434 bch2_journal_keys_put_initial(c); 435 436 replay_now_at(j, j->replay_journal_seq_end); 437 j->replay_journal_seq = 0; 438 439 bch2_journal_set_replay_done(j); 440 441 /* if we did any repair, flush it immediately */ 442 if (immediate_flush) { 443 bch2_journal_flush_all_pins(&c->journal); 444 ret = bch2_journal_meta(&c->journal); 445 } 446 447 if (keys->nr) 448 bch2_journal_log_msg(c, "journal replay finished"); 449 err: 450 if (trans) 451 bch2_trans_put(trans); 452 darray_exit(&keys_sorted); 453 bch_err_fn(c, ret); 454 return ret; 455 } 456 457 /* journal replay early: */ 458 459 static int journal_replay_entry_early(struct bch_fs *c, 460 struct jset_entry *entry) 461 { 462 int ret = 0; 463 464 switch (entry->type) { 465 case BCH_JSET_ENTRY_btree_root: { 466 467 if (unlikely(!entry->u64s)) 468 return 0; 469 470 if (fsck_err_on(entry->btree_id >= BTREE_ID_NR_MAX, 471 c, invalid_btree_id, 472 "invalid btree id %u (max %u)", 473 entry->btree_id, BTREE_ID_NR_MAX)) 474 return 0; 475 476 while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) { 477 ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL }); 478 if (ret) 479 return ret; 480 } 481 482 struct btree_root *r = bch2_btree_id_root(c, entry->btree_id); 483 484 r->level = entry->level; 485 bkey_copy(&r->key, (struct bkey_i *) entry->start); 486 r->error = 0; 487 r->alive = true; 488 break; 489 } 490 case BCH_JSET_ENTRY_usage: { 491 struct jset_entry_usage *u = 492 container_of(entry, struct jset_entry_usage, entry); 493 494 switch (entry->btree_id) { 495 case BCH_FS_USAGE_key_version: 496 atomic64_set(&c->key_version, le64_to_cpu(u->v)); 497 break; 498 } 499 break; 500 } 501 case BCH_JSET_ENTRY_blacklist: { 502 struct jset_entry_blacklist *bl_entry = 503 container_of(entry, struct jset_entry_blacklist, entry); 504 505 ret = bch2_journal_seq_blacklist_add(c, 506 le64_to_cpu(bl_entry->seq), 507 le64_to_cpu(bl_entry->seq) + 1); 508 break; 509 } 510 case BCH_JSET_ENTRY_blacklist_v2: { 511 struct jset_entry_blacklist_v2 *bl_entry = 512 container_of(entry, struct jset_entry_blacklist_v2, entry); 513 514 ret = bch2_journal_seq_blacklist_add(c, 515 le64_to_cpu(bl_entry->start), 516 le64_to_cpu(bl_entry->end) + 1); 517 break; 518 } 519 case BCH_JSET_ENTRY_clock: { 520 struct jset_entry_clock *clock = 521 container_of(entry, struct jset_entry_clock, entry); 522 523 atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time)); 524 } 525 } 526 fsck_err: 527 return ret; 528 } 529 530 static int journal_replay_early(struct bch_fs *c, 531 struct bch_sb_field_clean *clean) 532 { 533 if (clean) { 534 for (struct jset_entry *entry = clean->start; 535 entry != vstruct_end(&clean->field); 536 entry = vstruct_next(entry)) { 537 int ret = journal_replay_entry_early(c, entry); 538 if (ret) 539 return ret; 540 } 541 } else { 542 struct genradix_iter iter; 543 struct journal_replay *i, **_i; 544 545 genradix_for_each(&c->journal_entries, iter, _i) { 546 i = *_i; 547 548 if (journal_replay_ignore(i)) 549 continue; 550 551 vstruct_for_each(&i->j, entry) { 552 int ret = journal_replay_entry_early(c, entry); 553 if (ret) 554 return ret; 555 } 556 } 557 } 558 559 return 0; 560 } 561 562 /* sb clean section: */ 563 564 static int read_btree_roots(struct bch_fs *c) 565 { 566 struct printbuf buf = PRINTBUF; 567 int ret = 0; 568 569 for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { 570 struct btree_root *r = bch2_btree_id_root(c, i); 571 572 if (!r->alive) 573 continue; 574 575 printbuf_reset(&buf); 576 bch2_btree_id_level_to_text(&buf, i, r->level); 577 578 if (mustfix_fsck_err_on((ret = r->error), 579 c, btree_root_bkey_invalid, 580 "invalid btree root %s", 581 buf.buf) || 582 mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)), 583 c, btree_root_read_error, 584 "error reading btree root %s: %s", 585 buf.buf, bch2_err_str(ret))) { 586 if (btree_id_is_alloc(i)) 587 r->error = 0; 588 589 ret = bch2_btree_lost_data(c, i); 590 BUG_ON(ret); 591 } 592 } 593 594 for (unsigned i = 0; i < BTREE_ID_NR; i++) { 595 struct btree_root *r = bch2_btree_id_root(c, i); 596 597 if (!r->b && !r->error) { 598 r->alive = false; 599 r->level = 0; 600 bch2_btree_root_alloc_fake(c, i, 0); 601 } 602 } 603 fsck_err: 604 printbuf_exit(&buf); 605 return ret; 606 } 607 608 static bool check_version_upgrade(struct bch_fs *c) 609 { 610 unsigned latest_version = bcachefs_metadata_version_current; 611 unsigned latest_compatible = min(latest_version, 612 bch2_latest_compatible_version(c->sb.version)); 613 unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version; 614 unsigned new_version = 0; 615 bool ret = false; 616 617 if (old_version < bcachefs_metadata_required_upgrade_below) { 618 if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible || 619 latest_compatible < bcachefs_metadata_required_upgrade_below) 620 new_version = latest_version; 621 else 622 new_version = latest_compatible; 623 } else { 624 switch (c->opts.version_upgrade) { 625 case BCH_VERSION_UPGRADE_compatible: 626 new_version = latest_compatible; 627 break; 628 case BCH_VERSION_UPGRADE_incompatible: 629 new_version = latest_version; 630 break; 631 case BCH_VERSION_UPGRADE_none: 632 new_version = min(old_version, latest_version); 633 break; 634 } 635 } 636 637 if (new_version > old_version) { 638 struct printbuf buf = PRINTBUF; 639 640 if (old_version < bcachefs_metadata_required_upgrade_below) 641 prt_str(&buf, "Version upgrade required:\n"); 642 643 if (old_version != c->sb.version) { 644 prt_str(&buf, "Version upgrade from "); 645 bch2_version_to_text(&buf, c->sb.version_upgrade_complete); 646 prt_str(&buf, " to "); 647 bch2_version_to_text(&buf, c->sb.version); 648 prt_str(&buf, " incomplete\n"); 649 } 650 651 prt_printf(&buf, "Doing %s version upgrade from ", 652 BCH_VERSION_MAJOR(old_version) != BCH_VERSION_MAJOR(new_version) 653 ? "incompatible" : "compatible"); 654 bch2_version_to_text(&buf, old_version); 655 prt_str(&buf, " to "); 656 bch2_version_to_text(&buf, new_version); 657 prt_newline(&buf); 658 659 struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); 660 __le64 passes = ext->recovery_passes_required[0]; 661 bch2_sb_set_upgrade(c, old_version, new_version); 662 passes = ext->recovery_passes_required[0] & ~passes; 663 664 if (passes) { 665 prt_str(&buf, " running recovery passes: "); 666 prt_bitflags(&buf, bch2_recovery_passes, 667 bch2_recovery_passes_from_stable(le64_to_cpu(passes))); 668 } 669 670 bch_info(c, "%s", buf.buf); 671 printbuf_exit(&buf); 672 673 ret = true; 674 } 675 676 if (new_version > c->sb.version_incompat && 677 c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible) { 678 struct printbuf buf = PRINTBUF; 679 680 prt_str(&buf, "Now allowing incompatible features up to "); 681 bch2_version_to_text(&buf, new_version); 682 prt_str(&buf, ", previously allowed up to "); 683 bch2_version_to_text(&buf, c->sb.version_incompat_allowed); 684 prt_newline(&buf); 685 686 bch_info(c, "%s", buf.buf); 687 printbuf_exit(&buf); 688 689 ret = true; 690 } 691 692 if (ret) 693 bch2_sb_upgrade(c, new_version, 694 c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible); 695 696 return ret; 697 } 698 699 int bch2_fs_recovery(struct bch_fs *c) 700 { 701 struct bch_sb_field_clean *clean = NULL; 702 struct jset *last_journal_entry = NULL; 703 u64 last_seq = 0, blacklist_seq, journal_seq; 704 int ret = 0; 705 706 if (c->sb.clean) { 707 clean = bch2_read_superblock_clean(c); 708 ret = PTR_ERR_OR_ZERO(clean); 709 if (ret) 710 goto err; 711 712 bch_info(c, "recovering from clean shutdown, journal seq %llu", 713 le64_to_cpu(clean->journal_seq)); 714 } else { 715 bch_info(c, "recovering from unclean shutdown"); 716 } 717 718 if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) { 719 bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported"); 720 ret = -EINVAL; 721 goto err; 722 } 723 724 if (!c->sb.clean && 725 !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) { 726 bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix"); 727 ret = -EINVAL; 728 goto err; 729 } 730 731 if (c->opts.norecovery) { 732 c->opts.recovery_pass_last = c->opts.recovery_pass_last 733 ? min(c->opts.recovery_pass_last, BCH_RECOVERY_PASS_snapshots_read) 734 : BCH_RECOVERY_PASS_snapshots_read; 735 c->opts.nochanges = true; 736 c->opts.read_only = true; 737 } 738 739 mutex_lock(&c->sb_lock); 740 struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); 741 bool write_sb = false; 742 743 if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) { 744 ext->recovery_passes_required[0] |= 745 cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology))); 746 write_sb = true; 747 } 748 749 u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); 750 if (sb_passes) { 751 struct printbuf buf = PRINTBUF; 752 prt_str(&buf, "superblock requires following recovery passes to be run:\n "); 753 prt_bitflags(&buf, bch2_recovery_passes, sb_passes); 754 bch_info(c, "%s", buf.buf); 755 printbuf_exit(&buf); 756 } 757 758 if (bch2_check_version_downgrade(c)) { 759 struct printbuf buf = PRINTBUF; 760 761 prt_str(&buf, "Version downgrade required:"); 762 763 __le64 passes = ext->recovery_passes_required[0]; 764 bch2_sb_set_downgrade(c, 765 BCH_VERSION_MINOR(bcachefs_metadata_version_current), 766 BCH_VERSION_MINOR(c->sb.version)); 767 passes = ext->recovery_passes_required[0] & ~passes; 768 if (passes) { 769 prt_str(&buf, "\n running recovery passes: "); 770 prt_bitflags(&buf, bch2_recovery_passes, 771 bch2_recovery_passes_from_stable(le64_to_cpu(passes))); 772 } 773 774 bch_info(c, "%s", buf.buf); 775 printbuf_exit(&buf); 776 write_sb = true; 777 } 778 779 if (check_version_upgrade(c)) 780 write_sb = true; 781 782 c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); 783 784 if (c->sb.version_upgrade_complete < bcachefs_metadata_version_autofix_errors) { 785 SET_BCH_SB_ERROR_ACTION(c->disk_sb.sb, BCH_ON_ERROR_fix_safe); 786 write_sb = true; 787 } 788 789 if (write_sb) 790 bch2_write_super(c); 791 mutex_unlock(&c->sb_lock); 792 793 if (c->opts.fsck) 794 set_bit(BCH_FS_fsck_running, &c->flags); 795 if (c->sb.clean) 796 set_bit(BCH_FS_clean_recovery, &c->flags); 797 set_bit(BCH_FS_recovery_running, &c->flags); 798 799 ret = bch2_blacklist_table_initialize(c); 800 if (ret) { 801 bch_err(c, "error initializing blacklist table"); 802 goto err; 803 } 804 805 bch2_journal_pos_from_member_info_resume(c); 806 807 if (!c->sb.clean || c->opts.retain_recovery_info) { 808 struct genradix_iter iter; 809 struct journal_replay **i; 810 811 bch_verbose(c, "starting journal read"); 812 ret = bch2_journal_read(c, &last_seq, &blacklist_seq, &journal_seq); 813 if (ret) 814 goto err; 815 816 /* 817 * note: cmd_list_journal needs the blacklist table fully up to date so 818 * it can asterisk ignored journal entries: 819 */ 820 if (c->opts.read_journal_only) 821 goto out; 822 823 genradix_for_each_reverse(&c->journal_entries, iter, i) 824 if (!journal_replay_ignore(*i)) { 825 last_journal_entry = &(*i)->j; 826 break; 827 } 828 829 if (mustfix_fsck_err_on(c->sb.clean && 830 last_journal_entry && 831 !journal_entry_empty(last_journal_entry), c, 832 clean_but_journal_not_empty, 833 "filesystem marked clean but journal not empty")) { 834 c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); 835 SET_BCH_SB_CLEAN(c->disk_sb.sb, false); 836 c->sb.clean = false; 837 } 838 839 if (!last_journal_entry) { 840 fsck_err_on(!c->sb.clean, c, 841 dirty_but_no_journal_entries, 842 "no journal entries found"); 843 if (clean) 844 goto use_clean; 845 846 genradix_for_each_reverse(&c->journal_entries, iter, i) 847 if (*i) { 848 last_journal_entry = &(*i)->j; 849 (*i)->ignore_blacklisted = false; 850 (*i)->ignore_not_dirty= false; 851 /* 852 * This was probably a NO_FLUSH entry, 853 * so last_seq was garbage - but we know 854 * we're only using a single journal 855 * entry, set it here: 856 */ 857 (*i)->j.last_seq = (*i)->j.seq; 858 break; 859 } 860 } 861 862 ret = bch2_journal_keys_sort(c); 863 if (ret) 864 goto err; 865 866 if (c->sb.clean && last_journal_entry) { 867 ret = bch2_verify_superblock_clean(c, &clean, 868 last_journal_entry); 869 if (ret) 870 goto err; 871 } 872 } else { 873 use_clean: 874 if (!clean) { 875 bch_err(c, "no superblock clean section found"); 876 ret = -BCH_ERR_fsck_repair_impossible; 877 goto err; 878 879 } 880 blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1; 881 } 882 883 c->journal_replay_seq_start = last_seq; 884 c->journal_replay_seq_end = blacklist_seq - 1; 885 886 zero_out_btree_mem_ptr(&c->journal_keys); 887 888 ret = journal_replay_early(c, clean); 889 if (ret) 890 goto err; 891 892 if (c->opts.reconstruct_alloc) 893 bch2_reconstruct_alloc(c); 894 895 /* 896 * After an unclean shutdown, skip then next few journal sequence 897 * numbers as they may have been referenced by btree writes that 898 * happened before their corresponding journal writes - those btree 899 * writes need to be ignored, by skipping and blacklisting the next few 900 * journal sequence numbers: 901 */ 902 if (!c->sb.clean) 903 journal_seq += 8; 904 905 if (blacklist_seq != journal_seq) { 906 ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu", 907 blacklist_seq, journal_seq) ?: 908 bch2_journal_seq_blacklist_add(c, 909 blacklist_seq, journal_seq); 910 if (ret) { 911 bch_err_msg(c, ret, "error creating new journal seq blacklist entry"); 912 goto err; 913 } 914 } 915 916 ret = bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu", 917 journal_seq, last_seq, blacklist_seq - 1) ?: 918 bch2_fs_journal_start(&c->journal, journal_seq); 919 if (ret) 920 goto err; 921 922 /* 923 * Skip past versions that might have possibly been used (as nonces), 924 * but hadn't had their pointers written: 925 */ 926 if (c->sb.encryption_type && !c->sb.clean) 927 atomic64_add(1 << 16, &c->key_version); 928 929 ret = read_btree_roots(c); 930 if (ret) 931 goto err; 932 933 set_bit(BCH_FS_btree_running, &c->flags); 934 935 ret = bch2_sb_set_upgrade_extra(c); 936 937 ret = bch2_run_recovery_passes(c); 938 if (ret) 939 goto err; 940 941 /* 942 * Normally set by the appropriate recovery pass: when cleared, this 943 * indicates we're in early recovery and btree updates should be done by 944 * being applied to the journal replay keys. _Must_ be cleared before 945 * multithreaded use: 946 */ 947 set_bit(BCH_FS_may_go_rw, &c->flags); 948 clear_bit(BCH_FS_fsck_running, &c->flags); 949 clear_bit(BCH_FS_recovery_running, &c->flags); 950 951 /* in case we don't run journal replay, i.e. norecovery mode */ 952 set_bit(BCH_FS_accounting_replay_done, &c->flags); 953 954 bch2_async_btree_node_rewrites_flush(c); 955 956 /* fsync if we fixed errors */ 957 if (test_bit(BCH_FS_errors_fixed, &c->flags)) { 958 bch2_journal_flush_all_pins(&c->journal); 959 bch2_journal_meta(&c->journal); 960 } 961 962 /* If we fixed errors, verify that fs is actually clean now: */ 963 if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && 964 test_bit(BCH_FS_errors_fixed, &c->flags) && 965 !test_bit(BCH_FS_errors_not_fixed, &c->flags) && 966 !test_bit(BCH_FS_error, &c->flags)) { 967 bch2_flush_fsck_errs(c); 968 969 bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean"); 970 clear_bit(BCH_FS_errors_fixed, &c->flags); 971 972 c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; 973 974 ret = bch2_run_recovery_passes(c); 975 if (ret) 976 goto err; 977 978 if (test_bit(BCH_FS_errors_fixed, &c->flags) || 979 test_bit(BCH_FS_errors_not_fixed, &c->flags)) { 980 bch_err(c, "Second fsck run was not clean"); 981 set_bit(BCH_FS_errors_not_fixed, &c->flags); 982 } 983 984 set_bit(BCH_FS_errors_fixed, &c->flags); 985 } 986 987 if (enabled_qtypes(c)) { 988 bch_verbose(c, "reading quotas"); 989 ret = bch2_fs_quota_read(c); 990 if (ret) 991 goto err; 992 bch_verbose(c, "quotas done"); 993 } 994 995 mutex_lock(&c->sb_lock); 996 ext = bch2_sb_field_get(c->disk_sb.sb, ext); 997 write_sb = false; 998 999 if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) { 1000 SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, le16_to_cpu(c->disk_sb.sb->version)); 1001 write_sb = true; 1002 } 1003 1004 if (!test_bit(BCH_FS_error, &c->flags) && 1005 !(c->disk_sb.sb->compat[0] & cpu_to_le64(1ULL << BCH_COMPAT_alloc_info))) { 1006 c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); 1007 write_sb = true; 1008 } 1009 1010 if (!test_bit(BCH_FS_error, &c->flags) && 1011 !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent))) { 1012 memset(ext->errors_silent, 0, sizeof(ext->errors_silent)); 1013 write_sb = true; 1014 } 1015 1016 if (c->opts.fsck && 1017 !test_bit(BCH_FS_error, &c->flags) && 1018 c->recovery_pass_done == BCH_RECOVERY_PASS_NR - 1 && 1019 ext->btrees_lost_data) { 1020 ext->btrees_lost_data = 0; 1021 write_sb = true; 1022 } 1023 1024 if (c->opts.fsck && 1025 !test_bit(BCH_FS_error, &c->flags) && 1026 !test_bit(BCH_FS_errors_not_fixed, &c->flags)) { 1027 SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); 1028 SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0); 1029 write_sb = true; 1030 } 1031 1032 if (bch2_blacklist_entries_gc(c)) 1033 write_sb = true; 1034 1035 if (write_sb) 1036 bch2_write_super(c); 1037 mutex_unlock(&c->sb_lock); 1038 1039 if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || 1040 c->sb.version_min < bcachefs_metadata_version_btree_ptr_sectors_written) { 1041 struct bch_move_stats stats; 1042 1043 bch2_move_stats_init(&stats, "recovery"); 1044 1045 struct printbuf buf = PRINTBUF; 1046 bch2_version_to_text(&buf, c->sb.version_min); 1047 bch_info(c, "scanning for old btree nodes: min_version %s", buf.buf); 1048 printbuf_exit(&buf); 1049 1050 ret = bch2_fs_read_write_early(c) ?: 1051 bch2_scan_old_btree_nodes(c, &stats); 1052 if (ret) 1053 goto err; 1054 bch_info(c, "scanning for old btree nodes done"); 1055 } 1056 1057 ret = 0; 1058 out: 1059 bch2_flush_fsck_errs(c); 1060 1061 if (!c->opts.retain_recovery_info) { 1062 bch2_journal_keys_put_initial(c); 1063 bch2_find_btree_nodes_exit(&c->found_btree_nodes); 1064 } 1065 if (!IS_ERR(clean)) 1066 kfree(clean); 1067 1068 if (!ret && 1069 test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) && 1070 !c->opts.nochanges) { 1071 bch2_fs_read_write_early(c); 1072 bch2_delete_dead_snapshots_async(c); 1073 } 1074 1075 bch_err_fn(c, ret); 1076 return ret; 1077 err: 1078 fsck_err: 1079 bch2_fs_emergency_read_only(c); 1080 goto out; 1081 } 1082 1083 int bch2_fs_initialize(struct bch_fs *c) 1084 { 1085 struct bch_inode_unpacked root_inode, lostfound_inode; 1086 struct bkey_inode_buf packed_inode; 1087 struct qstr lostfound = QSTR("lost+found"); 1088 struct bch_member *m; 1089 int ret; 1090 1091 bch_notice(c, "initializing new filesystem"); 1092 set_bit(BCH_FS_new_fs, &c->flags); 1093 1094 mutex_lock(&c->sb_lock); 1095 c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); 1096 c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); 1097 1098 bch2_check_version_downgrade(c); 1099 1100 if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) { 1101 bch2_sb_upgrade(c, bcachefs_metadata_version_current, false); 1102 SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); 1103 bch2_write_super(c); 1104 } 1105 1106 for_each_member_device(c, ca) { 1107 m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); 1108 SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, false); 1109 ca->mi = bch2_mi_to_cpu(m); 1110 } 1111 1112 bch2_write_super(c); 1113 mutex_unlock(&c->sb_lock); 1114 1115 set_bit(BCH_FS_btree_running, &c->flags); 1116 set_bit(BCH_FS_may_go_rw, &c->flags); 1117 1118 for (unsigned i = 0; i < BTREE_ID_NR; i++) 1119 bch2_btree_root_alloc_fake(c, i, 0); 1120 1121 ret = bch2_fs_journal_alloc(c); 1122 if (ret) 1123 goto err; 1124 1125 /* 1126 * journal_res_get() will crash if called before this has 1127 * set up the journal.pin FIFO and journal.cur pointer: 1128 */ 1129 bch2_fs_journal_start(&c->journal, 1); 1130 set_bit(BCH_FS_accounting_replay_done, &c->flags); 1131 bch2_journal_set_replay_done(&c->journal); 1132 1133 ret = bch2_fs_read_write_early(c); 1134 if (ret) 1135 goto err; 1136 1137 for_each_member_device(c, ca) { 1138 ret = bch2_dev_usage_init(ca, false); 1139 if (ret) { 1140 bch2_dev_put(ca); 1141 goto err; 1142 } 1143 } 1144 1145 /* 1146 * Write out the superblock and journal buckets, now that we can do 1147 * btree updates 1148 */ 1149 bch_verbose(c, "marking superblocks"); 1150 ret = bch2_trans_mark_dev_sbs(c); 1151 bch_err_msg(c, ret, "marking superblocks"); 1152 if (ret) 1153 goto err; 1154 1155 ret = bch2_fs_freespace_init(c); 1156 if (ret) 1157 goto err; 1158 1159 ret = bch2_initialize_subvolumes(c); 1160 if (ret) 1161 goto err; 1162 1163 bch_verbose(c, "reading snapshots table"); 1164 ret = bch2_snapshots_read(c); 1165 if (ret) 1166 goto err; 1167 bch_verbose(c, "reading snapshots done"); 1168 1169 bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 0, NULL); 1170 root_inode.bi_inum = BCACHEFS_ROOT_INO; 1171 root_inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; 1172 bch2_inode_pack(&packed_inode, &root_inode); 1173 packed_inode.inode.k.p.snapshot = U32_MAX; 1174 1175 ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0, 0); 1176 bch_err_msg(c, ret, "creating root directory"); 1177 if (ret) 1178 goto err; 1179 1180 bch2_inode_init_early(c, &lostfound_inode); 1181 1182 ret = bch2_trans_commit_do(c, NULL, NULL, 0, 1183 bch2_create_trans(trans, 1184 BCACHEFS_ROOT_SUBVOL_INUM, 1185 &root_inode, &lostfound_inode, 1186 &lostfound, 1187 0, 0, S_IFDIR|0700, 0, 1188 NULL, NULL, (subvol_inum) { 0 }, 0)); 1189 bch_err_msg(c, ret, "creating lost+found"); 1190 if (ret) 1191 goto err; 1192 1193 c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1; 1194 1195 if (enabled_qtypes(c)) { 1196 ret = bch2_fs_quota_read(c); 1197 if (ret) 1198 goto err; 1199 } 1200 1201 ret = bch2_journal_flush(&c->journal); 1202 bch_err_msg(c, ret, "writing first journal entry"); 1203 if (ret) 1204 goto err; 1205 1206 mutex_lock(&c->sb_lock); 1207 SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); 1208 SET_BCH_SB_CLEAN(c->disk_sb.sb, false); 1209 1210 bch2_write_super(c); 1211 mutex_unlock(&c->sb_lock); 1212 1213 c->curr_recovery_pass = BCH_RECOVERY_PASS_NR; 1214 return 0; 1215 err: 1216 bch_err_fn(c, ret); 1217 return ret; 1218 } 1219