// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc_background.h"
#include "bkey_buf.h"
#include "btree_journal_iter.h"
#include "btree_node_scan.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "buckets.h"
#include "dirent.h"
#include "disk_accounting.h"
#include "errcode.h"
#include "error.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "logged_ops.h"
#include "move.h"
#include "movinggc.h"
#include "namei.h"
#include "quota.h"
#include "rebalance.h"
#include "recovery.h"
#include "recovery_passes.h"
#include "replicas.h"
#include "sb-clean.h"
#include "sb-downgrade.h"
#include "snapshot.h"
#include "super-io.h"

#include <linux/sort.h>
#include <linux/stat.h>

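/*
 * Called when we discover that a btree has lost data: record the damage in
 * the superblock and schedule the recovery passes that can repair or
 * reconstruct what was lost. Which passes those are depends on the btree;
 * errors the repair is expected to trip over are marked silent so a
 * subsequent fsck run can still come back clean.
 */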
int bch2_btree_lost_data(struct bch_fs *c,
			 struct printbuf *msg,
			 enum btree_id btree)
{
	u64 b = BIT_ULL(btree);
	int ret = 0;

	mutex_lock(&c->sb_lock);
	struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);

	if (!(c->sb.btrees_lost_data & b)) {
		prt_printf(msg, "flagging btree ");
		bch2_btree_id_to_text(msg, btree);
		prt_printf(msg, " lost data\n");

		ext->btrees_lost_data |= cpu_to_le64(b);
	}

	/* Once we have runtime self healing for topology errors we won't need this: */
	ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret;

	/* Btree node accounting will be off: */
	__set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent);
	ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret;

#ifdef CONFIG_BCACHEFS_DEBUG
	/*
	 * These are much more minor, and don't need to be corrected right away,
	 * but in debug mode we want the next fsck run to be clean:
	 */
	ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_lrus, 0) ?: ret;
	ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_backpointers_to_extents, 0) ?: ret;
#endif

	switch (btree) {
	case BTREE_ID_alloc:
		ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;

		__set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
		__set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
		__set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
		__set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent);
		__set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent);
		__set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
		goto out;
	case BTREE_ID_backpointers:
		ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_btree_backpointers, 0) ?: ret;
		ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_extents_to_backpointers, 0) ?: ret;
		goto out;
	case BTREE_ID_need_discard:
		ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
		goto out;
	case BTREE_ID_freespace:
		ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
		goto out;
	case BTREE_ID_bucket_gens:
		ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
		goto out;
	case BTREE_ID_lru:
		ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
		goto out;
	case BTREE_ID_accounting:
		ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret;
		goto out;
	case BTREE_ID_snapshots:
		ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_reconstruct_snapshots, 0) ?: ret;
		ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret;
		goto out;
	default:
		ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret;
		goto out;
	}
out:
	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	return ret;
}

static void kill_btree(struct bch_fs *c, enum btree_id btree)
{
	bch2_btree_id_root(c, btree)->alive = false;
	bch2_shoot_down_journal_keys(c, btree, 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
}

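/*
 * Drop all allocation info and rebuild it from the primary btrees: flag
 * the recovery passes that reconstruct alloc info from extents and
 * backpointers, silence the fsck errors the rebuild will unavoidably
 * trigger, then kill the alloc btree roots and any journal keys pointing
 * into them.
 */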
/* for -o reconstruct_alloc: */
void bch2_reconstruct_alloc(struct bch_fs *c)
{
	mutex_lock(&c->sb_lock);
	struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);

	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required);
	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_info, ext->recovery_passes_required);
	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_lrus, ext->recovery_passes_required);
	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_extents_to_backpointers, ext->recovery_passes_required);
	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_to_lru_refs, ext->recovery_passes_required);

	__set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_alloc_key, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_stale_dirty_ptr, ext->errors_silent);

	__set_bit_le64(BCH_FSCK_ERR_dev_usage_buckets_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_dev_usage_sectors_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_dev_usage_fragmented_wrong, ext->errors_silent);

	__set_bit_le64(BCH_FSCK_ERR_fs_usage_btree_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_fs_usage_cached_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_fs_usage_replicas_wrong, ext->errors_silent);

	__set_bit_le64(BCH_FSCK_ERR_alloc_key_to_missing_lru_entry, ext->errors_silent);

	__set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_freespace_key_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_bucket_gens_key_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent);
	c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);

	c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));

	c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_no_alloc_info));

	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	for (unsigned i = 0; i < btree_id_nr_alive(c); i++)
		if (btree_id_is_alloc(i))
			kill_btree(c, i);
}

/*
 * Btree node pointers have a field to stack a pointer to the in memory btree
 * node; we need to zero out this field when reading in btree nodes, or when
 * reading in keys from the journal:
 */
static void zero_out_btree_mem_ptr(struct journal_keys *keys)
{
	darray_for_each(*keys, i)
		if (i->k->k.type == KEY_TYPE_btree_ptr_v2)
			bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0;
}

/* journal replay: */

static void replay_now_at(struct journal *j, u64 seq)
{
	BUG_ON(seq < j->replay_journal_seq);

	seq = min(seq, j->replay_journal_seq_end);

	while (j->replay_journal_seq < seq)
		bch2_journal_pin_put(j, j->replay_journal_seq++);
}

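/*
 * Accounting keys are deltas, not absolute values, so they can't just
 * overwrite what's in the btree: if the key version shows the delta has
 * already been applied we skip it; otherwise we accumulate the existing
 * accounting value into the new key before the update.
 */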
static int bch2_journal_replay_accounting_key(struct btree_trans *trans,
					      struct journal_key *k)
{
	struct btree_iter iter;
	bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
				  BTREE_MAX_DEPTH, k->level,
				  BTREE_ITER_intent);
	int ret = bch2_btree_iter_traverse(trans, &iter);
	if (ret)
		goto out;

	struct bkey u;
	struct bkey_s_c old = bch2_btree_path_peek_slot(btree_iter_path(trans, &iter), &u);

	/* Has this delta already been applied to the btree? */
	if (bversion_cmp(old.k->bversion, k->k->k.bversion) >= 0) {
		ret = 0;
		goto out;
	}

	struct bkey_i *new = k->k;
	if (old.k->type == KEY_TYPE_accounting) {
		new = bch2_bkey_make_mut_noupdate(trans, bkey_i_to_s_c(k->k));
		ret = PTR_ERR_OR_ZERO(new);
		if (ret)
			goto out;

		bch2_accounting_accumulate(bkey_i_to_accounting(new),
					   bkey_s_c_to_accounting(old));
	}

	trans->journal_res.seq = k->journal_seq;

	ret = bch2_trans_update(trans, &iter, new, BTREE_TRIGGER_norun);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static int bch2_journal_replay_key(struct btree_trans *trans,
				   struct journal_key *k)
{
	struct btree_iter iter;
	unsigned iter_flags =
		BTREE_ITER_intent|
		BTREE_ITER_not_extents;
	unsigned update_flags = BTREE_TRIGGER_norun;
	int ret;

	if (k->overwritten)
		return 0;

	trans->journal_res.seq = k->journal_seq;

	/*
	 * BTREE_UPDATE_key_cache_reclaim disables key cache lookup/update to
	 * keep the key cache coherent with the underlying btree. Nothing
	 * besides the allocator is doing updates yet so we don't need key cache
	 * coherency for non-alloc btrees, and key cache fills for snapshots
	 * btrees use BTREE_ITER_filter_snapshots, which isn't available until
	 * the snapshots recovery pass runs.
	 */
	if (!k->level && k->btree_id == BTREE_ID_alloc)
		iter_flags |= BTREE_ITER_cached;
	else
		update_flags |= BTREE_UPDATE_key_cache_reclaim;

	bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
				  BTREE_MAX_DEPTH, k->level,
				  iter_flags);
	ret = bch2_btree_iter_traverse(trans, &iter);
	if (ret)
		goto out;

	struct btree_path *path = btree_iter_path(trans, &iter);
	if (unlikely(!btree_path_node(path, k->level))) {
		bch2_trans_iter_exit(trans, &iter);
		bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
					  BTREE_MAX_DEPTH, 0, iter_flags);
		ret =   bch2_btree_iter_traverse(trans, &iter) ?:
			bch2_btree_increase_depth(trans, iter.path, 0) ?:
			-BCH_ERR_transaction_restart_nested;
		goto out;
	}

	/* Must be checked with btree locked: */
	if (k->overwritten)
		goto out;

	if (k->k->k.type == KEY_TYPE_accounting) {
		struct bkey_i *n = bch2_trans_subbuf_alloc(trans, &trans->accounting, k->k->k.u64s);
		ret = PTR_ERR_OR_ZERO(n);
		if (ret)
			goto out;

		bkey_copy(n, k->k);
		goto out;
	}

	ret = bch2_trans_update(trans, &iter, k->k, update_flags);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

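/*
 * Sort keys that failed the fastpath into journal order, so their journal
 * pins can be dropped as replay progresses. Note the u64 underflow in
 * cmp_int() below: e.g. seqs { 0, 5, 7 } sort as { 5, 7, 0 }, since
 * 0 - 1 == U64_MAX.
 */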
static int journal_sort_seq_cmp(const void *_l, const void *_r)
{
	const struct journal_key *l = *((const struct journal_key **)_l);
	const struct journal_key *r = *((const struct journal_key **)_r);

	/*
	 * Map 0 to U64_MAX, so that keys with journal_seq == 0 come last
	 *
	 * journal_seq == 0 means that the key comes from early repair, and
	 * should be inserted last so as to avoid overflowing the journal
	 */
	return cmp_int(l->journal_seq - 1, r->journal_seq - 1);
}

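/*
 * Replay keys from the journal into the btree, in three phases:
 *
 * 1) accounting keys, which must all be applied before the btree write
 *    buffer is allowed to flush anything;
 * 2) everything else in sorted (btree) order, for locality of btree
 *    access - the fastpath, skipped when the journal is low on space;
 * 3) whatever the fastpath couldn't handle, in journal order, unpinning
 *    journal entries as we go.
 */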
int bch2_journal_replay(struct bch_fs *c)
{
	struct journal_keys *keys = &c->journal_keys;
	DARRAY(struct journal_key *) keys_sorted = { 0 };
	struct journal *j = &c->journal;
	u64 start_seq = c->journal_replay_seq_start;
	u64 end_seq = c->journal_replay_seq_end;
	struct btree_trans *trans = NULL;
	bool immediate_flush = false;
	int ret = 0;

	if (keys->nr) {
		ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)",
					   keys->nr, start_seq, end_seq);
		if (ret)
			goto err;
	}

	BUG_ON(!atomic_read(&keys->ref));

	move_gap(keys, keys->nr);
	trans = bch2_trans_get(c);

	/*
	 * Replay accounting keys first: we can't allow the write buffer to
	 * flush accounting keys until we're done
	 */
	darray_for_each(*keys, k) {
		if (!(k->k->k.type == KEY_TYPE_accounting && !k->allocated))
			continue;

		cond_resched();

		ret = commit_do(trans, NULL, NULL,
				BCH_TRANS_COMMIT_no_enospc|
				BCH_TRANS_COMMIT_journal_reclaim|
				BCH_TRANS_COMMIT_skip_accounting_apply|
				BCH_TRANS_COMMIT_no_journal_res|
				BCH_WATERMARK_reclaim,
				bch2_journal_replay_accounting_key(trans, k));
		if (bch2_fs_fatal_err_on(ret, c, "error replaying accounting; %s", bch2_err_str(ret)))
			goto err;

		k->overwritten = true;
	}

	set_bit(BCH_FS_accounting_replay_done, &c->flags);

	/*
	 * First, attempt to replay keys in sorted order. This is more
	 * efficient - better locality of btree access - but some might fail if
	 * that would cause a journal deadlock.
	 */
	darray_for_each(*keys, k) {
		cond_resched();

		/*
		 * k->allocated means the key wasn't read in from the journal,
		 * rather it was from early repair code
		 */
		if (k->allocated)
			immediate_flush = true;

		/* Skip fastpath if we're low on space in the journal */
		ret = c->journal.watermark ? -1 :
			commit_do(trans, NULL, NULL,
				  BCH_TRANS_COMMIT_no_enospc|
				  BCH_TRANS_COMMIT_journal_reclaim|
				  BCH_TRANS_COMMIT_skip_accounting_apply|
				  (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0),
				  bch2_journal_replay_key(trans, k));
		BUG_ON(!ret && !k->overwritten && k->k->k.type != KEY_TYPE_accounting);
		if (ret) {
			ret = darray_push(&keys_sorted, k);
			if (ret)
				goto err;
		}
	}

	bch2_trans_unlock_long(trans);
	/*
	 * Now, replay any remaining keys in the order in which they appear in
	 * the journal, unpinning those journal entries as we go:
	 */
	sort_nonatomic(keys_sorted.data, keys_sorted.nr,
		       sizeof(keys_sorted.data[0]),
		       journal_sort_seq_cmp, NULL);

	darray_for_each(keys_sorted, kp) {
		cond_resched();

		struct journal_key *k = *kp;

		if (k->journal_seq)
			replay_now_at(j, k->journal_seq);
		else
			replay_now_at(j, j->replay_journal_seq_end);

		ret = commit_do(trans, NULL, NULL,
				BCH_TRANS_COMMIT_no_enospc|
				BCH_TRANS_COMMIT_skip_accounting_apply|
				(!k->allocated
				 ? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim
				 : 0),
				bch2_journal_replay_key(trans, k));
		if (ret) {
			struct printbuf buf = PRINTBUF;
			bch2_btree_id_level_to_text(&buf, k->btree_id, k->level);
			bch_err_msg(c, ret, "while replaying key at %s:", buf.buf);
			printbuf_exit(&buf);
			goto err;
		}

		BUG_ON(k->btree_id != BTREE_ID_accounting && !k->overwritten);
	}

	/*
	 * We need to put our btree_trans before calling flush_all_pins(), since
	 * that will use a btree_trans internally
	 */
	bch2_trans_put(trans);
	trans = NULL;

	if (!c->opts.retain_recovery_info &&
	    c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay)
		bch2_journal_keys_put_initial(c);

	replay_now_at(j, j->replay_journal_seq_end);
	j->replay_journal_seq = 0;

	bch2_journal_set_replay_done(j);

	/* if we did any repair, flush it immediately */
	if (immediate_flush) {
		bch2_journal_flush_all_pins(&c->journal);
		ret = bch2_journal_meta(&c->journal);
	}

	if (keys->nr)
		bch2_journal_log_msg(c, "journal replay finished");
err:
	if (trans)
		bch2_trans_put(trans);
	darray_exit(&keys_sorted);
	bch_err_fn(c, ret);
	return ret;
}

/* journal replay early: */

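/*
 * Apply the journal entries that don't carry btree keys - btree roots,
 * usage counters, blacklisted sequence numbers, IO clocks - which must be
 * processed before the btree is usable:
 */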
static int journal_replay_entry_early(struct bch_fs *c,
				      struct jset_entry *entry)
{
	int ret = 0;

	switch (entry->type) {
	case BCH_JSET_ENTRY_btree_root: {
		if (unlikely(!entry->u64s))
			return 0;

		if (fsck_err_on(entry->btree_id >= BTREE_ID_NR_MAX,
				c, invalid_btree_id,
				"invalid btree id %u (max %u)",
				entry->btree_id, BTREE_ID_NR_MAX))
			return 0;

		while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) {
			ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL });
			if (ret)
				return ret;
		}

		struct btree_root *r = bch2_btree_id_root(c, entry->btree_id);

		r->level = entry->level;
		bkey_copy(&r->key, (struct bkey_i *) entry->start);
		r->error = 0;
		r->alive = true;
		break;
	}
	case BCH_JSET_ENTRY_usage: {
		struct jset_entry_usage *u =
			container_of(entry, struct jset_entry_usage, entry);

		switch (entry->btree_id) {
		case BCH_FS_USAGE_key_version:
			atomic64_set(&c->key_version, le64_to_cpu(u->v));
			break;
		}
		break;
	}
	case BCH_JSET_ENTRY_blacklist: {
		struct jset_entry_blacklist *bl_entry =
			container_of(entry, struct jset_entry_blacklist, entry);

		ret = bch2_journal_seq_blacklist_add(c,
				le64_to_cpu(bl_entry->seq),
				le64_to_cpu(bl_entry->seq) + 1);
		break;
	}
	case BCH_JSET_ENTRY_blacklist_v2: {
		struct jset_entry_blacklist_v2 *bl_entry =
			container_of(entry, struct jset_entry_blacklist_v2, entry);

		ret = bch2_journal_seq_blacklist_add(c,
				le64_to_cpu(bl_entry->start),
				le64_to_cpu(bl_entry->end) + 1);
		break;
	}
	case BCH_JSET_ENTRY_clock: {
		struct jset_entry_clock *clock =
			container_of(entry, struct jset_entry_clock, entry);

		atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time));
	}
	}
fsck_err:
	return ret;
}

static int journal_replay_early(struct bch_fs *c,
				struct bch_sb_field_clean *clean)
{
	if (clean) {
		for (struct jset_entry *entry = clean->start;
		     entry != vstruct_end(&clean->field);
		     entry = vstruct_next(entry)) {
			int ret = journal_replay_entry_early(c, entry);
			if (ret)
				return ret;
		}
	} else {
		struct genradix_iter iter;
		struct journal_replay *i, **_i;

		genradix_for_each(&c->journal_entries, iter, _i) {
			i = *_i;

			if (journal_replay_ignore(i))
				continue;

			vstruct_for_each(&i->j, entry) {
				int ret = journal_replay_entry_early(c, entry);
				if (ret)
					return ret;
			}
		}
	}

	return 0;
}

/* sb clean section: */

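/*
 * Read in the btree roots collected by journal_replay_entry_early(). An
 * invalid or unreadable root is an fsck error, but on alloc btrees the
 * error is cleared since that info can be reconstructed; btrees left
 * without a root get an empty fake root:
 */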
static int read_btree_roots(struct bch_fs *c)
{
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (!r->alive)
			continue;

		printbuf_reset(&buf);
		bch2_btree_id_level_to_text(&buf, i, r->level);

		if (mustfix_fsck_err_on((ret = r->error),
					c, btree_root_bkey_invalid,
					"invalid btree root %s",
					buf.buf) ||
		    mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)),
					c, btree_root_read_error,
					"error reading btree root %s: %s",
					buf.buf, bch2_err_str(ret))) {
			if (btree_id_is_alloc(i))
				r->error = 0;
		}
	}

	for (unsigned i = 0; i < BTREE_ID_NR; i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (!r->b && !r->error) {
			r->alive = false;
			r->level = 0;
			bch2_btree_root_alloc_fake(c, i, 0);
		}
	}
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

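/*
 * Decide whether to upgrade the on-disk metadata version, based on the
 * version_upgrade option and the version the filesystem was last written
 * with; returns true if the superblock needs to be rewritten:
 */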
static bool check_version_upgrade(struct bch_fs *c)
{
	unsigned latest_version = bcachefs_metadata_version_current;
	unsigned latest_compatible = min(latest_version,
					 bch2_latest_compatible_version(c->sb.version));
	unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
	unsigned new_version = 0;
	bool ret = false;

	if (old_version < bcachefs_metadata_required_upgrade_below) {
		if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible ||
		    latest_compatible < bcachefs_metadata_required_upgrade_below)
			new_version = latest_version;
		else
			new_version = latest_compatible;
	} else {
		switch (c->opts.version_upgrade) {
		case BCH_VERSION_UPGRADE_compatible:
			new_version = latest_compatible;
			break;
		case BCH_VERSION_UPGRADE_incompatible:
			new_version = latest_version;
			break;
		case BCH_VERSION_UPGRADE_none:
			new_version = min(old_version, latest_version);
			break;
		}
	}

	if (new_version > old_version) {
		struct printbuf buf = PRINTBUF;

		if (old_version < bcachefs_metadata_required_upgrade_below)
			prt_str(&buf, "Version upgrade required:\n");

		if (old_version != c->sb.version) {
			prt_str(&buf, "Version upgrade from ");
			bch2_version_to_text(&buf, c->sb.version_upgrade_complete);
			prt_str(&buf, " to ");
			bch2_version_to_text(&buf, c->sb.version);
			prt_str(&buf, " incomplete\n");
		}

		prt_printf(&buf, "Doing %s version upgrade from ",
			   BCH_VERSION_MAJOR(old_version) != BCH_VERSION_MAJOR(new_version)
			   ? "incompatible" : "compatible");
		bch2_version_to_text(&buf, old_version);
		prt_str(&buf, " to ");
		bch2_version_to_text(&buf, new_version);
		prt_newline(&buf);

		struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
		__le64 passes = ext->recovery_passes_required[0];
		bch2_sb_set_upgrade(c, old_version, new_version);
		passes = ext->recovery_passes_required[0] & ~passes;

		if (passes) {
			prt_str(&buf, "  running recovery passes: ");
			prt_bitflags(&buf, bch2_recovery_passes,
				     bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
		}

		bch_notice(c, "%s", buf.buf);
		printbuf_exit(&buf);

		ret = true;
	}

	if (new_version > c->sb.version_incompat &&
	    c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible) {
		struct printbuf buf = PRINTBUF;

		prt_str(&buf, "Now allowing incompatible features up to ");
		bch2_version_to_text(&buf, new_version);
		prt_str(&buf, ", previously allowed up to ");
		bch2_version_to_text(&buf, c->sb.version_incompat_allowed);
		prt_newline(&buf);

		bch_notice(c, "%s", buf.buf);
		printbuf_exit(&buf);

		ret = true;
	}

	if (ret)
		bch2_sb_upgrade(c, new_version,
				c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible);

	return ret;
}

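/*
 * Filesystem recovery, run at mount time: read the superblock clean
 * section and/or the journal, apply early journal entries, start the
 * journal, read btree roots, then run the recovery passes - including
 * journal replay, and fsck if requested. On success, the superblock is
 * updated to reflect a clean, fully upgraded filesystem.
 */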
int bch2_fs_recovery(struct bch_fs *c)
{
	struct bch_sb_field_clean *clean = NULL;
	struct jset *last_journal_entry = NULL;
	u64 last_seq = 0, blacklist_seq, journal_seq;
	int ret = 0;

	if (c->sb.clean) {
		clean = bch2_read_superblock_clean(c);
		ret = PTR_ERR_OR_ZERO(clean);
		if (ret)
			goto err;

		bch_info(c, "recovering from clean shutdown, journal seq %llu",
			 le64_to_cpu(clean->journal_seq));
	} else {
		bch_info(c, "recovering from unclean shutdown");
	}

	if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) {
		bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported");
		ret = -EINVAL;
		goto err;
	}

	if (!c->sb.clean &&
	    !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) {
		bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix");
		ret = -EINVAL;
		goto err;
	}

	if (c->opts.norecovery) {
		c->opts.recovery_pass_last = c->opts.recovery_pass_last
			? min(c->opts.recovery_pass_last, BCH_RECOVERY_PASS_snapshots_read)
			: BCH_RECOVERY_PASS_snapshots_read;
		c->opts.nochanges = true;
		c->opts.read_only = true;
	}

	mutex_lock(&c->sb_lock);
	struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
	bool write_sb = false;

	if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) {
		ext->recovery_passes_required[0] |=
			cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology)));
		write_sb = true;
	}

	u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
	if (sb_passes) {
		struct printbuf buf = PRINTBUF;
		prt_str(&buf, "superblock requires following recovery passes to be run:\n  ");
		prt_bitflags(&buf, bch2_recovery_passes, sb_passes);
		bch_info(c, "%s", buf.buf);
		printbuf_exit(&buf);
	}

	if (bch2_check_version_downgrade(c)) {
		struct printbuf buf = PRINTBUF;

		prt_str(&buf, "Version downgrade required:");

		__le64 passes = ext->recovery_passes_required[0];
		bch2_sb_set_downgrade(c,
				      BCH_VERSION_MINOR(bcachefs_metadata_version_current),
				      BCH_VERSION_MINOR(c->sb.version));
		passes = ext->recovery_passes_required[0] & ~passes;
		if (passes) {
			prt_str(&buf, "\n  running recovery passes: ");
			prt_bitflags(&buf, bch2_recovery_passes,
				     bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
		}

		bch_info(c, "%s", buf.buf);
		printbuf_exit(&buf);
		write_sb = true;
	}

	if (check_version_upgrade(c))
		write_sb = true;

	c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));

	if (c->sb.version_upgrade_complete < bcachefs_metadata_version_autofix_errors) {
		SET_BCH_SB_ERROR_ACTION(c->disk_sb.sb, BCH_ON_ERROR_fix_safe);
		write_sb = true;
	}

	if (write_sb)
		bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	if (c->sb.clean)
		set_bit(BCH_FS_clean_recovery, &c->flags);
	if (c->opts.fsck)
		set_bit(BCH_FS_in_fsck, &c->flags);
	set_bit(BCH_FS_in_recovery, &c->flags);

	ret = bch2_blacklist_table_initialize(c);
	if (ret) {
		bch_err(c, "error initializing blacklist table");
		goto err;
	}

	bch2_journal_pos_from_member_info_resume(c);

	if (!c->sb.clean || c->opts.retain_recovery_info) {
		struct genradix_iter iter;
		struct journal_replay **i;

		bch_verbose(c, "starting journal read");
		ret = bch2_journal_read(c, &last_seq, &blacklist_seq, &journal_seq);
		if (ret)
			goto err;

		/*
		 * note: cmd_list_journal needs the blacklist table fully up to date so
		 * it can asterisk ignored journal entries:
		 */
		if (c->opts.read_journal_only)
			goto out;

		genradix_for_each_reverse(&c->journal_entries, iter, i)
			if (!journal_replay_ignore(*i)) {
				last_journal_entry = &(*i)->j;
				break;
			}

		if (mustfix_fsck_err_on(c->sb.clean &&
					last_journal_entry &&
					!journal_entry_empty(last_journal_entry), c,
				clean_but_journal_not_empty,
				"filesystem marked clean but journal not empty")) {
			c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
			SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
			c->sb.clean = false;
		}

		if (!last_journal_entry) {
			fsck_err_on(!c->sb.clean, c,
				    dirty_but_no_journal_entries,
				    "no journal entries found");
			if (clean)
				goto use_clean;

			genradix_for_each_reverse(&c->journal_entries, iter, i)
				if (*i) {
					last_journal_entry = &(*i)->j;
					(*i)->ignore_blacklisted = false;
					(*i)->ignore_not_dirty = false;
					/*
					 * This was probably a NO_FLUSH entry,
					 * so last_seq was garbage - but we know
					 * we're only using a single journal
					 * entry, set it here:
					 */
					(*i)->j.last_seq = (*i)->j.seq;
					break;
				}
		}

		ret = bch2_journal_keys_sort(c);
		if (ret)
			goto err;

		if (c->sb.clean && last_journal_entry) {
			ret = bch2_verify_superblock_clean(c, &clean,
							   last_journal_entry);
			if (ret)
				goto err;
		}
	} else {
use_clean:
		if (!clean) {
			bch_err(c, "no superblock clean section found");
			ret = bch_err_throw(c, fsck_repair_impossible);
			goto err;
		}
		blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1;
	}

	c->journal_replay_seq_start = last_seq;
	c->journal_replay_seq_end = blacklist_seq - 1;

	zero_out_btree_mem_ptr(&c->journal_keys);

	ret = journal_replay_early(c, clean);
	if (ret)
		goto err;

	ret = bch2_fs_resize_on_mount(c);
	if (ret) {
		up_write(&c->state_lock);
		goto err;
	}

	if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) {
		bch_info(c, "filesystem is an unresized image file, mounting ro");
		c->opts.read_only = true;
	}

	if (!c->opts.read_only &&
	    (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) {
		bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate");

		bch2_reconstruct_alloc(c);
	} else if (c->opts.reconstruct_alloc) {
		bch2_journal_log_msg(c, "dropping alloc info");
		bch_info(c, "dropping and reconstructing all alloc info");

		bch2_reconstruct_alloc(c);
	}

	if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) {
		/* We can't go RW to fix errors without alloc info */
		if (c->opts.fix_errors == FSCK_FIX_yes ||
		    c->opts.fix_errors == FSCK_FIX_ask)
			c->opts.fix_errors = FSCK_FIX_no;
		if (c->opts.errors == BCH_ON_ERROR_fix_safe)
			c->opts.errors = BCH_ON_ERROR_continue;
	}

	/*
	 * After an unclean shutdown, skip the next few journal sequence
	 * numbers as they may have been referenced by btree writes that
	 * happened before their corresponding journal writes - those btree
	 * writes need to be ignored, by skipping and blacklisting the next few
	 * journal sequence numbers:
	 */
	if (!c->sb.clean)
		journal_seq += JOURNAL_BUF_NR * 4;

	if (blacklist_seq != journal_seq) {
		ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu",
					   blacklist_seq, journal_seq) ?:
			bch2_journal_seq_blacklist_add(c,
						       blacklist_seq, journal_seq);
		if (ret) {
			bch_err_msg(c, ret, "error creating new journal seq blacklist entry");
			goto err;
		}
	}

	ret = bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu",
				   journal_seq, last_seq, blacklist_seq - 1) ?:
		bch2_fs_journal_start(&c->journal, journal_seq);
	if (ret)
		goto err;

	/*
	 * Skip past versions that might have possibly been used (as nonces),
	 * but hadn't had their pointers written:
	 */
	if (c->sb.encryption_type && !c->sb.clean)
		atomic64_add(1 << 16, &c->key_version);

	ret = read_btree_roots(c);
	if (ret)
		goto err;

	set_bit(BCH_FS_btree_running, &c->flags);

	ret = bch2_sb_set_upgrade_extra(c);
	if (ret)
		goto err;

	ret = bch2_run_recovery_passes(c, 0);
	if (ret)
		goto err;

	/*
	 * Normally set by the appropriate recovery pass: when cleared, this
	 * indicates we're in early recovery and btree updates should be done by
	 * being applied to the journal replay keys. _Must_ be set before
	 * multithreaded use:
	 */
	set_bit(BCH_FS_may_go_rw, &c->flags);
	clear_bit(BCH_FS_in_fsck, &c->flags);

	/* in case we don't run journal replay, i.e. norecovery mode */
	set_bit(BCH_FS_accounting_replay_done, &c->flags);

	bch2_async_btree_node_rewrites_flush(c);

	/* fsync if we fixed errors */
	if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
		bch2_journal_flush_all_pins(&c->journal);
		bch2_journal_meta(&c->journal);
	}

	/* If we fixed errors, verify that fs is actually clean now: */
	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
	    test_bit(BCH_FS_errors_fixed, &c->flags) &&
	    !test_bit(BCH_FS_errors_not_fixed, &c->flags) &&
	    !test_bit(BCH_FS_error, &c->flags)) {
		bch2_flush_fsck_errs(c);

		bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean");
		clear_bit(BCH_FS_errors_fixed, &c->flags);

		ret = bch2_run_recovery_passes(c,
					       BCH_RECOVERY_PASS_check_alloc_info);
		if (ret)
			goto err;

		if (test_bit(BCH_FS_errors_fixed, &c->flags) ||
		    test_bit(BCH_FS_errors_not_fixed, &c->flags)) {
			bch_err(c, "Second fsck run was not clean");
			set_bit(BCH_FS_errors_not_fixed, &c->flags);
		}

		set_bit(BCH_FS_errors_fixed, &c->flags);
	}

	if (enabled_qtypes(c)) {
		bch_verbose(c, "reading quotas");
		ret = bch2_fs_quota_read(c);
		if (ret)
			goto err;
		bch_verbose(c, "quotas done");
	}

	mutex_lock(&c->sb_lock);
	ext = bch2_sb_field_get(c->disk_sb.sb, ext);
	write_sb = false;

	if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) {
		SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, le16_to_cpu(c->disk_sb.sb->version));
		write_sb = true;
	}

	if (!test_bit(BCH_FS_error, &c->flags) &&
	    !(c->disk_sb.sb->compat[0] & cpu_to_le64(1ULL << BCH_COMPAT_alloc_info))) {
		c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
		write_sb = true;
	}

	if (!test_bit(BCH_FS_error, &c->flags) &&
	    !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent))) {
		memset(ext->errors_silent, 0, sizeof(ext->errors_silent));
		write_sb = true;
	}

	if (c->opts.fsck &&
	    !test_bit(BCH_FS_error, &c->flags) &&
	    c->recovery.pass_done == BCH_RECOVERY_PASS_NR - 1 &&
	    ext->btrees_lost_data) {
		ext->btrees_lost_data = 0;
		write_sb = true;
	}

	if (c->opts.fsck &&
	    !test_bit(BCH_FS_error, &c->flags) &&
	    !test_bit(BCH_FS_errors_not_fixed, &c->flags)) {
		SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0);
		SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0);
		write_sb = true;
	}

	if (bch2_blacklist_entries_gc(c))
		write_sb = true;

	if (write_sb)
		bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
	    c->sb.version_min < bcachefs_metadata_version_btree_ptr_sectors_written) {
		struct bch_move_stats stats;

		bch2_move_stats_init(&stats, "recovery");

		struct printbuf buf = PRINTBUF;
		bch2_version_to_text(&buf, c->sb.version_min);
		bch_info(c, "scanning for old btree nodes: min_version %s", buf.buf);
		printbuf_exit(&buf);

		ret =   bch2_fs_read_write_early(c) ?:
			bch2_scan_old_btree_nodes(c, &stats);
		if (ret)
			goto err;
		bch_info(c, "scanning for old btree nodes done");
	}

	ret = 0;
out:
	bch2_flush_fsck_errs(c);

	if (!IS_ERR(clean))
		kfree(clean);

	if (!ret &&
	    test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) &&
	    !c->opts.nochanges) {
		bch2_fs_read_write_early(c);
		bch2_delete_dead_snapshots_async(c);
	}

	bch_err_fn(c, ret);
	return ret;
err:
fsck_err:
	{
		struct printbuf buf = PRINTBUF;
		bch2_log_msg_start(c, &buf);

		prt_printf(&buf, "error in recovery: %s", bch2_err_str(ret));
		bch2_fs_emergency_read_only2(c, &buf);

		bch2_print_str(c, KERN_ERR, buf.buf);
		printbuf_exit(&buf);
	}
	return ret;
}

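/*
 * Initialize a new filesystem on first mount: allocate the journal, mark
 * superblocks, initialize freespace and subvolumes/snapshots, and create
 * the root directory and lost+found before writing the first journal
 * entry and marking the superblock initialized.
 */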
int bch2_fs_initialize(struct bch_fs *c)
{
	struct bch_inode_unpacked root_inode, lostfound_inode;
	struct bkey_inode_buf packed_inode;
	struct qstr lostfound = QSTR("lost+found");
	struct bch_member *m;
	int ret;

	bch_notice(c, "initializing new filesystem");
	set_bit(BCH_FS_new_fs, &c->flags);

	mutex_lock(&c->sb_lock);
	c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
	c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);

	bch2_check_version_downgrade(c);

	if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) {
		bch2_sb_upgrade(c, bcachefs_metadata_version_current, false);
		SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
		bch2_write_super(c);
	}

	for_each_member_device(c, ca) {
		m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
		SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, false);
		ca->mi = bch2_mi_to_cpu(m);
	}

	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	set_bit(BCH_FS_btree_running, &c->flags);
	set_bit(BCH_FS_may_go_rw, &c->flags);

	for (unsigned i = 0; i < BTREE_ID_NR; i++)
		bch2_btree_root_alloc_fake(c, i, 0);

	ret = bch2_fs_journal_alloc(c);
	if (ret)
		goto err;

	/*
	 * journal_res_get() will crash if called before this has
	 * set up the journal.pin FIFO and journal.cur pointer:
	 */
	ret = bch2_fs_journal_start(&c->journal, 1);
	if (ret)
		goto err;

	ret = bch2_fs_read_write_early(c);
	if (ret)
		goto err;

	set_bit(BCH_FS_accounting_replay_done, &c->flags);
	bch2_journal_set_replay_done(&c->journal);

	for_each_member_device(c, ca) {
		ret = bch2_dev_usage_init(ca, false);
		if (ret) {
			bch2_dev_put(ca);
			goto err;
		}
	}

	/*
	 * Write out the superblock and journal buckets, now that we can do
	 * btree updates
	 */
	bch_verbose(c, "marking superblocks");
	ret = bch2_trans_mark_dev_sbs(c);
	bch_err_msg(c, ret, "marking superblocks");
	if (ret)
		goto err;

	ret = bch2_fs_freespace_init(c);
	if (ret)
		goto err;

	ret = bch2_initialize_subvolumes(c);
	if (ret)
		goto err;

	bch_verbose(c, "reading snapshots table");
	ret = bch2_snapshots_read(c);
	if (ret)
		goto err;
	bch_verbose(c, "reading snapshots done");

	bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 0, NULL);
	root_inode.bi_inum = BCACHEFS_ROOT_INO;
	root_inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
	bch2_inode_pack(&packed_inode, &root_inode);
	packed_inode.inode.k.p.snapshot = U32_MAX;

	ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0, 0);
	bch_err_msg(c, ret, "creating root directory");
	if (ret)
		goto err;

	bch2_inode_init_early(c, &lostfound_inode);

	ret = bch2_trans_commit_do(c, NULL, NULL, 0,
		bch2_create_trans(trans,
				  BCACHEFS_ROOT_SUBVOL_INUM,
				  &root_inode, &lostfound_inode,
				  &lostfound,
				  0, 0, S_IFDIR|0700, 0,
				  NULL, NULL, (subvol_inum) { 0 }, 0));
	bch_err_msg(c, ret, "creating lost+found");
	if (ret)
		goto err;

	c->recovery.pass_done = BCH_RECOVERY_PASS_NR - 1;

	bch2_copygc_wakeup(c);
	bch2_rebalance_wakeup(c);

	if (enabled_qtypes(c)) {
		ret = bch2_fs_quota_read(c);
		if (ret)
			goto err;
	}

	ret = bch2_journal_flush(&c->journal);
	bch_err_msg(c, ret, "writing first journal entry");
	if (ret)
		goto err;

	mutex_lock(&c->sb_lock);
	SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
	SET_BCH_SB_CLEAN(c->disk_sb.sb, false);

	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	c->recovery.curr_pass = BCH_RECOVERY_PASS_NR;
	return 0;
err:
	bch_err_fn(c, ret);
	return ret;
}