xref: /linux/fs/bcachefs/recovery.c (revision 71dfa617ea9f18e4585fe78364217cd32b1fc382)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "bcachefs.h"
4 #include "alloc_background.h"
5 #include "bkey_buf.h"
6 #include "btree_journal_iter.h"
7 #include "btree_node_scan.h"
8 #include "btree_update.h"
9 #include "btree_update_interior.h"
10 #include "btree_io.h"
11 #include "buckets.h"
12 #include "dirent.h"
13 #include "errcode.h"
14 #include "error.h"
15 #include "fs-common.h"
16 #include "journal_io.h"
17 #include "journal_reclaim.h"
18 #include "journal_seq_blacklist.h"
19 #include "logged_ops.h"
20 #include "move.h"
21 #include "quota.h"
22 #include "rebalance.h"
23 #include "recovery.h"
24 #include "recovery_passes.h"
25 #include "replicas.h"
26 #include "sb-clean.h"
27 #include "sb-downgrade.h"
28 #include "snapshot.h"
29 #include "super-io.h"
30 
31 #include <linux/sort.h>
32 #include <linux/stat.h>
33 
/*
 * Brace initializer for a struct qstr built from the C string @n: the length
 * comes from strlen(n) and the name pointer refers to @n itself (no copy).
 */
#define QSTR(n)								\
	{								\
		{ { .len = strlen(n) } },				\
		.name = n						\
	}
35 
36 void bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree)
37 {
38 	u64 b = BIT_ULL(btree);
39 
40 	if (!(c->sb.btrees_lost_data & b)) {
41 		bch_err(c, "flagging btree %s lost data", bch2_btree_id_str(btree));
42 
43 		mutex_lock(&c->sb_lock);
44 		bch2_sb_field_get(c->disk_sb.sb, ext)->btrees_lost_data |= cpu_to_le64(b);
45 		bch2_write_super(c);
46 		mutex_unlock(&c->sb_lock);
47 	}
48 }
49 
50 /* for -o reconstruct_alloc: */
51 static void bch2_reconstruct_alloc(struct bch_fs *c)
52 {
53 	bch2_journal_log_msg(c, "dropping alloc info");
54 	bch_info(c, "dropping and reconstructing all alloc info");
55 
56 	mutex_lock(&c->sb_lock);
57 	struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
58 
59 	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required);
60 	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_info, ext->recovery_passes_required);
61 	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_lrus, ext->recovery_passes_required);
62 	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_extents_to_backpointers, ext->recovery_passes_required);
63 	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_to_lru_refs, ext->recovery_passes_required);
64 
65 	__set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_alloc_key, ext->errors_silent);
66 	__set_bit_le64(BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, ext->errors_silent);
67 	__set_bit_le64(BCH_FSCK_ERR_stale_dirty_ptr, ext->errors_silent);
68 	__set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
69 	__set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
70 	__set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
71 	__set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent);
72 	__set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
73 	__set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong, ext->errors_silent);
74 	__set_bit_le64(BCH_FSCK_ERR_freespace_key_wrong, ext->errors_silent);
75 	__set_bit_le64(BCH_FSCK_ERR_bucket_gens_key_wrong, ext->errors_silent);
76 	__set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing, ext->errors_silent);
77 	__set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer, ext->errors_silent);
78 	__set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent);
79 	c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
80 
81 	bch2_write_super(c);
82 	mutex_unlock(&c->sb_lock);
83 
84 	c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
85 
86 
87 	bch2_shoot_down_journal_keys(c, BTREE_ID_alloc,
88 				     0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
89 	bch2_shoot_down_journal_keys(c, BTREE_ID_backpointers,
90 				     0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
91 	bch2_shoot_down_journal_keys(c, BTREE_ID_need_discard,
92 				     0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
93 	bch2_shoot_down_journal_keys(c, BTREE_ID_freespace,
94 				     0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
95 	bch2_shoot_down_journal_keys(c, BTREE_ID_bucket_gens,
96 				     0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
97 }
98 
99 /*
100  * Btree node pointers have a field to stack a pointer to the in memory btree
101  * node; we need to zero out this field when reading in btree nodes, or when
102  * reading in keys from the journal:
103  */
104 static void zero_out_btree_mem_ptr(struct journal_keys *keys)
105 {
106 	darray_for_each(*keys, i)
107 		if (i->k->k.type == KEY_TYPE_btree_ptr_v2)
108 			bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0;
109 }
110 
111 /* journal replay: */
112 
113 static void replay_now_at(struct journal *j, u64 seq)
114 {
115 	BUG_ON(seq < j->replay_journal_seq);
116 
117 	seq = min(seq, j->replay_journal_seq_end);
118 
119 	while (j->replay_journal_seq < seq)
120 		bch2_journal_pin_put(j, j->replay_journal_seq++);
121 }
122 
123 static int bch2_journal_replay_key(struct btree_trans *trans,
124 				   struct journal_key *k)
125 {
126 	struct btree_iter iter;
127 	unsigned iter_flags =
128 		BTREE_ITER_INTENT|
129 		BTREE_ITER_NOT_EXTENTS;
130 	unsigned update_flags = BTREE_TRIGGER_NORUN;
131 	int ret;
132 
133 	if (k->overwritten)
134 		return 0;
135 
136 	trans->journal_res.seq = k->journal_seq;
137 
138 	/*
139 	 * BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to
140 	 * keep the key cache coherent with the underlying btree. Nothing
141 	 * besides the allocator is doing updates yet so we don't need key cache
142 	 * coherency for non-alloc btrees, and key cache fills for snapshots
143 	 * btrees use BTREE_ITER_FILTER_SNAPSHOTS, which isn't available until
144 	 * the snapshots recovery pass runs.
145 	 */
146 	if (!k->level && k->btree_id == BTREE_ID_alloc)
147 		iter_flags |= BTREE_ITER_CACHED;
148 	else
149 		update_flags |= BTREE_UPDATE_KEY_CACHE_RECLAIM;
150 
151 	bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
152 				  BTREE_MAX_DEPTH, k->level,
153 				  iter_flags);
154 	ret = bch2_btree_iter_traverse(&iter);
155 	if (ret)
156 		goto out;
157 
158 	struct btree_path *path = btree_iter_path(trans, &iter);
159 	if (unlikely(!btree_path_node(path, k->level))) {
160 		bch2_trans_iter_exit(trans, &iter);
161 		bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
162 					  BTREE_MAX_DEPTH, 0, iter_flags);
163 		ret =   bch2_btree_iter_traverse(&iter) ?:
164 			bch2_btree_increase_depth(trans, iter.path, 0) ?:
165 			-BCH_ERR_transaction_restart_nested;
166 		goto out;
167 	}
168 
169 	/* Must be checked with btree locked: */
170 	if (k->overwritten)
171 		goto out;
172 
173 	ret = bch2_trans_update(trans, &iter, k->k, update_flags);
174 out:
175 	bch2_trans_iter_exit(trans, &iter);
176 	return ret;
177 }
178 
179 static int journal_sort_seq_cmp(const void *_l, const void *_r)
180 {
181 	const struct journal_key *l = *((const struct journal_key **)_l);
182 	const struct journal_key *r = *((const struct journal_key **)_r);
183 
184 	return cmp_int(l->journal_seq, r->journal_seq);
185 }
186 
187 int bch2_journal_replay(struct bch_fs *c)
188 {
189 	struct journal_keys *keys = &c->journal_keys;
190 	DARRAY(struct journal_key *) keys_sorted = { 0 };
191 	struct journal *j = &c->journal;
192 	u64 start_seq	= c->journal_replay_seq_start;
193 	u64 end_seq	= c->journal_replay_seq_start;
194 	struct btree_trans *trans = bch2_trans_get(c);
195 	bool immediate_flush = false;
196 	int ret = 0;
197 
198 	if (keys->nr) {
199 		ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)",
200 					   keys->nr, start_seq, end_seq);
201 		if (ret)
202 			goto err;
203 	}
204 
205 	BUG_ON(!atomic_read(&keys->ref));
206 
207 	move_gap(keys, keys->nr);
208 
209 	/*
210 	 * First, attempt to replay keys in sorted order. This is more
211 	 * efficient - better locality of btree access -  but some might fail if
212 	 * that would cause a journal deadlock.
213 	 */
214 	darray_for_each(*keys, k) {
215 		cond_resched();
216 
217 		/*
218 		 * k->allocated means the key wasn't read in from the journal,
219 		 * rather it was from early repair code
220 		 */
221 		if (k->allocated)
222 			immediate_flush = true;
223 
224 		/* Skip fastpath if we're low on space in the journal */
225 		ret = c->journal.watermark ? -1 :
226 			commit_do(trans, NULL, NULL,
227 				  BCH_TRANS_COMMIT_no_enospc|
228 				  BCH_TRANS_COMMIT_journal_reclaim|
229 				  (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0),
230 			     bch2_journal_replay_key(trans, k));
231 		BUG_ON(!ret && !k->overwritten);
232 		if (ret) {
233 			ret = darray_push(&keys_sorted, k);
234 			if (ret)
235 				goto err;
236 		}
237 	}
238 
239 	/*
240 	 * Now, replay any remaining keys in the order in which they appear in
241 	 * the journal, unpinning those journal entries as we go:
242 	 */
243 	sort(keys_sorted.data, keys_sorted.nr,
244 	     sizeof(keys_sorted.data[0]),
245 	     journal_sort_seq_cmp, NULL);
246 
247 	darray_for_each(keys_sorted, kp) {
248 		cond_resched();
249 
250 		struct journal_key *k = *kp;
251 
252 		replay_now_at(j, k->journal_seq);
253 
254 		ret = commit_do(trans, NULL, NULL,
255 				BCH_TRANS_COMMIT_no_enospc|
256 				(!k->allocated
257 				 ? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim
258 				 : 0),
259 			     bch2_journal_replay_key(trans, k));
260 		bch_err_msg(c, ret, "while replaying key at btree %s level %u:",
261 			    bch2_btree_id_str(k->btree_id), k->level);
262 		if (ret)
263 			goto err;
264 
265 		BUG_ON(!k->overwritten);
266 	}
267 
268 	/*
269 	 * We need to put our btree_trans before calling flush_all_pins(), since
270 	 * that will use a btree_trans internally
271 	 */
272 	bch2_trans_put(trans);
273 	trans = NULL;
274 
275 	if (!c->opts.retain_recovery_info &&
276 	    c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay)
277 		bch2_journal_keys_put_initial(c);
278 
279 	replay_now_at(j, j->replay_journal_seq_end);
280 	j->replay_journal_seq = 0;
281 
282 	bch2_journal_set_replay_done(j);
283 
284 	/* if we did any repair, flush it immediately */
285 	if (immediate_flush) {
286 		bch2_journal_flush_all_pins(&c->journal);
287 		ret = bch2_journal_meta(&c->journal);
288 	}
289 
290 	if (keys->nr)
291 		bch2_journal_log_msg(c, "journal replay finished");
292 err:
293 	if (trans)
294 		bch2_trans_put(trans);
295 	darray_exit(&keys_sorted);
296 	bch_err_fn(c, ret);
297 	return ret;
298 }
299 
300 /* journal replay early: */
301 
302 static int journal_replay_entry_early(struct bch_fs *c,
303 				      struct jset_entry *entry)
304 {
305 	int ret = 0;
306 
307 	switch (entry->type) {
308 	case BCH_JSET_ENTRY_btree_root: {
309 		struct btree_root *r;
310 
311 		while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) {
312 			ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL });
313 			if (ret)
314 				return ret;
315 		}
316 
317 		r = bch2_btree_id_root(c, entry->btree_id);
318 
319 		if (entry->u64s) {
320 			r->level = entry->level;
321 			bkey_copy(&r->key, (struct bkey_i *) entry->start);
322 			r->error = 0;
323 		} else {
324 			r->error = -BCH_ERR_btree_node_read_error;
325 		}
326 		r->alive = true;
327 		break;
328 	}
329 	case BCH_JSET_ENTRY_usage: {
330 		struct jset_entry_usage *u =
331 			container_of(entry, struct jset_entry_usage, entry);
332 
333 		switch (entry->btree_id) {
334 		case BCH_FS_USAGE_reserved:
335 			if (entry->level < BCH_REPLICAS_MAX)
336 				c->usage_base->persistent_reserved[entry->level] =
337 					le64_to_cpu(u->v);
338 			break;
339 		case BCH_FS_USAGE_inodes:
340 			c->usage_base->b.nr_inodes = le64_to_cpu(u->v);
341 			break;
342 		case BCH_FS_USAGE_key_version:
343 			atomic64_set(&c->key_version,
344 				     le64_to_cpu(u->v));
345 			break;
346 		}
347 
348 		break;
349 	}
350 	case BCH_JSET_ENTRY_data_usage: {
351 		struct jset_entry_data_usage *u =
352 			container_of(entry, struct jset_entry_data_usage, entry);
353 
354 		ret = bch2_replicas_set_usage(c, &u->r,
355 					      le64_to_cpu(u->v));
356 		break;
357 	}
358 	case BCH_JSET_ENTRY_dev_usage: {
359 		struct jset_entry_dev_usage *u =
360 			container_of(entry, struct jset_entry_dev_usage, entry);
361 		struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev));
362 		unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
363 
364 		for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) {
365 			ca->usage_base->d[i].buckets	= le64_to_cpu(u->d[i].buckets);
366 			ca->usage_base->d[i].sectors	= le64_to_cpu(u->d[i].sectors);
367 			ca->usage_base->d[i].fragmented	= le64_to_cpu(u->d[i].fragmented);
368 		}
369 
370 		break;
371 	}
372 	case BCH_JSET_ENTRY_blacklist: {
373 		struct jset_entry_blacklist *bl_entry =
374 			container_of(entry, struct jset_entry_blacklist, entry);
375 
376 		ret = bch2_journal_seq_blacklist_add(c,
377 				le64_to_cpu(bl_entry->seq),
378 				le64_to_cpu(bl_entry->seq) + 1);
379 		break;
380 	}
381 	case BCH_JSET_ENTRY_blacklist_v2: {
382 		struct jset_entry_blacklist_v2 *bl_entry =
383 			container_of(entry, struct jset_entry_blacklist_v2, entry);
384 
385 		ret = bch2_journal_seq_blacklist_add(c,
386 				le64_to_cpu(bl_entry->start),
387 				le64_to_cpu(bl_entry->end) + 1);
388 		break;
389 	}
390 	case BCH_JSET_ENTRY_clock: {
391 		struct jset_entry_clock *clock =
392 			container_of(entry, struct jset_entry_clock, entry);
393 
394 		atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time));
395 	}
396 	}
397 
398 	return ret;
399 }
400 
401 static int journal_replay_early(struct bch_fs *c,
402 				struct bch_sb_field_clean *clean)
403 {
404 	if (clean) {
405 		for (struct jset_entry *entry = clean->start;
406 		     entry != vstruct_end(&clean->field);
407 		     entry = vstruct_next(entry)) {
408 			int ret = journal_replay_entry_early(c, entry);
409 			if (ret)
410 				return ret;
411 		}
412 	} else {
413 		struct genradix_iter iter;
414 		struct journal_replay *i, **_i;
415 
416 		genradix_for_each(&c->journal_entries, iter, _i) {
417 			i = *_i;
418 
419 			if (journal_replay_ignore(i))
420 				continue;
421 
422 			vstruct_for_each(&i->j, entry) {
423 				int ret = journal_replay_entry_early(c, entry);
424 				if (ret)
425 					return ret;
426 			}
427 		}
428 	}
429 
430 	bch2_fs_usage_initialize(c);
431 
432 	return 0;
433 }
434 
435 /* sb clean section: */
436 
437 static int read_btree_roots(struct bch_fs *c)
438 {
439 	int ret = 0;
440 
441 	for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
442 		struct btree_root *r = bch2_btree_id_root(c, i);
443 
444 		if (!r->alive)
445 			continue;
446 
447 		if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc)
448 			continue;
449 
450 		if (mustfix_fsck_err_on((ret = r->error),
451 					c, btree_root_bkey_invalid,
452 					"invalid btree root %s",
453 					bch2_btree_id_str(i)) ||
454 		    mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)),
455 					c, btree_root_read_error,
456 					"error reading btree root %s l=%u: %s",
457 					bch2_btree_id_str(i), r->level, bch2_err_str(ret))) {
458 			if (btree_id_is_alloc(i)) {
459 				c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations);
460 				c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info);
461 				c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus);
462 				c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers);
463 				c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs);
464 				c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
465 				r->error = 0;
466 			} else if (!(c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) {
467 				bch_info(c, "will run btree node scan");
468 				c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes);
469 				c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
470 			}
471 
472 			ret = 0;
473 			bch2_btree_lost_data(c, i);
474 		}
475 	}
476 
477 	for (unsigned i = 0; i < BTREE_ID_NR; i++) {
478 		struct btree_root *r = bch2_btree_id_root(c, i);
479 
480 		if (!r->b && !r->error) {
481 			r->alive = false;
482 			r->level = 0;
483 			bch2_btree_root_alloc_fake(c, i, 0);
484 		}
485 	}
486 fsck_err:
487 	return ret;
488 }
489 
490 static bool check_version_upgrade(struct bch_fs *c)
491 {
492 	unsigned latest_version	= bcachefs_metadata_version_current;
493 	unsigned latest_compatible = min(latest_version,
494 					 bch2_latest_compatible_version(c->sb.version));
495 	unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
496 	unsigned new_version = 0;
497 
498 	if (old_version < bcachefs_metadata_required_upgrade_below) {
499 		if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible ||
500 		    latest_compatible < bcachefs_metadata_required_upgrade_below)
501 			new_version = latest_version;
502 		else
503 			new_version = latest_compatible;
504 	} else {
505 		switch (c->opts.version_upgrade) {
506 		case BCH_VERSION_UPGRADE_compatible:
507 			new_version = latest_compatible;
508 			break;
509 		case BCH_VERSION_UPGRADE_incompatible:
510 			new_version = latest_version;
511 			break;
512 		case BCH_VERSION_UPGRADE_none:
513 			new_version = min(old_version, latest_version);
514 			break;
515 		}
516 	}
517 
518 	if (new_version > old_version) {
519 		struct printbuf buf = PRINTBUF;
520 
521 		if (old_version < bcachefs_metadata_required_upgrade_below)
522 			prt_str(&buf, "Version upgrade required:\n");
523 
524 		if (old_version != c->sb.version) {
525 			prt_str(&buf, "Version upgrade from ");
526 			bch2_version_to_text(&buf, c->sb.version_upgrade_complete);
527 			prt_str(&buf, " to ");
528 			bch2_version_to_text(&buf, c->sb.version);
529 			prt_str(&buf, " incomplete\n");
530 		}
531 
532 		prt_printf(&buf, "Doing %s version upgrade from ",
533 			   BCH_VERSION_MAJOR(old_version) != BCH_VERSION_MAJOR(new_version)
534 			   ? "incompatible" : "compatible");
535 		bch2_version_to_text(&buf, old_version);
536 		prt_str(&buf, " to ");
537 		bch2_version_to_text(&buf, new_version);
538 		prt_newline(&buf);
539 
540 		struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
541 		__le64 passes = ext->recovery_passes_required[0];
542 		bch2_sb_set_upgrade(c, old_version, new_version);
543 		passes = ext->recovery_passes_required[0] & ~passes;
544 
545 		if (passes) {
546 			prt_str(&buf, "  running recovery passes: ");
547 			prt_bitflags(&buf, bch2_recovery_passes,
548 				     bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
549 		}
550 
551 		bch_info(c, "%s", buf.buf);
552 
553 		bch2_sb_upgrade(c, new_version);
554 
555 		printbuf_exit(&buf);
556 		return true;
557 	}
558 
559 	return false;
560 }
561 
562 int bch2_fs_recovery(struct bch_fs *c)
563 {
564 	struct bch_sb_field_clean *clean = NULL;
565 	struct jset *last_journal_entry = NULL;
566 	u64 last_seq = 0, blacklist_seq, journal_seq;
567 	int ret = 0;
568 
569 	if (c->sb.clean) {
570 		clean = bch2_read_superblock_clean(c);
571 		ret = PTR_ERR_OR_ZERO(clean);
572 		if (ret)
573 			goto err;
574 
575 		bch_info(c, "recovering from clean shutdown, journal seq %llu",
576 			 le64_to_cpu(clean->journal_seq));
577 	} else {
578 		bch_info(c, "recovering from unclean shutdown");
579 	}
580 
581 	if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) {
582 		bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported");
583 		ret = -EINVAL;
584 		goto err;
585 	}
586 
587 	if (!c->sb.clean &&
588 	    !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) {
589 		bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix");
590 		ret = -EINVAL;
591 		goto err;
592 	}
593 
594 	if (c->opts.norecovery)
595 		c->opts.recovery_pass_last = BCH_RECOVERY_PASS_journal_replay - 1;
596 
597 	if (!c->opts.nochanges) {
598 		mutex_lock(&c->sb_lock);
599 		struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
600 		bool write_sb = false;
601 
602 		if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) {
603 			ext->recovery_passes_required[0] |=
604 				cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology)));
605 			write_sb = true;
606 		}
607 
608 		u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
609 		if (sb_passes) {
610 			struct printbuf buf = PRINTBUF;
611 			prt_str(&buf, "superblock requires following recovery passes to be run:\n  ");
612 			prt_bitflags(&buf, bch2_recovery_passes, sb_passes);
613 			bch_info(c, "%s", buf.buf);
614 			printbuf_exit(&buf);
615 		}
616 
617 		if (bch2_check_version_downgrade(c)) {
618 			struct printbuf buf = PRINTBUF;
619 
620 			prt_str(&buf, "Version downgrade required:");
621 
622 			__le64 passes = ext->recovery_passes_required[0];
623 			bch2_sb_set_downgrade(c,
624 					BCH_VERSION_MINOR(bcachefs_metadata_version_current),
625 					BCH_VERSION_MINOR(c->sb.version));
626 			passes = ext->recovery_passes_required[0] & ~passes;
627 			if (passes) {
628 				prt_str(&buf, "\n  running recovery passes: ");
629 				prt_bitflags(&buf, bch2_recovery_passes,
630 					     bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
631 			}
632 
633 			bch_info(c, "%s", buf.buf);
634 			printbuf_exit(&buf);
635 			write_sb = true;
636 		}
637 
638 		if (check_version_upgrade(c))
639 			write_sb = true;
640 
641 		if (write_sb)
642 			bch2_write_super(c);
643 
644 		c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
645 		mutex_unlock(&c->sb_lock);
646 	}
647 
648 	if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
649 		c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
650 
651 	if (c->opts.fsck)
652 		set_bit(BCH_FS_fsck_running, &c->flags);
653 
654 	ret = bch2_blacklist_table_initialize(c);
655 	if (ret) {
656 		bch_err(c, "error initializing blacklist table");
657 		goto err;
658 	}
659 
660 	if (!c->sb.clean || c->opts.fsck || c->opts.retain_recovery_info) {
661 		struct genradix_iter iter;
662 		struct journal_replay **i;
663 
664 		bch_verbose(c, "starting journal read");
665 		ret = bch2_journal_read(c, &last_seq, &blacklist_seq, &journal_seq);
666 		if (ret)
667 			goto err;
668 
669 		/*
670 		 * note: cmd_list_journal needs the blacklist table fully up to date so
671 		 * it can asterisk ignored journal entries:
672 		 */
673 		if (c->opts.read_journal_only)
674 			goto out;
675 
676 		genradix_for_each_reverse(&c->journal_entries, iter, i)
677 			if (!journal_replay_ignore(*i)) {
678 				last_journal_entry = &(*i)->j;
679 				break;
680 			}
681 
682 		if (mustfix_fsck_err_on(c->sb.clean &&
683 					last_journal_entry &&
684 					!journal_entry_empty(last_journal_entry), c,
685 				clean_but_journal_not_empty,
686 				"filesystem marked clean but journal not empty")) {
687 			c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
688 			SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
689 			c->sb.clean = false;
690 		}
691 
692 		if (!last_journal_entry) {
693 			fsck_err_on(!c->sb.clean, c,
694 				    dirty_but_no_journal_entries,
695 				    "no journal entries found");
696 			if (clean)
697 				goto use_clean;
698 
699 			genradix_for_each_reverse(&c->journal_entries, iter, i)
700 				if (*i) {
701 					last_journal_entry = &(*i)->j;
702 					(*i)->ignore_blacklisted = false;
703 					(*i)->ignore_not_dirty= false;
704 					/*
705 					 * This was probably a NO_FLUSH entry,
706 					 * so last_seq was garbage - but we know
707 					 * we're only using a single journal
708 					 * entry, set it here:
709 					 */
710 					(*i)->j.last_seq = (*i)->j.seq;
711 					break;
712 				}
713 		}
714 
715 		ret = bch2_journal_keys_sort(c);
716 		if (ret)
717 			goto err;
718 
719 		if (c->sb.clean && last_journal_entry) {
720 			ret = bch2_verify_superblock_clean(c, &clean,
721 						      last_journal_entry);
722 			if (ret)
723 				goto err;
724 		}
725 	} else {
726 use_clean:
727 		if (!clean) {
728 			bch_err(c, "no superblock clean section found");
729 			ret = -BCH_ERR_fsck_repair_impossible;
730 			goto err;
731 
732 		}
733 		blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1;
734 	}
735 
736 	c->journal_replay_seq_start	= last_seq;
737 	c->journal_replay_seq_end	= blacklist_seq - 1;
738 
739 	if (c->opts.reconstruct_alloc)
740 		bch2_reconstruct_alloc(c);
741 
742 	zero_out_btree_mem_ptr(&c->journal_keys);
743 
744 	ret = journal_replay_early(c, clean);
745 	if (ret)
746 		goto err;
747 
748 	/*
749 	 * After an unclean shutdown, skip then next few journal sequence
750 	 * numbers as they may have been referenced by btree writes that
751 	 * happened before their corresponding journal writes - those btree
752 	 * writes need to be ignored, by skipping and blacklisting the next few
753 	 * journal sequence numbers:
754 	 */
755 	if (!c->sb.clean)
756 		journal_seq += 8;
757 
758 	if (blacklist_seq != journal_seq) {
759 		ret =   bch2_journal_log_msg(c, "blacklisting entries %llu-%llu",
760 					     blacklist_seq, journal_seq) ?:
761 			bch2_journal_seq_blacklist_add(c,
762 					blacklist_seq, journal_seq);
763 		if (ret) {
764 			bch_err_msg(c, ret, "error creating new journal seq blacklist entry");
765 			goto err;
766 		}
767 	}
768 
769 	ret =   bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu",
770 				     journal_seq, last_seq, blacklist_seq - 1) ?:
771 		bch2_fs_journal_start(&c->journal, journal_seq);
772 	if (ret)
773 		goto err;
774 
775 	/*
776 	 * Skip past versions that might have possibly been used (as nonces),
777 	 * but hadn't had their pointers written:
778 	 */
779 	if (c->sb.encryption_type && !c->sb.clean)
780 		atomic64_add(1 << 16, &c->key_version);
781 
782 	ret = read_btree_roots(c);
783 	if (ret)
784 		goto err;
785 
786 	ret = bch2_run_recovery_passes(c);
787 	if (ret)
788 		goto err;
789 
790 	clear_bit(BCH_FS_fsck_running, &c->flags);
791 
792 	/* fsync if we fixed errors */
793 	if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
794 		bch2_journal_flush_all_pins(&c->journal);
795 		bch2_journal_meta(&c->journal);
796 	}
797 
798 	/* If we fixed errors, verify that fs is actually clean now: */
799 	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
800 	    test_bit(BCH_FS_errors_fixed, &c->flags) &&
801 	    !test_bit(BCH_FS_errors_not_fixed, &c->flags) &&
802 	    !test_bit(BCH_FS_error, &c->flags)) {
803 		bch2_flush_fsck_errs(c);
804 
805 		bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean");
806 		clear_bit(BCH_FS_errors_fixed, &c->flags);
807 
808 		c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
809 
810 		ret = bch2_run_recovery_passes(c);
811 		if (ret)
812 			goto err;
813 
814 		if (test_bit(BCH_FS_errors_fixed, &c->flags) ||
815 		    test_bit(BCH_FS_errors_not_fixed, &c->flags)) {
816 			bch_err(c, "Second fsck run was not clean");
817 			set_bit(BCH_FS_errors_not_fixed, &c->flags);
818 		}
819 
820 		set_bit(BCH_FS_errors_fixed, &c->flags);
821 	}
822 
823 	if (enabled_qtypes(c)) {
824 		bch_verbose(c, "reading quotas");
825 		ret = bch2_fs_quota_read(c);
826 		if (ret)
827 			goto err;
828 		bch_verbose(c, "quotas done");
829 	}
830 
831 	mutex_lock(&c->sb_lock);
832 	struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
833 	bool write_sb = false;
834 
835 	if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) {
836 		SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, le16_to_cpu(c->disk_sb.sb->version));
837 		write_sb = true;
838 	}
839 
840 	if (!test_bit(BCH_FS_error, &c->flags) &&
841 	    !(c->disk_sb.sb->compat[0] & cpu_to_le64(1ULL << BCH_COMPAT_alloc_info))) {
842 		c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
843 		write_sb = true;
844 	}
845 
846 	if (!test_bit(BCH_FS_error, &c->flags) &&
847 	    !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent))) {
848 		memset(ext->errors_silent, 0, sizeof(ext->errors_silent));
849 		write_sb = true;
850 	}
851 
852 	if (c->opts.fsck &&
853 	    !test_bit(BCH_FS_error, &c->flags) &&
854 	    c->recovery_pass_done == BCH_RECOVERY_PASS_NR - 1 &&
855 	    ext->btrees_lost_data) {
856 		ext->btrees_lost_data = 0;
857 		write_sb = true;
858 	}
859 
860 	if (c->opts.fsck &&
861 	    !test_bit(BCH_FS_error, &c->flags) &&
862 	    !test_bit(BCH_FS_errors_not_fixed, &c->flags)) {
863 		SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0);
864 		SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0);
865 		write_sb = true;
866 	}
867 
868 	if (write_sb)
869 		bch2_write_super(c);
870 	mutex_unlock(&c->sb_lock);
871 
872 	if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
873 	    c->sb.version_min < bcachefs_metadata_version_btree_ptr_sectors_written) {
874 		struct bch_move_stats stats;
875 
876 		bch2_move_stats_init(&stats, "recovery");
877 
878 		struct printbuf buf = PRINTBUF;
879 		bch2_version_to_text(&buf, c->sb.version_min);
880 		bch_info(c, "scanning for old btree nodes: min_version %s", buf.buf);
881 		printbuf_exit(&buf);
882 
883 		ret =   bch2_fs_read_write_early(c) ?:
884 			bch2_scan_old_btree_nodes(c, &stats);
885 		if (ret)
886 			goto err;
887 		bch_info(c, "scanning for old btree nodes done");
888 	}
889 
890 	if (c->journal_seq_blacklist_table &&
891 	    c->journal_seq_blacklist_table->nr > 128)
892 		queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work);
893 
894 	ret = 0;
895 out:
896 	bch2_flush_fsck_errs(c);
897 
898 	if (!c->opts.retain_recovery_info) {
899 		bch2_journal_keys_put_initial(c);
900 		bch2_find_btree_nodes_exit(&c->found_btree_nodes);
901 	}
902 	kfree(clean);
903 
904 	if (!ret &&
905 	    test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) &&
906 	    !c->opts.nochanges) {
907 		bch2_fs_read_write_early(c);
908 		bch2_delete_dead_snapshots_async(c);
909 	}
910 
911 	bch_err_fn(c, ret);
912 	return ret;
913 err:
914 fsck_err:
915 	bch2_fs_emergency_read_only(c);
916 	goto out;
917 }
918 
919 int bch2_fs_initialize(struct bch_fs *c)
920 {
921 	struct bch_inode_unpacked root_inode, lostfound_inode;
922 	struct bkey_inode_buf packed_inode;
923 	struct qstr lostfound = QSTR("lost+found");
924 	int ret;
925 
926 	bch_notice(c, "initializing new filesystem");
927 	set_bit(BCH_FS_new_fs, &c->flags);
928 
929 	mutex_lock(&c->sb_lock);
930 	c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
931 	c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
932 
933 	bch2_check_version_downgrade(c);
934 
935 	if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) {
936 		bch2_sb_upgrade(c, bcachefs_metadata_version_current);
937 		SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
938 		bch2_write_super(c);
939 	}
940 	mutex_unlock(&c->sb_lock);
941 
942 	c->curr_recovery_pass = BCH_RECOVERY_PASS_NR;
943 	set_bit(BCH_FS_may_go_rw, &c->flags);
944 
945 	for (unsigned i = 0; i < BTREE_ID_NR; i++)
946 		bch2_btree_root_alloc_fake(c, i, 0);
947 
948 	for_each_member_device(c, ca)
949 		bch2_dev_usage_init(ca);
950 
951 	ret = bch2_fs_journal_alloc(c);
952 	if (ret)
953 		goto err;
954 
955 	/*
956 	 * journal_res_get() will crash if called before this has
957 	 * set up the journal.pin FIFO and journal.cur pointer:
958 	 */
959 	bch2_fs_journal_start(&c->journal, 1);
960 	bch2_journal_set_replay_done(&c->journal);
961 
962 	ret = bch2_fs_read_write_early(c);
963 	if (ret)
964 		goto err;
965 
966 	/*
967 	 * Write out the superblock and journal buckets, now that we can do
968 	 * btree updates
969 	 */
970 	bch_verbose(c, "marking superblocks");
971 	ret = bch2_trans_mark_dev_sbs(c);
972 	bch_err_msg(c, ret, "marking superblocks");
973 	if (ret)
974 		goto err;
975 
976 	for_each_online_member(c, ca)
977 		ca->new_fs_bucket_idx = 0;
978 
979 	ret = bch2_fs_freespace_init(c);
980 	if (ret)
981 		goto err;
982 
983 	ret = bch2_initialize_subvolumes(c);
984 	if (ret)
985 		goto err;
986 
987 	bch_verbose(c, "reading snapshots table");
988 	ret = bch2_snapshots_read(c);
989 	if (ret)
990 		goto err;
991 	bch_verbose(c, "reading snapshots done");
992 
993 	bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 0, NULL);
994 	root_inode.bi_inum	= BCACHEFS_ROOT_INO;
995 	root_inode.bi_subvol	= BCACHEFS_ROOT_SUBVOL;
996 	bch2_inode_pack(&packed_inode, &root_inode);
997 	packed_inode.inode.k.p.snapshot = U32_MAX;
998 
999 	ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0);
1000 	bch_err_msg(c, ret, "creating root directory");
1001 	if (ret)
1002 		goto err;
1003 
1004 	bch2_inode_init_early(c, &lostfound_inode);
1005 
1006 	ret = bch2_trans_do(c, NULL, NULL, 0,
1007 		bch2_create_trans(trans,
1008 				  BCACHEFS_ROOT_SUBVOL_INUM,
1009 				  &root_inode, &lostfound_inode,
1010 				  &lostfound,
1011 				  0, 0, S_IFDIR|0700, 0,
1012 				  NULL, NULL, (subvol_inum) { 0 }, 0));
1013 	bch_err_msg(c, ret, "creating lost+found");
1014 	if (ret)
1015 		goto err;
1016 
1017 	c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1;
1018 
1019 	if (enabled_qtypes(c)) {
1020 		ret = bch2_fs_quota_read(c);
1021 		if (ret)
1022 			goto err;
1023 	}
1024 
1025 	ret = bch2_journal_flush(&c->journal);
1026 	bch_err_msg(c, ret, "writing first journal entry");
1027 	if (ret)
1028 		goto err;
1029 
1030 	mutex_lock(&c->sb_lock);
1031 	SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
1032 	SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
1033 
1034 	bch2_write_super(c);
1035 	mutex_unlock(&c->sb_lock);
1036 
1037 	return 0;
1038 err:
1039 	bch_err_fn(c, ret);
1040 	return ret;
1041 }
1042