xref: /linux/fs/bcachefs/snapshot.c (revision fa8a4d3659d0c1ad73d5f59b2e0a6d408de5b317)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_key_cache.h"
6 #include "btree_update.h"
7 #include "buckets.h"
8 #include "errcode.h"
9 #include "error.h"
10 #include "fs.h"
11 #include "recovery_passes.h"
12 #include "snapshot.h"
13 
14 #include <linux/random.h>
15 
16 /*
17  * Snapshot trees:
18  *
19  * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they
20  * exist to provide a stable identifier for the whole lifetime of a snapshot
21  * tree.
22  */
23 
24 void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
25 				struct bkey_s_c k)
26 {
27 	struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k);
28 
29 	prt_printf(out, "subvol %u root snapshot %u",
30 		   le32_to_cpu(t.v->master_subvol),
31 		   le32_to_cpu(t.v->root_snapshot));
32 }
33 
34 int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k,
35 			       enum bch_validate_flags flags)
36 {
37 	int ret = 0;
38 
39 	bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
40 			 bkey_lt(k.k->p, POS(0, 1)),
41 			 c, snapshot_tree_pos_bad,
42 			 "bad pos");
43 fsck_err:
44 	return ret;
45 }
46 
47 int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
48 			      struct bch_snapshot_tree *s)
49 {
50 	int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
51 					  BTREE_ITER_with_updates, snapshot_tree, s);
52 
53 	if (bch2_err_matches(ret, ENOENT))
54 		ret = -BCH_ERR_ENOENT_snapshot_tree;
55 	return ret;
56 }
57 
58 struct bkey_i_snapshot_tree *
59 __bch2_snapshot_tree_create(struct btree_trans *trans)
60 {
61 	struct btree_iter iter;
62 	int ret = bch2_bkey_get_empty_slot(trans, &iter,
63 			BTREE_ID_snapshot_trees, POS(0, U32_MAX));
64 	struct bkey_i_snapshot_tree *s_t;
65 
66 	if (ret == -BCH_ERR_ENOSPC_btree_slot)
67 		ret = -BCH_ERR_ENOSPC_snapshot_tree;
68 	if (ret)
69 		return ERR_PTR(ret);
70 
71 	s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree);
72 	ret = PTR_ERR_OR_ZERO(s_t);
73 	bch2_trans_iter_exit(trans, &iter);
74 	return ret ? ERR_PTR(ret) : s_t;
75 }
76 
77 static int bch2_snapshot_tree_create(struct btree_trans *trans,
78 				u32 root_id, u32 subvol_id, u32 *tree_id)
79 {
80 	struct bkey_i_snapshot_tree *n_tree =
81 		__bch2_snapshot_tree_create(trans);
82 
83 	if (IS_ERR(n_tree))
84 		return PTR_ERR(n_tree);
85 
86 	n_tree->v.master_subvol	= cpu_to_le32(subvol_id);
87 	n_tree->v.root_snapshot	= cpu_to_le32(root_id);
88 	*tree_id = n_tree->k.p.offset;
89 	return 0;
90 }
91 
92 /* Snapshot nodes: */
93 
94 static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor)
95 {
96 	while (id && id < ancestor) {
97 		const struct snapshot_t *s = __snapshot_t(t, id);
98 		id = s ? s->parent : 0;
99 	}
100 	return id == ancestor;
101 }
102 
103 static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
104 {
105 	rcu_read_lock();
106 	bool ret = __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
107 	rcu_read_unlock();
108 
109 	return ret;
110 }
111 
112 static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
113 {
114 	const struct snapshot_t *s = __snapshot_t(t, id);
115 	if (!s)
116 		return 0;
117 
118 	if (s->skip[2] <= ancestor)
119 		return s->skip[2];
120 	if (s->skip[1] <= ancestor)
121 		return s->skip[1];
122 	if (s->skip[0] <= ancestor)
123 		return s->skip[0];
124 	return s->parent;
125 }
126 
127 static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor)
128 {
129 	const struct snapshot_t *s = __snapshot_t(t, id);
130 	if (!s)
131 		return false;
132 
133 	return test_bit(ancestor - id - 1, s->is_ancestor);
134 }
135 
136 bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
137 {
138 	bool ret;
139 
140 	rcu_read_lock();
141 	struct snapshot_table *t = rcu_dereference(c->snapshots);
142 
143 	if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) {
144 		ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor);
145 		goto out;
146 	}
147 
148 	while (id && id < ancestor - IS_ANCESTOR_BITMAP)
149 		id = get_ancestor_below(t, id, ancestor);
150 
151 	ret = id && id < ancestor
152 		? test_ancestor_bitmap(t, id, ancestor)
153 		: id == ancestor;
154 
155 	EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
156 out:
157 	rcu_read_unlock();
158 
159 	return ret;
160 }
161 
162 static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
163 {
164 	size_t idx = U32_MAX - id;
165 	struct snapshot_table *new, *old;
166 
167 	size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1));
168 	size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]);
169 
170 	if (unlikely(new_bytes > INT_MAX))
171 		return NULL;
172 
173 	new = kvzalloc(new_bytes, GFP_KERNEL);
174 	if (!new)
175 		return NULL;
176 
177 	new->nr = new_size;
178 
179 	old = rcu_dereference_protected(c->snapshots, true);
180 	if (old)
181 		memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr);
182 
183 	rcu_assign_pointer(c->snapshots, new);
184 	kvfree_rcu(old, rcu);
185 
186 	return &rcu_dereference_protected(c->snapshots,
187 				lockdep_is_held(&c->snapshot_table_lock))->s[idx];
188 }
189 
190 static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
191 {
192 	size_t idx = U32_MAX - id;
193 	struct snapshot_table *table =
194 		rcu_dereference_protected(c->snapshots,
195 				lockdep_is_held(&c->snapshot_table_lock));
196 
197 	lockdep_assert_held(&c->snapshot_table_lock);
198 
199 	if (likely(table && idx < table->nr))
200 		return &table->s[idx];
201 
202 	return __snapshot_t_mut(c, id);
203 }
204 
205 void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
206 			   struct bkey_s_c k)
207 {
208 	struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
209 
210 	prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u",
211 	       BCH_SNAPSHOT_SUBVOL(s.v),
212 	       BCH_SNAPSHOT_DELETED(s.v),
213 	       le32_to_cpu(s.v->parent),
214 	       le32_to_cpu(s.v->children[0]),
215 	       le32_to_cpu(s.v->children[1]),
216 	       le32_to_cpu(s.v->subvol),
217 	       le32_to_cpu(s.v->tree));
218 
219 	if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth))
220 		prt_printf(out, " depth %u skiplist %u %u %u",
221 			   le32_to_cpu(s.v->depth),
222 			   le32_to_cpu(s.v->skip[0]),
223 			   le32_to_cpu(s.v->skip[1]),
224 			   le32_to_cpu(s.v->skip[2]));
225 }
226 
227 int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k,
228 			  enum bch_validate_flags flags)
229 {
230 	struct bkey_s_c_snapshot s;
231 	u32 i, id;
232 	int ret = 0;
233 
234 	bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
235 			 bkey_lt(k.k->p, POS(0, 1)),
236 			 c, snapshot_pos_bad,
237 			 "bad pos");
238 
239 	s = bkey_s_c_to_snapshot(k);
240 
241 	id = le32_to_cpu(s.v->parent);
242 	bkey_fsck_err_on(id && id <= k.k->p.offset,
243 			 c, snapshot_parent_bad,
244 			 "bad parent node (%u <= %llu)",
245 			 id, k.k->p.offset);
246 
247 	bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]),
248 			 c, snapshot_children_not_normalized,
249 			 "children not normalized");
250 
251 	bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1],
252 			 c, snapshot_child_duplicate,
253 			 "duplicate child nodes");
254 
255 	for (i = 0; i < 2; i++) {
256 		id = le32_to_cpu(s.v->children[i]);
257 
258 		bkey_fsck_err_on(id >= k.k->p.offset,
259 				 c, snapshot_child_bad,
260 				 "bad child node (%u >= %llu)",
261 				 id, k.k->p.offset);
262 	}
263 
264 	if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
265 		bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
266 				 le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]),
267 				 c, snapshot_skiplist_not_normalized,
268 				 "skiplist not normalized");
269 
270 		for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
271 			id = le32_to_cpu(s.v->skip[i]);
272 
273 			bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent),
274 					 c, snapshot_skiplist_bad,
275 					 "bad skiplist node %u", id);
276 		}
277 	}
278 fsck_err:
279 	return ret;
280 }
281 
282 static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
283 {
284 	struct snapshot_t *t = snapshot_t_mut(c, id);
285 	u32 parent = id;
286 
287 	while ((parent = bch2_snapshot_parent_early(c, parent)) &&
288 	       parent - id - 1 < IS_ANCESTOR_BITMAP)
289 		__set_bit(parent - id - 1, t->is_ancestor);
290 }
291 
292 static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
293 {
294 	mutex_lock(&c->snapshot_table_lock);
295 	__set_is_ancestor_bitmap(c, id);
296 	mutex_unlock(&c->snapshot_table_lock);
297 }
298 
299 static int __bch2_mark_snapshot(struct btree_trans *trans,
300 		       enum btree_id btree, unsigned level,
301 		       struct bkey_s_c old, struct bkey_s_c new,
302 		       enum btree_iter_update_trigger_flags flags)
303 {
304 	struct bch_fs *c = trans->c;
305 	struct snapshot_t *t;
306 	u32 id = new.k->p.offset;
307 	int ret = 0;
308 
309 	mutex_lock(&c->snapshot_table_lock);
310 
311 	t = snapshot_t_mut(c, id);
312 	if (!t) {
313 		ret = -BCH_ERR_ENOMEM_mark_snapshot;
314 		goto err;
315 	}
316 
317 	if (new.k->type == KEY_TYPE_snapshot) {
318 		struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
319 
320 		t->parent	= le32_to_cpu(s.v->parent);
321 		t->children[0]	= le32_to_cpu(s.v->children[0]);
322 		t->children[1]	= le32_to_cpu(s.v->children[1]);
323 		t->subvol	= BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
324 		t->tree		= le32_to_cpu(s.v->tree);
325 
326 		if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) {
327 			t->depth	= le32_to_cpu(s.v->depth);
328 			t->skip[0]	= le32_to_cpu(s.v->skip[0]);
329 			t->skip[1]	= le32_to_cpu(s.v->skip[1]);
330 			t->skip[2]	= le32_to_cpu(s.v->skip[2]);
331 		} else {
332 			t->depth	= 0;
333 			t->skip[0]	= 0;
334 			t->skip[1]	= 0;
335 			t->skip[2]	= 0;
336 		}
337 
338 		__set_is_ancestor_bitmap(c, id);
339 
340 		if (BCH_SNAPSHOT_DELETED(s.v)) {
341 			set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
342 			if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots)
343 				bch2_delete_dead_snapshots_async(c);
344 		}
345 	} else {
346 		memset(t, 0, sizeof(*t));
347 	}
348 err:
349 	mutex_unlock(&c->snapshot_table_lock);
350 	return ret;
351 }
352 
353 int bch2_mark_snapshot(struct btree_trans *trans,
354 		       enum btree_id btree, unsigned level,
355 		       struct bkey_s_c old, struct bkey_s new,
356 		       enum btree_iter_update_trigger_flags flags)
357 {
358 	return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags);
359 }
360 
361 int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
362 			 struct bch_snapshot *s)
363 {
364 	return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
365 				       BTREE_ITER_with_updates, snapshot, s);
366 }
367 
368 static int bch2_snapshot_live(struct btree_trans *trans, u32 id)
369 {
370 	struct bch_snapshot v;
371 	int ret;
372 
373 	if (!id)
374 		return 0;
375 
376 	ret = bch2_snapshot_lookup(trans, id, &v);
377 	if (bch2_err_matches(ret, ENOENT))
378 		bch_err(trans->c, "snapshot node %u not found", id);
379 	if (ret)
380 		return ret;
381 
382 	return !BCH_SNAPSHOT_DELETED(&v);
383 }
384 
385 /*
386  * If @k is a snapshot with just one live child, it's part of a linear chain,
387  * which we consider to be an equivalence class: and then after snapshot
388  * deletion cleanup, there should only be a single key at a given position in
389  * this equivalence class.
390  *
391  * This sets the equivalence class of @k to be the child's equivalence class, if
392  * it's part of such a linear chain: this correctly sets equivalence classes on
393  * startup if we run leaf to root (i.e. in natural key order).
394  */
395 static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
396 {
397 	struct bch_fs *c = trans->c;
398 	unsigned i, nr_live = 0, live_idx = 0;
399 	struct bkey_s_c_snapshot snap;
400 	u32 id = k.k->p.offset, child[2];
401 
402 	if (k.k->type != KEY_TYPE_snapshot)
403 		return 0;
404 
405 	snap = bkey_s_c_to_snapshot(k);
406 
407 	child[0] = le32_to_cpu(snap.v->children[0]);
408 	child[1] = le32_to_cpu(snap.v->children[1]);
409 
410 	for (i = 0; i < 2; i++) {
411 		int ret = bch2_snapshot_live(trans, child[i]);
412 
413 		if (ret < 0)
414 			return ret;
415 
416 		if (ret)
417 			live_idx = i;
418 		nr_live += ret;
419 	}
420 
421 	mutex_lock(&c->snapshot_table_lock);
422 
423 	snapshot_t_mut(c, id)->equiv = nr_live == 1
424 		? snapshot_t_mut(c, child[live_idx])->equiv
425 		: id;
426 
427 	mutex_unlock(&c->snapshot_table_lock);
428 
429 	return 0;
430 }
431 
432 /* fsck: */
433 
434 static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
435 {
436 	return snapshot_t(c, id)->children[child];
437 }
438 
439 static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id)
440 {
441 	return bch2_snapshot_child(c, id, 0);
442 }
443 
444 static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id)
445 {
446 	return bch2_snapshot_child(c, id, 1);
447 }
448 
449 static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
450 {
451 	u32 n, parent;
452 
453 	n = bch2_snapshot_left_child(c, id);
454 	if (n)
455 		return n;
456 
457 	while ((parent = bch2_snapshot_parent(c, id))) {
458 		n = bch2_snapshot_right_child(c, parent);
459 		if (n && n != id)
460 			return n;
461 		id = parent;
462 	}
463 
464 	return 0;
465 }
466 
467 static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
468 {
469 	u32 id = snapshot_root;
470 	u32 subvol = 0, s;
471 
472 	while (id) {
473 		s = snapshot_t(c, id)->subvol;
474 
475 		if (s && (!subvol || s < subvol))
476 			subvol = s;
477 
478 		id = bch2_snapshot_tree_next(c, id);
479 	}
480 
481 	return subvol;
482 }
483 
484 static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
485 					    u32 snapshot_root, u32 *subvol_id)
486 {
487 	struct bch_fs *c = trans->c;
488 	struct btree_iter iter;
489 	struct bkey_s_c k;
490 	bool found = false;
491 	int ret;
492 
493 	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
494 				     0, k, ret) {
495 		if (k.k->type != KEY_TYPE_subvolume)
496 			continue;
497 
498 		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
499 		if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
500 			continue;
501 		if (!BCH_SUBVOLUME_SNAP(s.v)) {
502 			*subvol_id = s.k->p.offset;
503 			found = true;
504 			break;
505 		}
506 	}
507 
508 	bch2_trans_iter_exit(trans, &iter);
509 
510 	if (!ret && !found) {
511 		struct bkey_i_subvolume *u;
512 
513 		*subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root);
514 
515 		u = bch2_bkey_get_mut_typed(trans, &iter,
516 					    BTREE_ID_subvolumes, POS(0, *subvol_id),
517 					    0, subvolume);
518 		ret = PTR_ERR_OR_ZERO(u);
519 		if (ret)
520 			return ret;
521 
522 		SET_BCH_SUBVOLUME_SNAP(&u->v, false);
523 	}
524 
525 	return ret;
526 }
527 
528 static int check_snapshot_tree(struct btree_trans *trans,
529 			       struct btree_iter *iter,
530 			       struct bkey_s_c k)
531 {
532 	struct bch_fs *c = trans->c;
533 	struct bkey_s_c_snapshot_tree st;
534 	struct bch_snapshot s;
535 	struct bch_subvolume subvol;
536 	struct printbuf buf = PRINTBUF;
537 	u32 root_id;
538 	int ret;
539 
540 	if (k.k->type != KEY_TYPE_snapshot_tree)
541 		return 0;
542 
543 	st = bkey_s_c_to_snapshot_tree(k);
544 	root_id = le32_to_cpu(st.v->root_snapshot);
545 
546 	ret = bch2_snapshot_lookup(trans, root_id, &s);
547 	if (ret && !bch2_err_matches(ret, ENOENT))
548 		goto err;
549 
550 	if (fsck_err_on(ret ||
551 			root_id != bch2_snapshot_root(c, root_id) ||
552 			st.k->p.offset != le32_to_cpu(s.tree),
553 			trans, snapshot_tree_to_missing_snapshot,
554 			"snapshot tree points to missing/incorrect snapshot:\n  %s",
555 			(bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
556 		ret = bch2_btree_delete_at(trans, iter, 0);
557 		goto err;
558 	}
559 
560 	ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol),
561 				 false, 0, &subvol);
562 	if (ret && !bch2_err_matches(ret, ENOENT))
563 		goto err;
564 
565 	if (fsck_err_on(ret,
566 			trans, snapshot_tree_to_missing_subvol,
567 			"snapshot tree points to missing subvolume:\n  %s",
568 			(printbuf_reset(&buf),
569 			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
570 	    fsck_err_on(!bch2_snapshot_is_ancestor(c,
571 						le32_to_cpu(subvol.snapshot),
572 						root_id),
573 			trans, snapshot_tree_to_wrong_subvol,
574 			"snapshot tree points to subvolume that does not point to snapshot in this tree:\n  %s",
575 			(printbuf_reset(&buf),
576 			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
577 	    fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol),
578 			trans, snapshot_tree_to_snapshot_subvol,
579 			"snapshot tree points to snapshot subvolume:\n  %s",
580 			(printbuf_reset(&buf),
581 			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
582 		struct bkey_i_snapshot_tree *u;
583 		u32 subvol_id;
584 
585 		ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
586 		bch_err_fn(c, ret);
587 
588 		if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */
589 			ret = 0;
590 			goto err;
591 		}
592 
593 		if (ret)
594 			goto err;
595 
596 		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree);
597 		ret = PTR_ERR_OR_ZERO(u);
598 		if (ret)
599 			goto err;
600 
601 		u->v.master_subvol = cpu_to_le32(subvol_id);
602 		st = snapshot_tree_i_to_s_c(u);
603 	}
604 err:
605 fsck_err:
606 	printbuf_exit(&buf);
607 	return ret;
608 }
609 
610 /*
611  * For each snapshot_tree, make sure it points to the root of a snapshot tree
612  * and that snapshot entry points back to it, or delete it.
613  *
614  * And, make sure it points to a subvolume within that snapshot tree, or correct
615  * it to point to the oldest subvolume within that snapshot tree.
616  */
617 int bch2_check_snapshot_trees(struct bch_fs *c)
618 {
619 	int ret = bch2_trans_run(c,
620 		for_each_btree_key_commit(trans, iter,
621 			BTREE_ID_snapshot_trees, POS_MIN,
622 			BTREE_ITER_prefetch, k,
623 			NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
624 		check_snapshot_tree(trans, &iter, k)));
625 	bch_err_fn(c, ret);
626 	return ret;
627 }
628 
629 /*
630  * Look up snapshot tree for @tree_id and find root,
631  * make sure @snap_id is a descendent:
632  */
633 static int snapshot_tree_ptr_good(struct btree_trans *trans,
634 				  u32 snap_id, u32 tree_id)
635 {
636 	struct bch_snapshot_tree s_t;
637 	int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
638 
639 	if (bch2_err_matches(ret, ENOENT))
640 		return 0;
641 	if (ret)
642 		return ret;
643 
644 	return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot));
645 }
646 
647 u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id)
648 {
649 	const struct snapshot_t *s;
650 
651 	if (!id)
652 		return 0;
653 
654 	rcu_read_lock();
655 	s = snapshot_t(c, id);
656 	if (s->parent)
657 		id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth));
658 	rcu_read_unlock();
659 
660 	return id;
661 }
662 
663 static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s)
664 {
665 	unsigned i;
666 
667 	for (i = 0; i < 3; i++)
668 		if (!s.parent) {
669 			if (s.skip[i])
670 				return false;
671 		} else {
672 			if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i])))
673 				return false;
674 		}
675 
676 	return true;
677 }
678 
679 /*
680  * snapshot_tree pointer was incorrect: look up root snapshot node, make sure
681  * its snapshot_tree pointer is correct (allocate new one if necessary), then
682  * update this node's pointer to root node's pointer:
683  */
684 static int snapshot_tree_ptr_repair(struct btree_trans *trans,
685 				    struct btree_iter *iter,
686 				    struct bkey_s_c k,
687 				    struct bch_snapshot *s)
688 {
689 	struct bch_fs *c = trans->c;
690 	struct btree_iter root_iter;
691 	struct bch_snapshot_tree s_t;
692 	struct bkey_s_c_snapshot root;
693 	struct bkey_i_snapshot *u;
694 	u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id;
695 	int ret;
696 
697 	root = bch2_bkey_get_iter_typed(trans, &root_iter,
698 			       BTREE_ID_snapshots, POS(0, root_id),
699 			       BTREE_ITER_with_updates, snapshot);
700 	ret = bkey_err(root);
701 	if (ret)
702 		goto err;
703 
704 	tree_id = le32_to_cpu(root.v->tree);
705 
706 	ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
707 	if (ret && !bch2_err_matches(ret, ENOENT))
708 		return ret;
709 
710 	if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) {
711 		u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
712 		ret =   PTR_ERR_OR_ZERO(u) ?:
713 			bch2_snapshot_tree_create(trans, root_id,
714 				bch2_snapshot_tree_oldest_subvol(c, root_id),
715 				&tree_id);
716 		if (ret)
717 			goto err;
718 
719 		u->v.tree = cpu_to_le32(tree_id);
720 		if (k.k->p.offset == root_id)
721 			*s = u->v;
722 	}
723 
724 	if (k.k->p.offset != root_id) {
725 		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
726 		ret = PTR_ERR_OR_ZERO(u);
727 		if (ret)
728 			goto err;
729 
730 		u->v.tree = cpu_to_le32(tree_id);
731 		*s = u->v;
732 	}
733 err:
734 	bch2_trans_iter_exit(trans, &root_iter);
735 	return ret;
736 }
737 
738 static int check_snapshot(struct btree_trans *trans,
739 			  struct btree_iter *iter,
740 			  struct bkey_s_c k)
741 {
742 	struct bch_fs *c = trans->c;
743 	struct bch_snapshot s;
744 	struct bch_subvolume subvol;
745 	struct bch_snapshot v;
746 	struct bkey_i_snapshot *u;
747 	u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
748 	u32 real_depth;
749 	struct printbuf buf = PRINTBUF;
750 	u32 i, id;
751 	int ret = 0;
752 
753 	if (k.k->type != KEY_TYPE_snapshot)
754 		return 0;
755 
756 	memset(&s, 0, sizeof(s));
757 	memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k)));
758 
759 	id = le32_to_cpu(s.parent);
760 	if (id) {
761 		ret = bch2_snapshot_lookup(trans, id, &v);
762 		if (bch2_err_matches(ret, ENOENT))
763 			bch_err(c, "snapshot with nonexistent parent:\n  %s",
764 				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
765 		if (ret)
766 			goto err;
767 
768 		if (le32_to_cpu(v.children[0]) != k.k->p.offset &&
769 		    le32_to_cpu(v.children[1]) != k.k->p.offset) {
770 			bch_err(c, "snapshot parent %u missing pointer to child %llu",
771 				id, k.k->p.offset);
772 			ret = -EINVAL;
773 			goto err;
774 		}
775 	}
776 
777 	for (i = 0; i < 2 && s.children[i]; i++) {
778 		id = le32_to_cpu(s.children[i]);
779 
780 		ret = bch2_snapshot_lookup(trans, id, &v);
781 		if (bch2_err_matches(ret, ENOENT))
782 			bch_err(c, "snapshot node %llu has nonexistent child %u",
783 				k.k->p.offset, id);
784 		if (ret)
785 			goto err;
786 
787 		if (le32_to_cpu(v.parent) != k.k->p.offset) {
788 			bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
789 				id, le32_to_cpu(v.parent), k.k->p.offset);
790 			ret = -EINVAL;
791 			goto err;
792 		}
793 	}
794 
795 	bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
796 		!BCH_SNAPSHOT_DELETED(&s);
797 
798 	if (should_have_subvol) {
799 		id = le32_to_cpu(s.subvol);
800 		ret = bch2_subvolume_get(trans, id, 0, false, &subvol);
801 		if (bch2_err_matches(ret, ENOENT))
802 			bch_err(c, "snapshot points to nonexistent subvolume:\n  %s",
803 				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
804 		if (ret)
805 			goto err;
806 
807 		if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) {
808 			bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
809 				k.k->p.offset);
810 			ret = -EINVAL;
811 			goto err;
812 		}
813 	} else {
814 		if (fsck_err_on(s.subvol,
815 				trans, snapshot_should_not_have_subvol,
816 				"snapshot should not point to subvol:\n  %s",
817 				(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
818 			u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
819 			ret = PTR_ERR_OR_ZERO(u);
820 			if (ret)
821 				goto err;
822 
823 			u->v.subvol = 0;
824 			s = u->v;
825 		}
826 	}
827 
828 	ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree));
829 	if (ret < 0)
830 		goto err;
831 
832 	if (fsck_err_on(!ret,
833 			trans, snapshot_to_bad_snapshot_tree,
834 			"snapshot points to missing/incorrect tree:\n  %s",
835 			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
836 		ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
837 		if (ret)
838 			goto err;
839 	}
840 	ret = 0;
841 
842 	real_depth = bch2_snapshot_depth(c, parent_id);
843 
844 	if (fsck_err_on(le32_to_cpu(s.depth) != real_depth,
845 			trans, snapshot_bad_depth,
846 			"snapshot with incorrect depth field, should be %u:\n  %s",
847 			real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
848 		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
849 		ret = PTR_ERR_OR_ZERO(u);
850 		if (ret)
851 			goto err;
852 
853 		u->v.depth = cpu_to_le32(real_depth);
854 		s = u->v;
855 	}
856 
857 	ret = snapshot_skiplist_good(trans, k.k->p.offset, s);
858 	if (ret < 0)
859 		goto err;
860 
861 	if (fsck_err_on(!ret,
862 			trans, snapshot_bad_skiplist,
863 			"snapshot with bad skiplist field:\n  %s",
864 			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
865 		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
866 		ret = PTR_ERR_OR_ZERO(u);
867 		if (ret)
868 			goto err;
869 
870 		for (i = 0; i < ARRAY_SIZE(u->v.skip); i++)
871 			u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id));
872 
873 		bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32);
874 		s = u->v;
875 	}
876 	ret = 0;
877 err:
878 fsck_err:
879 	printbuf_exit(&buf);
880 	return ret;
881 }
882 
883 int bch2_check_snapshots(struct bch_fs *c)
884 {
885 	/*
886 	 * We iterate backwards as checking/fixing the depth field requires that
887 	 * the parent's depth already be correct:
888 	 */
889 	int ret = bch2_trans_run(c,
890 		for_each_btree_key_reverse_commit(trans, iter,
891 				BTREE_ID_snapshots, POS_MAX,
892 				BTREE_ITER_prefetch, k,
893 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
894 			check_snapshot(trans, &iter, k)));
895 	bch_err_fn(c, ret);
896 	return ret;
897 }
898 
899 static int check_snapshot_exists(struct btree_trans *trans, u32 id)
900 {
901 	struct bch_fs *c = trans->c;
902 
903 	if (bch2_snapshot_equiv(c, id))
904 		return 0;
905 
906 	/* 0 is an invalid tree ID */
907 	u32 tree_id = 0;
908 	int ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
909 	if (ret)
910 		return ret;
911 
912 	struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot));
913 	ret = PTR_ERR_OR_ZERO(snapshot);
914 	if (ret)
915 		return ret;
916 
917 	bkey_snapshot_init(&snapshot->k_i);
918 	snapshot->k.p		= POS(0, id);
919 	snapshot->v.tree	= cpu_to_le32(tree_id);
920 	snapshot->v.btime.lo	= cpu_to_le64(bch2_current_time(c));
921 
922 	return  bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?:
923 		bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
924 				   bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0) ?:
925 		bch2_snapshot_set_equiv(trans, bkey_i_to_s_c(&snapshot->k_i));
926 }
927 
928 /* Figure out which snapshot nodes belong in the same tree: */
929 struct snapshot_tree_reconstruct {
930 	enum btree_id			btree;
931 	struct bpos			cur_pos;
932 	snapshot_id_list		cur_ids;
933 	DARRAY(snapshot_id_list)	trees;
934 };
935 
936 static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r)
937 {
938 	darray_for_each(r->trees, i)
939 		darray_exit(i);
940 	darray_exit(&r->trees);
941 	darray_exit(&r->cur_ids);
942 }
943 
944 static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos)
945 {
946 	return r->btree == BTREE_ID_inodes
947 		? r->cur_pos.offset == pos.offset
948 		: r->cur_pos.inode == pos.inode;
949 }
950 
951 static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r)
952 {
953 	darray_for_each(*l, i)
954 		if (snapshot_list_has_id(r, *i))
955 			return true;
956 	return false;
957 }
958 
959 static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s)
960 {
961 	bool first = true;
962 	darray_for_each(*s, i) {
963 		if (!first)
964 			prt_char(out, ' ');
965 		first = false;
966 		prt_printf(out, "%u", *i);
967 	}
968 }
969 
970 static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r)
971 {
972 	if (r->cur_ids.nr) {
973 		darray_for_each(r->trees, i)
974 			if (snapshot_id_lists_have_common(i, &r->cur_ids)) {
975 				int ret = snapshot_list_merge(c, i, &r->cur_ids);
976 				if (ret)
977 					return ret;
978 				goto out;
979 			}
980 		darray_push(&r->trees, r->cur_ids);
981 		darray_init(&r->cur_ids);
982 	}
983 out:
984 	r->cur_ids.nr = 0;
985 	return 0;
986 }
987 
988 static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos)
989 {
990 	if (!same_snapshot(r, pos))
991 		snapshot_tree_reconstruct_next(c, r);
992 	r->cur_pos = pos;
993 	return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot);
994 }
995 
996 int bch2_reconstruct_snapshots(struct bch_fs *c)
997 {
998 	struct btree_trans *trans = bch2_trans_get(c);
999 	struct printbuf buf = PRINTBUF;
1000 	struct snapshot_tree_reconstruct r = {};
1001 	int ret = 0;
1002 
1003 	for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
1004 		if (btree_type_has_snapshots(btree)) {
1005 			r.btree = btree;
1006 
1007 			ret = for_each_btree_key(trans, iter, btree, POS_MIN,
1008 					BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({
1009 				get_snapshot_trees(c, &r, k.k->p);
1010 			}));
1011 			if (ret)
1012 				goto err;
1013 
1014 			snapshot_tree_reconstruct_next(c, &r);
1015 		}
1016 	}
1017 
1018 	darray_for_each(r.trees, t) {
1019 		printbuf_reset(&buf);
1020 		snapshot_id_list_to_text(&buf, t);
1021 
1022 		darray_for_each(*t, id) {
1023 			if (fsck_err_on(!bch2_snapshot_equiv(c, *id),
1024 					trans, snapshot_node_missing,
1025 					"snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) {
1026 				if (t->nr > 1) {
1027 					bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
1028 					ret = -BCH_ERR_fsck_repair_unimplemented;
1029 					goto err;
1030 				}
1031 
1032 				ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1033 						check_snapshot_exists(trans, *id));
1034 				if (ret)
1035 					goto err;
1036 			}
1037 		}
1038 	}
1039 fsck_err:
1040 err:
1041 	bch2_trans_put(trans);
1042 	snapshot_tree_reconstruct_exit(&r);
1043 	printbuf_exit(&buf);
1044 	bch_err_fn(c, ret);
1045 	return ret;
1046 }
1047 
1048 int bch2_check_key_has_snapshot(struct btree_trans *trans,
1049 				struct btree_iter *iter,
1050 				struct bkey_s_c k)
1051 {
1052 	struct bch_fs *c = trans->c;
1053 	struct printbuf buf = PRINTBUF;
1054 	int ret = 0;
1055 
1056 	if (fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot),
1057 			trans, bkey_in_missing_snapshot,
1058 			"key in missing snapshot %s, delete?",
1059 			(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1060 		ret = bch2_btree_delete_at(trans, iter,
1061 					    BTREE_UPDATE_internal_snapshot_node) ?: 1;
1062 fsck_err:
1063 	printbuf_exit(&buf);
1064 	return ret;
1065 }
1066 
1067 /*
1068  * Mark a snapshot as deleted, for future cleanup:
1069  */
1070 int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
1071 {
1072 	struct btree_iter iter;
1073 	struct bkey_i_snapshot *s;
1074 	int ret = 0;
1075 
1076 	s = bch2_bkey_get_mut_typed(trans, &iter,
1077 				    BTREE_ID_snapshots, POS(0, id),
1078 				    0, snapshot);
1079 	ret = PTR_ERR_OR_ZERO(s);
1080 	if (unlikely(ret)) {
1081 		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
1082 					trans->c, "missing snapshot %u", id);
1083 		return ret;
1084 	}
1085 
1086 	/* already deleted? */
1087 	if (BCH_SNAPSHOT_DELETED(&s->v))
1088 		goto err;
1089 
1090 	SET_BCH_SNAPSHOT_DELETED(&s->v, true);
1091 	SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
1092 	s->v.subvol = 0;
1093 err:
1094 	bch2_trans_iter_exit(trans, &iter);
1095 	return ret;
1096 }
1097 
1098 static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
1099 {
1100 	if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1]))
1101 		swap(s->children[0], s->children[1]);
1102 }
1103 
1104 static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
1105 {
1106 	struct bch_fs *c = trans->c;
1107 	struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
1108 	struct btree_iter c_iter = (struct btree_iter) { NULL };
1109 	struct btree_iter tree_iter = (struct btree_iter) { NULL };
1110 	struct bkey_s_c_snapshot s;
1111 	u32 parent_id, child_id;
1112 	unsigned i;
1113 	int ret = 0;
1114 
1115 	s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
1116 				     BTREE_ITER_intent, snapshot);
1117 	ret = bkey_err(s);
1118 	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
1119 				"missing snapshot %u", id);
1120 
1121 	if (ret)
1122 		goto err;
1123 
1124 	BUG_ON(s.v->children[1]);
1125 
1126 	parent_id = le32_to_cpu(s.v->parent);
1127 	child_id = le32_to_cpu(s.v->children[0]);
1128 
1129 	if (parent_id) {
1130 		struct bkey_i_snapshot *parent;
1131 
1132 		parent = bch2_bkey_get_mut_typed(trans, &p_iter,
1133 				     BTREE_ID_snapshots, POS(0, parent_id),
1134 				     0, snapshot);
1135 		ret = PTR_ERR_OR_ZERO(parent);
1136 		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
1137 					"missing snapshot %u", parent_id);
1138 		if (unlikely(ret))
1139 			goto err;
1140 
1141 		/* find entry in parent->children for node being deleted */
1142 		for (i = 0; i < 2; i++)
1143 			if (le32_to_cpu(parent->v.children[i]) == id)
1144 				break;
1145 
1146 		if (bch2_fs_inconsistent_on(i == 2, c,
1147 					"snapshot %u missing child pointer to %u",
1148 					parent_id, id))
1149 			goto err;
1150 
1151 		parent->v.children[i] = cpu_to_le32(child_id);
1152 
1153 		normalize_snapshot_child_pointers(&parent->v);
1154 	}
1155 
1156 	if (child_id) {
1157 		struct bkey_i_snapshot *child;
1158 
1159 		child = bch2_bkey_get_mut_typed(trans, &c_iter,
1160 				     BTREE_ID_snapshots, POS(0, child_id),
1161 				     0, snapshot);
1162 		ret = PTR_ERR_OR_ZERO(child);
1163 		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
1164 					"missing snapshot %u", child_id);
1165 		if (unlikely(ret))
1166 			goto err;
1167 
1168 		child->v.parent = cpu_to_le32(parent_id);
1169 
1170 		if (!child->v.parent) {
1171 			child->v.skip[0] = 0;
1172 			child->v.skip[1] = 0;
1173 			child->v.skip[2] = 0;
1174 		}
1175 	}
1176 
1177 	if (!parent_id) {
1178 		/*
1179 		 * We're deleting the root of a snapshot tree: update the
1180 		 * snapshot_tree entry to point to the new root, or delete it if
1181 		 * this is the last snapshot ID in this tree:
1182 		 */
1183 		struct bkey_i_snapshot_tree *s_t;
1184 
1185 		BUG_ON(s.v->children[1]);
1186 
1187 		s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
1188 				BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)),
1189 				0, snapshot_tree);
1190 		ret = PTR_ERR_OR_ZERO(s_t);
1191 		if (ret)
1192 			goto err;
1193 
1194 		if (s.v->children[0]) {
1195 			s_t->v.root_snapshot = s.v->children[0];
1196 		} else {
1197 			s_t->k.type = KEY_TYPE_deleted;
1198 			set_bkey_val_u64s(&s_t->k, 0);
1199 		}
1200 	}
1201 
1202 	ret = bch2_btree_delete_at(trans, &iter, 0);
1203 err:
1204 	bch2_trans_iter_exit(trans, &tree_iter);
1205 	bch2_trans_iter_exit(trans, &p_iter);
1206 	bch2_trans_iter_exit(trans, &c_iter);
1207 	bch2_trans_iter_exit(trans, &iter);
1208 	return ret;
1209 }
1210 
1211 static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
1212 			  u32 *new_snapids,
1213 			  u32 *snapshot_subvols,
1214 			  unsigned nr_snapids)
1215 {
1216 	struct bch_fs *c = trans->c;
1217 	struct btree_iter iter;
1218 	struct bkey_i_snapshot *n;
1219 	struct bkey_s_c k;
1220 	unsigned i, j;
1221 	u32 depth = bch2_snapshot_depth(c, parent);
1222 	int ret;
1223 
1224 	bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
1225 			     POS_MIN, BTREE_ITER_intent);
1226 	k = bch2_btree_iter_peek(&iter);
1227 	ret = bkey_err(k);
1228 	if (ret)
1229 		goto err;
1230 
1231 	for (i = 0; i < nr_snapids; i++) {
1232 		k = bch2_btree_iter_prev_slot(&iter);
1233 		ret = bkey_err(k);
1234 		if (ret)
1235 			goto err;
1236 
1237 		if (!k.k || !k.k->p.offset) {
1238 			ret = -BCH_ERR_ENOSPC_snapshot_create;
1239 			goto err;
1240 		}
1241 
1242 		n = bch2_bkey_alloc(trans, &iter, 0, snapshot);
1243 		ret = PTR_ERR_OR_ZERO(n);
1244 		if (ret)
1245 			goto err;
1246 
1247 		n->v.flags	= 0;
1248 		n->v.parent	= cpu_to_le32(parent);
1249 		n->v.subvol	= cpu_to_le32(snapshot_subvols[i]);
1250 		n->v.tree	= cpu_to_le32(tree);
1251 		n->v.depth	= cpu_to_le32(depth);
1252 		n->v.btime.lo	= cpu_to_le64(bch2_current_time(c));
1253 		n->v.btime.hi	= 0;
1254 
1255 		for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
1256 			n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));
1257 
1258 		bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
1259 		SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
1260 
1261 		ret = __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
1262 					 bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
1263 		if (ret)
1264 			goto err;
1265 
1266 		new_snapids[i]	= iter.pos.offset;
1267 
1268 		mutex_lock(&c->snapshot_table_lock);
1269 		snapshot_t_mut(c, new_snapids[i])->equiv = new_snapids[i];
1270 		mutex_unlock(&c->snapshot_table_lock);
1271 	}
1272 err:
1273 	bch2_trans_iter_exit(trans, &iter);
1274 	return ret;
1275 }
1276 
1277 /*
1278  * Create new snapshot IDs as children of an existing snapshot ID:
1279  */
1280 static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent,
1281 			      u32 *new_snapids,
1282 			      u32 *snapshot_subvols,
1283 			      unsigned nr_snapids)
1284 {
1285 	struct btree_iter iter;
1286 	struct bkey_i_snapshot *n_parent;
1287 	int ret = 0;
1288 
1289 	n_parent = bch2_bkey_get_mut_typed(trans, &iter,
1290 			BTREE_ID_snapshots, POS(0, parent),
1291 			0, snapshot);
1292 	ret = PTR_ERR_OR_ZERO(n_parent);
1293 	if (unlikely(ret)) {
1294 		if (bch2_err_matches(ret, ENOENT))
1295 			bch_err(trans->c, "snapshot %u not found", parent);
1296 		return ret;
1297 	}
1298 
1299 	if (n_parent->v.children[0] || n_parent->v.children[1]) {
1300 		bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
1301 		ret = -EINVAL;
1302 		goto err;
1303 	}
1304 
1305 	ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree),
1306 			     new_snapids, snapshot_subvols, nr_snapids);
1307 	if (ret)
1308 		goto err;
1309 
1310 	n_parent->v.children[0] = cpu_to_le32(new_snapids[0]);
1311 	n_parent->v.children[1] = cpu_to_le32(new_snapids[1]);
1312 	n_parent->v.subvol = 0;
1313 	SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false);
1314 err:
1315 	bch2_trans_iter_exit(trans, &iter);
1316 	return ret;
1317 }
1318 
1319 /*
1320  * Create a snapshot node that is the root of a new tree:
1321  */
1322 static int bch2_snapshot_node_create_tree(struct btree_trans *trans,
1323 			      u32 *new_snapids,
1324 			      u32 *snapshot_subvols,
1325 			      unsigned nr_snapids)
1326 {
1327 	struct bkey_i_snapshot_tree *n_tree;
1328 	int ret;
1329 
1330 	n_tree = __bch2_snapshot_tree_create(trans);
1331 	ret =   PTR_ERR_OR_ZERO(n_tree) ?:
1332 		create_snapids(trans, 0, n_tree->k.p.offset,
1333 			     new_snapids, snapshot_subvols, nr_snapids);
1334 	if (ret)
1335 		return ret;
1336 
1337 	n_tree->v.master_subvol	= cpu_to_le32(snapshot_subvols[0]);
1338 	n_tree->v.root_snapshot	= cpu_to_le32(new_snapids[0]);
1339 	return 0;
1340 }
1341 
1342 int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
1343 			      u32 *new_snapids,
1344 			      u32 *snapshot_subvols,
1345 			      unsigned nr_snapids)
1346 {
1347 	BUG_ON((parent == 0) != (nr_snapids == 1));
1348 	BUG_ON((parent != 0) != (nr_snapids == 2));
1349 
1350 	return parent
1351 		? bch2_snapshot_node_create_children(trans, parent,
1352 				new_snapids, snapshot_subvols, nr_snapids)
1353 		: bch2_snapshot_node_create_tree(trans,
1354 				new_snapids, snapshot_subvols, nr_snapids);
1355 
1356 }
1357 
1358 /*
1359  * If we have an unlinked inode in an internal snapshot node, and the inode
1360  * really has been deleted in all child snapshots, how does this get cleaned up?
1361  *
1362  * first there is the problem of how keys that have been overwritten in all
1363  * child snapshots get deleted (unimplemented?), but inodes may perhaps be
1364  * special?
1365  *
1366  * also: unlinked inode in internal snapshot appears to not be getting deleted
1367  * correctly if inode doesn't exist in leaf snapshots
1368  *
1369  * solution:
1370  *
1371  * for a key in an interior snapshot node that needs work to be done that
1372  * requires it to be mutated: iterate over all descendent leaf nodes and copy
1373  * that key to snapshot leaf nodes, where we can mutate it
1374  */
1375 
1376 static int delete_dead_snapshots_process_key(struct btree_trans *trans,
1377 			       struct btree_iter *iter,
1378 			       struct bkey_s_c k,
1379 			       snapshot_id_list *deleted,
1380 			       snapshot_id_list *equiv_seen,
1381 			       struct bpos *last_pos)
1382 {
1383 	int ret = bch2_check_key_has_snapshot(trans, iter, k);
1384 	if (ret)
1385 		return ret < 0 ? ret : 0;
1386 
1387 	struct bch_fs *c = trans->c;
1388 	u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
1389 	if (!equiv) /* key for invalid snapshot node, but we chose not to delete */
1390 		return 0;
1391 
1392 	if (!bkey_eq(k.k->p, *last_pos))
1393 		equiv_seen->nr = 0;
1394 
1395 	if (snapshot_list_has_id(deleted, k.k->p.snapshot))
1396 		return bch2_btree_delete_at(trans, iter,
1397 					    BTREE_UPDATE_internal_snapshot_node);
1398 
1399 	if (!bpos_eq(*last_pos, k.k->p) &&
1400 	    snapshot_list_has_id(equiv_seen, equiv))
1401 		return bch2_btree_delete_at(trans, iter,
1402 					    BTREE_UPDATE_internal_snapshot_node);
1403 
1404 	*last_pos = k.k->p;
1405 
1406 	ret = snapshot_list_add_nodup(c, equiv_seen, equiv);
1407 	if (ret)
1408 		return ret;
1409 
1410 	/*
1411 	 * When we have a linear chain of snapshot nodes, we consider
1412 	 * those to form an equivalence class: we're going to collapse
1413 	 * them all down to a single node, and keep the leaf-most node -
1414 	 * which has the same id as the equivalence class id.
1415 	 *
1416 	 * If there are multiple keys in different snapshots at the same
1417 	 * position, we're only going to keep the one in the newest
1418 	 * snapshot (we delete the others above) - the rest have been
1419 	 * overwritten and are redundant, and for the key we're going to keep we
1420 	 * need to move it to the equivalance class ID if it's not there
1421 	 * already.
1422 	 */
1423 	if (equiv != k.k->p.snapshot) {
1424 		struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
1425 		int ret = PTR_ERR_OR_ZERO(new);
1426 		if (ret)
1427 			return ret;
1428 
1429 		new->k.p.snapshot = equiv;
1430 
1431 		struct btree_iter new_iter;
1432 		bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p,
1433 				     BTREE_ITER_all_snapshots|
1434 				     BTREE_ITER_cached|
1435 				     BTREE_ITER_intent);
1436 
1437 		ret =   bch2_btree_iter_traverse(&new_iter) ?:
1438 			bch2_trans_update(trans, &new_iter, new,
1439 					BTREE_UPDATE_internal_snapshot_node) ?:
1440 			bch2_btree_delete_at(trans, iter,
1441 					BTREE_UPDATE_internal_snapshot_node);
1442 		bch2_trans_iter_exit(trans, &new_iter);
1443 		if (ret)
1444 			return ret;
1445 	}
1446 
1447 	return 0;
1448 }
1449 
1450 static int bch2_snapshot_needs_delete(struct btree_trans *trans, struct bkey_s_c k)
1451 {
1452 	struct bkey_s_c_snapshot snap;
1453 	u32 children[2];
1454 	int ret;
1455 
1456 	if (k.k->type != KEY_TYPE_snapshot)
1457 		return 0;
1458 
1459 	snap = bkey_s_c_to_snapshot(k);
1460 	if (BCH_SNAPSHOT_DELETED(snap.v) ||
1461 	    BCH_SNAPSHOT_SUBVOL(snap.v))
1462 		return 0;
1463 
1464 	children[0] = le32_to_cpu(snap.v->children[0]);
1465 	children[1] = le32_to_cpu(snap.v->children[1]);
1466 
1467 	ret   = bch2_snapshot_live(trans, children[0]) ?:
1468 		bch2_snapshot_live(trans, children[1]);
1469 	if (ret < 0)
1470 		return ret;
1471 	return !ret;
1472 }
1473 
1474 /*
1475  * For a given snapshot, if it doesn't have a subvolume that points to it, and
1476  * it doesn't have child snapshot nodes - it's now redundant and we can mark it
1477  * as deleted.
1478  */
1479 static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct bkey_s_c k)
1480 {
1481 	int ret = bch2_snapshot_needs_delete(trans, k);
1482 
1483 	return ret <= 0
1484 		? ret
1485 		: bch2_snapshot_node_set_deleted(trans, k.k->p.offset);
1486 }
1487 
1488 static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
1489 						snapshot_id_list *skip)
1490 {
1491 	rcu_read_lock();
1492 	while (snapshot_list_has_id(skip, id))
1493 		id = __bch2_snapshot_parent(c, id);
1494 
1495 	while (n--) {
1496 		do {
1497 			id = __bch2_snapshot_parent(c, id);
1498 		} while (snapshot_list_has_id(skip, id));
1499 	}
1500 	rcu_read_unlock();
1501 
1502 	return id;
1503 }
1504 
1505 static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
1506 					      struct btree_iter *iter, struct bkey_s_c k,
1507 					      snapshot_id_list *deleted)
1508 {
1509 	struct bch_fs *c = trans->c;
1510 	u32 nr_deleted_ancestors = 0;
1511 	struct bkey_i_snapshot *s;
1512 	int ret;
1513 
1514 	if (k.k->type != KEY_TYPE_snapshot)
1515 		return 0;
1516 
1517 	if (snapshot_list_has_id(deleted, k.k->p.offset))
1518 		return 0;
1519 
1520 	s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot);
1521 	ret = PTR_ERR_OR_ZERO(s);
1522 	if (ret)
1523 		return ret;
1524 
1525 	darray_for_each(*deleted, i)
1526 		nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, *i);
1527 
1528 	if (!nr_deleted_ancestors)
1529 		return 0;
1530 
1531 	le32_add_cpu(&s->v.depth, -nr_deleted_ancestors);
1532 
1533 	if (!s->v.depth) {
1534 		s->v.skip[0] = 0;
1535 		s->v.skip[1] = 0;
1536 		s->v.skip[2] = 0;
1537 	} else {
1538 		u32 depth = le32_to_cpu(s->v.depth);
1539 		u32 parent = bch2_snapshot_parent(c, s->k.p.offset);
1540 
1541 		for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) {
1542 			u32 id = le32_to_cpu(s->v.skip[j]);
1543 
1544 			if (snapshot_list_has_id(deleted, id)) {
1545 				id = bch2_snapshot_nth_parent_skip(c,
1546 							parent,
1547 							depth > 1
1548 							? get_random_u32_below(depth - 1)
1549 							: 0,
1550 							deleted);
1551 				s->v.skip[j] = cpu_to_le32(id);
1552 			}
1553 		}
1554 
1555 		bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32);
1556 	}
1557 
1558 	return bch2_trans_update(trans, iter, &s->k_i, 0);
1559 }
1560 
1561 int bch2_delete_dead_snapshots(struct bch_fs *c)
1562 {
1563 	struct btree_trans *trans;
1564 	snapshot_id_list deleted = { 0 };
1565 	snapshot_id_list deleted_interior = { 0 };
1566 	int ret = 0;
1567 
1568 	if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
1569 		return 0;
1570 
1571 	trans = bch2_trans_get(c);
1572 
1573 	/*
1574 	 * For every snapshot node: If we have no live children and it's not
1575 	 * pointed to by a subvolume, delete it:
1576 	 */
1577 	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots,
1578 			POS_MIN, 0, k,
1579 			NULL, NULL, 0,
1580 		bch2_delete_redundant_snapshot(trans, k));
1581 	bch_err_msg(c, ret, "deleting redundant snapshots");
1582 	if (ret)
1583 		goto err;
1584 
1585 	ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
1586 				 POS_MIN, 0, k,
1587 		bch2_snapshot_set_equiv(trans, k));
1588 	bch_err_msg(c, ret, "in bch2_snapshots_set_equiv");
1589 	if (ret)
1590 		goto err;
1591 
1592 	ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
1593 				 POS_MIN, 0, k, ({
1594 		if (k.k->type != KEY_TYPE_snapshot)
1595 			continue;
1596 
1597 		BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v)
1598 			? snapshot_list_add(c, &deleted, k.k->p.offset)
1599 			: 0;
1600 	}));
1601 	bch_err_msg(c, ret, "walking snapshots");
1602 	if (ret)
1603 		goto err;
1604 
1605 	for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
1606 		struct bpos last_pos = POS_MIN;
1607 		snapshot_id_list equiv_seen = { 0 };
1608 		struct disk_reservation res = { 0 };
1609 
1610 		if (!btree_type_has_snapshots(btree))
1611 			continue;
1612 
1613 		ret = for_each_btree_key_commit(trans, iter,
1614 				btree, POS_MIN,
1615 				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
1616 				&res, NULL, BCH_TRANS_COMMIT_no_enospc,
1617 			delete_dead_snapshots_process_key(trans, &iter, k, &deleted,
1618 							  &equiv_seen, &last_pos));
1619 
1620 		bch2_disk_reservation_put(c, &res);
1621 		darray_exit(&equiv_seen);
1622 
1623 		bch_err_msg(c, ret, "deleting keys from dying snapshots");
1624 		if (ret)
1625 			goto err;
1626 	}
1627 
1628 	bch2_trans_unlock(trans);
1629 	down_write(&c->snapshot_create_lock);
1630 
1631 	ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
1632 				 POS_MIN, 0, k, ({
1633 		u32 snapshot = k.k->p.offset;
1634 		u32 equiv = bch2_snapshot_equiv(c, snapshot);
1635 
1636 		equiv != snapshot
1637 			? snapshot_list_add(c, &deleted_interior, snapshot)
1638 			: 0;
1639 	}));
1640 
1641 	bch_err_msg(c, ret, "walking snapshots");
1642 	if (ret)
1643 		goto err_create_lock;
1644 
1645 	/*
1646 	 * Fixing children of deleted snapshots can't be done completely
1647 	 * atomically, if we crash between here and when we delete the interior
1648 	 * nodes some depth fields will be off:
1649 	 */
1650 	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
1651 				  BTREE_ITER_intent, k,
1652 				  NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1653 		bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior));
1654 	if (ret)
1655 		goto err_create_lock;
1656 
1657 	darray_for_each(deleted, i) {
1658 		ret = commit_do(trans, NULL, NULL, 0,
1659 			bch2_snapshot_node_delete(trans, *i));
1660 		bch_err_msg(c, ret, "deleting snapshot %u", *i);
1661 		if (ret)
1662 			goto err_create_lock;
1663 	}
1664 
1665 	darray_for_each(deleted_interior, i) {
1666 		ret = commit_do(trans, NULL, NULL, 0,
1667 			bch2_snapshot_node_delete(trans, *i));
1668 		bch_err_msg(c, ret, "deleting snapshot %u", *i);
1669 		if (ret)
1670 			goto err_create_lock;
1671 	}
1672 err_create_lock:
1673 	up_write(&c->snapshot_create_lock);
1674 err:
1675 	darray_exit(&deleted_interior);
1676 	darray_exit(&deleted);
1677 	bch2_trans_put(trans);
1678 	bch_err_fn(c, ret);
1679 	return ret;
1680 }
1681 
1682 void bch2_delete_dead_snapshots_work(struct work_struct *work)
1683 {
1684 	struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
1685 
1686 	set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name);
1687 
1688 	bch2_delete_dead_snapshots(c);
1689 	bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
1690 }
1691 
1692 void bch2_delete_dead_snapshots_async(struct bch_fs *c)
1693 {
1694 	if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) &&
1695 	    !queue_work(c->write_ref_wq, &c->snapshot_delete_work))
1696 		bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
1697 }
1698 
1699 int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
1700 				       enum btree_id id,
1701 				       struct bpos pos)
1702 {
1703 	struct bch_fs *c = trans->c;
1704 	struct btree_iter iter;
1705 	struct bkey_s_c k;
1706 	int ret;
1707 
1708 	bch2_trans_iter_init(trans, &iter, id, pos,
1709 			     BTREE_ITER_not_extents|
1710 			     BTREE_ITER_all_snapshots);
1711 	while (1) {
1712 		k = bch2_btree_iter_prev(&iter);
1713 		ret = bkey_err(k);
1714 		if (ret)
1715 			break;
1716 
1717 		if (!k.k)
1718 			break;
1719 
1720 		if (!bkey_eq(pos, k.k->p))
1721 			break;
1722 
1723 		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
1724 			ret = 1;
1725 			break;
1726 		}
1727 	}
1728 	bch2_trans_iter_exit(trans, &iter);
1729 
1730 	return ret;
1731 }
1732 
1733 static u32 bch2_snapshot_smallest_child(struct bch_fs *c, u32 id)
1734 {
1735 	const struct snapshot_t *s = snapshot_t(c, id);
1736 
1737 	return s->children[1] ?: s->children[0];
1738 }
1739 
1740 static u32 bch2_snapshot_smallest_descendent(struct bch_fs *c, u32 id)
1741 {
1742 	u32 child;
1743 
1744 	while ((child = bch2_snapshot_smallest_child(c, id)))
1745 		id = child;
1746 	return id;
1747 }
1748 
1749 static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans,
1750 					       enum btree_id btree,
1751 					       struct bkey_s_c interior_k,
1752 					       u32 leaf_id, struct bpos *new_min_pos)
1753 {
1754 	struct btree_iter iter;
1755 	struct bpos pos = interior_k.k->p;
1756 	struct bkey_s_c k;
1757 	struct bkey_i *new;
1758 	int ret;
1759 
1760 	pos.snapshot = leaf_id;
1761 
1762 	bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent);
1763 	k = bch2_btree_iter_peek_slot(&iter);
1764 	ret = bkey_err(k);
1765 	if (ret)
1766 		goto out;
1767 
1768 	/* key already overwritten in this snapshot? */
1769 	if (k.k->p.snapshot != interior_k.k->p.snapshot)
1770 		goto out;
1771 
1772 	if (bpos_eq(*new_min_pos, POS_MIN)) {
1773 		*new_min_pos = k.k->p;
1774 		new_min_pos->snapshot = leaf_id;
1775 	}
1776 
1777 	new = bch2_bkey_make_mut_noupdate(trans, interior_k);
1778 	ret = PTR_ERR_OR_ZERO(new);
1779 	if (ret)
1780 		goto out;
1781 
1782 	new->k.p.snapshot = leaf_id;
1783 	ret = bch2_trans_update(trans, &iter, new, 0);
1784 out:
1785 	bch2_trans_iter_exit(trans, &iter);
1786 	return ret;
1787 }
1788 
1789 int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans,
1790 					  enum btree_id btree,
1791 					  struct bkey_s_c k,
1792 					  struct bpos *new_min_pos)
1793 {
1794 	struct bch_fs *c = trans->c;
1795 	struct bkey_buf sk;
1796 	u32 restart_count = trans->restart_count;
1797 	int ret = 0;
1798 
1799 	bch2_bkey_buf_init(&sk);
1800 	bch2_bkey_buf_reassemble(&sk, c, k);
1801 	k = bkey_i_to_s_c(sk.k);
1802 
1803 	*new_min_pos = POS_MIN;
1804 
1805 	for (u32 id = bch2_snapshot_smallest_descendent(c, k.k->p.snapshot);
1806 	     id < k.k->p.snapshot;
1807 	     id++) {
1808 		if (!bch2_snapshot_is_ancestor(c, id, k.k->p.snapshot) ||
1809 		    !bch2_snapshot_is_leaf(c, id))
1810 			continue;
1811 again:
1812 		ret =   btree_trans_too_many_iters(trans) ?:
1813 			bch2_propagate_key_to_snapshot_leaf(trans, btree, k, id, new_min_pos) ?:
1814 			bch2_trans_commit(trans, NULL, NULL, 0);
1815 		if (ret && bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
1816 			bch2_trans_begin(trans);
1817 			goto again;
1818 		}
1819 
1820 		if (ret)
1821 			break;
1822 	}
1823 
1824 	bch2_bkey_buf_exit(&sk, c);
1825 
1826 	return ret ?: trans_was_restarted(trans, restart_count);
1827 }
1828 
1829 static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
1830 {
1831 	struct bch_fs *c = trans->c;
1832 	struct bkey_s_c_snapshot snap;
1833 	int ret = 0;
1834 
1835 	if (k.k->type != KEY_TYPE_snapshot)
1836 		return 0;
1837 
1838 	snap = bkey_s_c_to_snapshot(k);
1839 	if (BCH_SNAPSHOT_DELETED(snap.v) ||
1840 	    bch2_snapshot_equiv(c, k.k->p.offset) != k.k->p.offset ||
1841 	    (ret = bch2_snapshot_needs_delete(trans, k)) > 0) {
1842 		set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
1843 		return 0;
1844 	}
1845 
1846 	return ret;
1847 }
1848 
1849 int bch2_snapshots_read(struct bch_fs *c)
1850 {
1851 	int ret = bch2_trans_run(c,
1852 		for_each_btree_key(trans, iter, BTREE_ID_snapshots,
1853 				   POS_MIN, 0, k,
1854 			__bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
1855 			bch2_snapshot_set_equiv(trans, k) ?:
1856 			bch2_check_snapshot_needs_deletion(trans, k)) ?:
1857 		for_each_btree_key(trans, iter, BTREE_ID_snapshots,
1858 				   POS_MIN, 0, k,
1859 			   (set_is_ancestor_bitmap(c, k.k->p.offset), 0)));
1860 	bch_err_fn(c, ret);
1861 
1862 	/*
1863 	 * It's important that we check if we need to reconstruct snapshots
1864 	 * before going RW, so we mark that pass as required in the superblock -
1865 	 * otherwise, we could end up deleting keys with missing snapshot nodes
1866 	 * instead
1867 	 */
1868 	BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) &&
1869 	       test_bit(BCH_FS_may_go_rw, &c->flags));
1870 
1871 	if (bch2_err_matches(ret, EIO) ||
1872 	    (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)))
1873 		ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots);
1874 
1875 	return ret;
1876 }
1877 
1878 void bch2_fs_snapshots_exit(struct bch_fs *c)
1879 {
1880 	kvfree(rcu_dereference_protected(c->snapshots, true));
1881 }
1882