// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bbpos.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_key_cache.h"
#include "btree_update.h"
#include "buckets.h"
#include "enumerated_ref.h"
#include "errcode.h"
#include "error.h"
#include "fs.h"
#include "recovery_passes.h"
#include "snapshot.h"

#include <linux/random.h>

/*
 * Snapshot trees:
 *
 * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they
 * exist to provide a stable identifier for the whole lifetime of a snapshot
 * tree.
 */
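
/*
 * A sketch of the value layout as used in this file; see
 * bcachefs_format.h for the authoritative definition:
 *
 *	struct bch_snapshot_tree {
 *		struct bch_val	v;
 *		__le32		master_subvol;
 *		__le32		root_snapshot;
 *	};
 *
 * master_subvol is the tree's "main" (non-snapshot) subvolume, and
 * root_snapshot is the root of the tree of snapshot nodes.
 */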

void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
				struct bkey_s_c k)
{
	struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k);

	prt_printf(out, "subvol %u root snapshot %u",
		   le32_to_cpu(t.v->master_subvol),
		   le32_to_cpu(t.v->root_snapshot));
}

int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k,
				struct bkey_validate_context from)
{
	int ret = 0;

	bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
			 bkey_lt(k.k->p, POS(0, 1)),
			 c, snapshot_tree_pos_bad,
			 "bad pos");
fsck_err:
	return ret;
}

int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
			      struct bch_snapshot_tree *s)
{
	int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
					  BTREE_ITER_with_updates, snapshot_tree, s);

	if (bch2_err_matches(ret, ENOENT))
		ret = bch_err_throw(trans->c, ENOENT_snapshot_tree);
	return ret;
}

struct bkey_i_snapshot_tree *
__bch2_snapshot_tree_create(struct btree_trans *trans)
{
	struct btree_iter iter;
	int ret = bch2_bkey_get_empty_slot(trans, &iter,
			BTREE_ID_snapshot_trees, POS(0, U32_MAX));
	struct bkey_i_snapshot_tree *s_t;

	if (ret == -BCH_ERR_ENOSPC_btree_slot)
		ret = bch_err_throw(trans->c, ENOSPC_snapshot_tree);
	if (ret)
		return ERR_PTR(ret);

	s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree);
	ret = PTR_ERR_OR_ZERO(s_t);
	bch2_trans_iter_exit(trans, &iter);
	return ret ? ERR_PTR(ret) : s_t;
}

static int bch2_snapshot_tree_create(struct btree_trans *trans,
				u32 root_id, u32 subvol_id, u32 *tree_id)
{
	struct bkey_i_snapshot_tree *n_tree =
		__bch2_snapshot_tree_create(trans);

	if (IS_ERR(n_tree))
		return PTR_ERR(n_tree);

	n_tree->v.master_subvol	= cpu_to_le32(subvol_id);
	n_tree->v.root_snapshot	= cpu_to_le32(root_id);
	*tree_id = n_tree->k.p.offset;
	return 0;
}

/* Snapshot nodes: */

static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor)
{
	while (id && id < ancestor) {
		const struct snapshot_t *s = __snapshot_t(t, id);
		id = s ? s->parent : 0;
	}
	return id == ancestor;
}

static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
{
	guard(rcu)();
	return __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
}

static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
{
	const struct snapshot_t *s = __snapshot_t(t, id);
	if (!s)
		return 0;

	if (s->skip[2] <= ancestor)
		return s->skip[2];
	if (s->skip[1] <= ancestor)
		return s->skip[1];
	if (s->skip[0] <= ancestor)
		return s->skip[0];
	return s->parent;
}
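
/*
 * Illustrative example (a node's ancestors always have larger IDs than
 * the node itself): for a node with skip[] = { 150, 300, 700 } and
 * ancestor == 500, skip[2] == 700 overshoots but skip[1] == 300 does
 * not, so we jump straight to 300 instead of walking one parent at a
 * time.
 */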

static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor)
{
	const struct snapshot_t *s = __snapshot_t(t, id);
	if (!s)
		return false;

	return test_bit(ancestor - id - 1, s->is_ancestor);
}

bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
{
	bool ret;

	guard(rcu)();
	struct snapshot_table *t = rcu_dereference(c->snapshots);

	if (unlikely(c->recovery.pass_done < BCH_RECOVERY_PASS_check_snapshots))
		return __bch2_snapshot_is_ancestor_early(t, id, ancestor);

	if (likely(ancestor >= IS_ANCESTOR_BITMAP))
		while (id && id < ancestor - IS_ANCESTOR_BITMAP)
			id = get_ancestor_below(t, id, ancestor);

	ret = id && id < ancestor
		? test_ancestor_bitmap(t, id, ancestor)
		: id == ancestor;

	EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
	return ret;
}
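
/*
 * Worked example of the fast path above, assuming IS_ANCESTOR_BITMAP
 * is 128: for is_ancestor(id = 100, ancestor = 1000), we first hop
 * upwards via get_ancestor_below() while id < 1000 - 128, which takes
 * roughly O(log depth) hops in expectation thanks to the random
 * skiplist entries; once within 128 IDs of @ancestor, a single
 * test_bit() in s->is_ancestor answers the query. The EBUG_ON()
 * cross-checks the result against the plain O(depth) parent walk on
 * debug builds.
 */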

static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
{
	size_t idx = U32_MAX - id;
	struct snapshot_table *new, *old;

	size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1));
	size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]);

	if (unlikely(new_bytes > INT_MAX))
		return NULL;

	new = kvzalloc(new_bytes, GFP_KERNEL);
	if (!new)
		return NULL;

	new->nr = new_size;

	old = rcu_dereference_protected(c->snapshots, true);
	if (old)
		memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr);

	rcu_assign_pointer(c->snapshots, new);
	kvfree_rcu(old, rcu);

	return &rcu_dereference_protected(c->snapshots,
				lockdep_is_held(&c->snapshot_table_lock))->s[idx];
}

static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
{
	size_t idx = U32_MAX - id;
	struct snapshot_table *table =
		rcu_dereference_protected(c->snapshots,
				lockdep_is_held(&c->snapshot_table_lock));

	lockdep_assert_held(&c->snapshot_table_lock);

	if (likely(table && idx < table->nr))
		return &table->s[idx];

	return __snapshot_t_mut(c, id);
}
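
/*
 * The table is grown copy-on-write under snapshot_table_lock and
 * published with rcu_assign_pointer(); readers only need RCU. A sketch
 * of the read side, i.e. the pattern snapshot_t() callers in this file
 * follow:
 *
 *	guard(rcu)();
 *	const struct snapshot_t *s = snapshot_t(c, id);
 *	u32 parent = s->parent;
 */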

void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
			   struct bkey_s_c k)
{
	struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);

	if (BCH_SNAPSHOT_SUBVOL(s.v))
		prt_str(out, "subvol ");
	if (BCH_SNAPSHOT_WILL_DELETE(s.v))
		prt_str(out, "will_delete ");
	if (BCH_SNAPSHOT_DELETED(s.v))
		prt_str(out, "deleted ");

	prt_printf(out, "parent %10u children %10u %10u subvol %u tree %u",
	       le32_to_cpu(s.v->parent),
	       le32_to_cpu(s.v->children[0]),
	       le32_to_cpu(s.v->children[1]),
	       le32_to_cpu(s.v->subvol),
	       le32_to_cpu(s.v->tree));

	if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth))
		prt_printf(out, " depth %u skiplist %u %u %u",
			   le32_to_cpu(s.v->depth),
			   le32_to_cpu(s.v->skip[0]),
			   le32_to_cpu(s.v->skip[1]),
			   le32_to_cpu(s.v->skip[2]));
}

int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k,
			   struct bkey_validate_context from)
{
	struct bkey_s_c_snapshot s;
	u32 i, id;
	int ret = 0;

	bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
			 bkey_lt(k.k->p, POS(0, 1)),
			 c, snapshot_pos_bad,
			 "bad pos");

	s = bkey_s_c_to_snapshot(k);

	id = le32_to_cpu(s.v->parent);
	bkey_fsck_err_on(id && id <= k.k->p.offset,
			 c, snapshot_parent_bad,
			 "bad parent node (%u <= %llu)",
			 id, k.k->p.offset);

	bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]),
			 c, snapshot_children_not_normalized,
			 "children not normalized");

	bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1],
			 c, snapshot_child_duplicate,
			 "duplicate child nodes");

	for (i = 0; i < 2; i++) {
		id = le32_to_cpu(s.v->children[i]);

		bkey_fsck_err_on(id >= k.k->p.offset,
				 c, snapshot_child_bad,
				 "bad child node (%u >= %llu)",
				 id, k.k->p.offset);
	}

	if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
		bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
				 le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]),
				 c, snapshot_skiplist_not_normalized,
				 "skiplist not normalized");

		for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
			id = le32_to_cpu(s.v->skip[i]);

			bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent),
					 c, snapshot_skiplist_bad,
					 "bad skiplist node %u", id);
		}
	}
fsck_err:
	return ret;
}

static int bch2_snapshot_table_make_room(struct bch_fs *c, u32 id)
{
	mutex_lock(&c->snapshot_table_lock);
	int ret = snapshot_t_mut(c, id)
		? 0
		: bch_err_throw(c, ENOMEM_mark_snapshot);
	mutex_unlock(&c->snapshot_table_lock);
	return ret;
}

static int __bch2_mark_snapshot(struct btree_trans *trans,
		       enum btree_id btree, unsigned level,
		       struct bkey_s_c old, struct bkey_s_c new,
		       enum btree_iter_update_trigger_flags flags)
{
	struct bch_fs *c = trans->c;
	struct snapshot_t *t;
	u32 id = new.k->p.offset;
	int ret = 0;

	mutex_lock(&c->snapshot_table_lock);

	t = snapshot_t_mut(c, id);
	if (!t) {
		ret = bch_err_throw(c, ENOMEM_mark_snapshot);
		goto err;
	}

	if (new.k->type == KEY_TYPE_snapshot) {
		struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);

		t->state	= !BCH_SNAPSHOT_DELETED(s.v)
			? SNAPSHOT_ID_live
			: SNAPSHOT_ID_deleted;
		t->parent	= le32_to_cpu(s.v->parent);
		t->children[0]	= le32_to_cpu(s.v->children[0]);
		t->children[1]	= le32_to_cpu(s.v->children[1]);
		t->subvol	= BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
		t->tree		= le32_to_cpu(s.v->tree);

		if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) {
			t->depth	= le32_to_cpu(s.v->depth);
			t->skip[0]	= le32_to_cpu(s.v->skip[0]);
			t->skip[1]	= le32_to_cpu(s.v->skip[1]);
			t->skip[2]	= le32_to_cpu(s.v->skip[2]);
		} else {
			t->depth	= 0;
			t->skip[0]	= 0;
			t->skip[1]	= 0;
			t->skip[2]	= 0;
		}

		u32 parent = id;

		while ((parent = bch2_snapshot_parent_early(c, parent)) &&
		       parent - id - 1 < IS_ANCESTOR_BITMAP)
			__set_bit(parent - id - 1, t->is_ancestor);

		if (BCH_SNAPSHOT_WILL_DELETE(s.v)) {
			set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
			if (c->recovery.pass_done > BCH_RECOVERY_PASS_delete_dead_snapshots)
				bch2_delete_dead_snapshots_async(c);
		}
	} else {
		memset(t, 0, sizeof(*t));
	}
err:
	mutex_unlock(&c->snapshot_table_lock);
	return ret;
}

int bch2_mark_snapshot(struct btree_trans *trans,
		       enum btree_id btree, unsigned level,
		       struct bkey_s_c old, struct bkey_s new,
		       enum btree_iter_update_trigger_flags flags)
{
	return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags);
}

int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
			 struct bch_snapshot *s)
{
	return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
				       BTREE_ITER_with_updates, snapshot, s);
}

/* fsck: */

static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
{
	return snapshot_t(c, id)->children[child];
}

static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id)
{
	return bch2_snapshot_child(c, id, 0);
}

static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id)
{
	return bch2_snapshot_child(c, id, 1);
}

static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
{
	u32 n, parent;

	n = bch2_snapshot_left_child(c, id);
	if (n)
		return n;

	while ((parent = bch2_snapshot_parent(c, id))) {
		n = bch2_snapshot_right_child(c, parent);
		if (n && n != id)
			return n;
		id = parent;
	}

	return 0;
}
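
/*
 * The helper above is an iterative preorder traversal of a snapshot
 * tree: descend to the left child when there is one, otherwise climb
 * until we find a parent whose right child we haven't visited yet;
 * returns 0 once the whole tree has been walked.
 */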

u32 bch2_snapshot_oldest_subvol(struct bch_fs *c, u32 snapshot_root,
				snapshot_id_list *skip)
{
	guard(rcu)();
	u32 id, subvol = 0, s;
retry:
	id = snapshot_root;
	while (id && bch2_snapshot_exists(c, id)) {
		if (!(skip && snapshot_list_has_id(skip, id))) {
			s = snapshot_t(c, id)->subvol;

			if (s && (!subvol || s < subvol))
				subvol = s;
		}
		id = bch2_snapshot_tree_next(c, id);
		if (id == snapshot_root)
			break;
	}

	if (!subvol && skip) {
		skip = NULL;
		goto retry;
	}

	return subvol;
}

static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
					    u32 snapshot_root, u32 *subvol_id)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	bool found = false;
	int ret;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
				     0, k, ret) {
		if (k.k->type != KEY_TYPE_subvolume)
			continue;

		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
		if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
			continue;
		if (!BCH_SUBVOLUME_SNAP(s.v)) {
			*subvol_id = s.k->p.offset;
			found = true;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	if (!ret && !found) {
		struct bkey_i_subvolume *u;

		*subvol_id = bch2_snapshot_oldest_subvol(c, snapshot_root, NULL);

		u = bch2_bkey_get_mut_typed(trans, &iter,
					    BTREE_ID_subvolumes, POS(0, *subvol_id),
					    0, subvolume);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			return ret;

		SET_BCH_SUBVOLUME_SNAP(&u->v, false);
	}

	return ret;
}

static int check_snapshot_tree(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c_snapshot_tree st;
	struct bch_snapshot s;
	struct bch_subvolume subvol;
	struct printbuf buf = PRINTBUF;
	struct btree_iter snapshot_iter = {};
	u32 root_id;
	int ret;

	if (k.k->type != KEY_TYPE_snapshot_tree)
		return 0;

	st = bkey_s_c_to_snapshot_tree(k);
	root_id = le32_to_cpu(st.v->root_snapshot);

	struct bkey_s_c_snapshot snapshot_k =
		bch2_bkey_get_iter_typed(trans, &snapshot_iter, BTREE_ID_snapshots,
					 POS(0, root_id), 0, snapshot);
	ret = bkey_err(snapshot_k);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	if (!ret)
		bkey_val_copy(&s, snapshot_k);

	if (fsck_err_on(ret ||
			root_id != bch2_snapshot_root(c, root_id) ||
			st.k->p.offset != le32_to_cpu(s.tree),
			trans, snapshot_tree_to_missing_snapshot,
			"snapshot tree points to missing/incorrect snapshot:\n%s",
			(bch2_bkey_val_to_text(&buf, c, st.s_c),
			 prt_newline(&buf),
			 ret
			 ? prt_printf(&buf, "(%s)", bch2_err_str(ret))
			 : bch2_bkey_val_to_text(&buf, c, snapshot_k.s_c),
			 buf.buf))) {
		ret = bch2_btree_delete_at(trans, iter, 0);
		goto err;
	}

	if (!st.v->master_subvol)
		goto out;

	ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), false, &subvol);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	if (fsck_err_on(ret,
			trans, snapshot_tree_to_missing_subvol,
			"snapshot tree points to missing subvolume:\n%s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
	    fsck_err_on(!bch2_snapshot_is_ancestor(c,
						le32_to_cpu(subvol.snapshot),
						root_id),
			trans, snapshot_tree_to_wrong_subvol,
			"snapshot tree points to subvolume that does not point to snapshot in this tree:\n%s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
	    fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol),
			trans, snapshot_tree_to_snapshot_subvol,
			"snapshot tree points to snapshot subvolume:\n%s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
		struct bkey_i_snapshot_tree *u;
		u32 subvol_id;

		ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
		bch_err_fn(c, ret);

		if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */
			ret = 0;
			goto err;
		}

		if (ret)
			goto err;

		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.master_subvol = cpu_to_le32(subvol_id);
		st = snapshot_tree_i_to_s_c(u);
	}
out:
err:
fsck_err:
	bch2_trans_iter_exit(trans, &snapshot_iter);
	printbuf_exit(&buf);
	return ret;
}

/*
 * For each snapshot_tree, make sure it points to the root of a snapshot tree
 * and that the root snapshot node points back to it, or delete it.
 *
 * And, make sure it points to a subvolume within that snapshot tree, or correct
 * it to point to the oldest subvolume within that snapshot tree.
 */
int bch2_check_snapshot_trees(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
			BTREE_ID_snapshot_trees, POS_MIN,
			BTREE_ITER_prefetch, k,
			NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		check_snapshot_tree(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Look up snapshot tree for @tree_id and find root,
 * make sure @snap_id is a descendant:
 */
static int snapshot_tree_ptr_good(struct btree_trans *trans,
				  u32 snap_id, u32 tree_id)
{
	struct bch_snapshot_tree s_t;
	int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);

	if (bch2_err_matches(ret, ENOENT))
		return 0;
	if (ret)
		return ret;

	return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot));
}

u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id)
{
	if (!id)
		return 0;

	guard(rcu)();
	const struct snapshot_t *s = snapshot_t(c, id);
	return s->parent
		? bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth))
		: id;
}
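
/*
 * Unlike the deterministic power-of-two levels of a textbook skip list,
 * each node's three skiplist entries are uniformly random ancestors;
 * picking a random nth parent here is what keeps get_ancestor_below()
 * fast in expectation.
 */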

static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s)
{
	unsigned i;

	for (i = 0; i < 3; i++)
		if (!s.parent) {
			if (s.skip[i])
				return false;
		} else {
			if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i])))
				return false;
		}

	return true;
}

/*
 * snapshot_tree pointer was incorrect: look up root snapshot node, make sure
 * its snapshot_tree pointer is correct (allocate new one if necessary), then
 * update this node's pointer to root node's pointer:
 */
static int snapshot_tree_ptr_repair(struct btree_trans *trans,
				    struct btree_iter *iter,
				    struct bkey_s_c k,
				    struct bch_snapshot *s)
{
	struct bch_fs *c = trans->c;
	struct btree_iter root_iter;
	struct bch_snapshot_tree s_t;
	struct bkey_s_c_snapshot root;
	struct bkey_i_snapshot *u;
	u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id;
	int ret;

	root = bch2_bkey_get_iter_typed(trans, &root_iter,
			       BTREE_ID_snapshots, POS(0, root_id),
			       BTREE_ITER_with_updates, snapshot);
	ret = bkey_err(root);
	if (ret)
		goto err;

	tree_id = le32_to_cpu(root.v->tree);

	ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) {
		u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
		ret =   PTR_ERR_OR_ZERO(u) ?:
			bch2_snapshot_tree_create(trans, root_id,
				bch2_snapshot_oldest_subvol(c, root_id, NULL),
				&tree_id);
		if (ret)
			goto err;

		u->v.tree = cpu_to_le32(tree_id);
		if (k.k->p.offset == root_id)
			*s = u->v;
	}

	if (k.k->p.offset != root_id) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.tree = cpu_to_le32(tree_id);
		*s = u->v;
	}
err:
	bch2_trans_iter_exit(trans, &root_iter);
	return ret;
}

static int check_snapshot(struct btree_trans *trans,
			  struct btree_iter *iter,
			  struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bch_snapshot s;
	struct bch_subvolume subvol;
	struct bch_snapshot v;
	struct bkey_i_snapshot *u;
	u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
	u32 real_depth;
	struct printbuf buf = PRINTBUF;
	u32 i, id;
	int ret = 0;

	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	memset(&s, 0, sizeof(s));
	memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k)));

	if (BCH_SNAPSHOT_DELETED(&s))
		return 0;

	id = le32_to_cpu(s.parent);
	if (id) {
		ret = bch2_snapshot_lookup(trans, id, &v);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot with nonexistent parent:\n  %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		if (ret)
			goto err;

		if (le32_to_cpu(v.children[0]) != k.k->p.offset &&
		    le32_to_cpu(v.children[1]) != k.k->p.offset) {
			bch_err(c, "snapshot parent %u missing pointer to child %llu",
				id, k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	}

	for (i = 0; i < 2 && s.children[i]; i++) {
		id = le32_to_cpu(s.children[i]);

		ret = bch2_snapshot_lookup(trans, id, &v);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot node %llu has nonexistent child %u",
				k.k->p.offset, id);
		if (ret)
			goto err;

		if (le32_to_cpu(v.parent) != k.k->p.offset) {
			bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
				id, le32_to_cpu(v.parent), k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	}

	bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
		!BCH_SNAPSHOT_WILL_DELETE(&s);

	if (should_have_subvol) {
		id = le32_to_cpu(s.subvol);
		ret = bch2_subvolume_get(trans, id, false, &subvol);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot points to nonexistent subvolume:\n  %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		if (ret)
			goto err;

		if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) {
			bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
				k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	} else {
		if (fsck_err_on(s.subvol,
				trans, snapshot_should_not_have_subvol,
				"snapshot should not point to subvol:\n%s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
			u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
			ret = PTR_ERR_OR_ZERO(u);
			if (ret)
				goto err;

			u->v.subvol = 0;
			s = u->v;
		}
	}

	ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree));
	if (ret < 0)
		goto err;

	if (fsck_err_on(!ret,
			trans, snapshot_to_bad_snapshot_tree,
			"snapshot points to missing/incorrect tree:\n%s",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
		if (ret)
			goto err;
	}
	ret = 0;

	real_depth = bch2_snapshot_depth(c, parent_id);

	if (fsck_err_on(le32_to_cpu(s.depth) != real_depth,
			trans, snapshot_bad_depth,
			"snapshot with incorrect depth field, should be %u:\n%s",
			real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.depth = cpu_to_le32(real_depth);
		s = u->v;
	}

	ret = snapshot_skiplist_good(trans, k.k->p.offset, s);
	if (ret < 0)
		goto err;

	if (fsck_err_on(!ret,
			trans, snapshot_bad_skiplist,
			"snapshot with bad skiplist field:\n%s",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		for (i = 0; i < ARRAY_SIZE(u->v.skip); i++)
			u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id));

		bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32);
		s = u->v;
	}
	ret = 0;
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

int bch2_check_snapshots(struct bch_fs *c)
{
	/*
	 * We iterate backwards as checking/fixing the depth field requires that
	 * the parent's depth already be correct:
	 */
	int ret = bch2_trans_run(c,
		for_each_btree_key_reverse_commit(trans, iter,
				BTREE_ID_snapshots, POS_MAX,
				BTREE_ITER_prefetch, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_snapshot(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}

static int check_snapshot_exists(struct btree_trans *trans, u32 id)
{
	struct bch_fs *c = trans->c;

	/* Do we need to reconstruct the snapshot_tree entry as well? */
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;
	u32 tree_id = 0;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN,
				     0, k, ret) {
		if (le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) {
			tree_id = k.k->p.offset;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		return ret;

	if (!tree_id) {
		ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
		if (ret)
			return ret;
	}

	struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot));
	ret = PTR_ERR_OR_ZERO(snapshot);
	if (ret)
		return ret;

	bkey_snapshot_init(&snapshot->k_i);
	snapshot->k.p		= POS(0, id);
	snapshot->v.tree	= cpu_to_le32(tree_id);
	snapshot->v.btime.lo	= cpu_to_le64(bch2_current_time(c));

	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
				     0, k, ret) {
		if (le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) {
			snapshot->v.subvol = cpu_to_le32(k.k->p.offset);
			SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true);
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	return  bch2_snapshot_table_make_room(c, id) ?:
		bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0);
}

/* Figure out which snapshot nodes belong in the same tree: */
struct snapshot_tree_reconstruct {
	enum btree_id			btree;
	struct bpos			cur_pos;
	snapshot_id_list		cur_ids;
	DARRAY(snapshot_id_list)	trees;
};

static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r)
{
	darray_for_each(r->trees, i)
		darray_exit(i);
	darray_exit(&r->trees);
	darray_exit(&r->cur_ids);
}

static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos)
{
	return r->btree == BTREE_ID_inodes
		? r->cur_pos.offset == pos.offset
		: r->cur_pos.inode == pos.inode;
}

static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r)
{
	return darray_find_p(*l, i, snapshot_list_has_id(r, *i)) != NULL;
}

static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s)
{
	bool first = true;
	darray_for_each(*s, i) {
		if (!first)
			prt_char(out, ' ');
		first = false;
		prt_printf(out, "%u", *i);
	}
}

static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r)
{
	if (r->cur_ids.nr) {
		darray_for_each(r->trees, i)
			if (snapshot_id_lists_have_common(i, &r->cur_ids)) {
				int ret = snapshot_list_merge(c, i, &r->cur_ids);
				if (ret)
					return ret;
				goto out;
			}
		/* darray_push() allocates, and can fail: */
		int ret = darray_push(&r->trees, r->cur_ids);
		if (ret)
			return ret;
		darray_init(&r->cur_ids);
	}
out:
	r->cur_ids.nr = 0;
	return 0;
}

static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos)
{
	if (!same_snapshot(r, pos))
		snapshot_tree_reconstruct_next(c, r);
	r->cur_pos = pos;
	return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot);
}

int bch2_reconstruct_snapshots(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct printbuf buf = PRINTBUF;
	struct snapshot_tree_reconstruct r = {};
	int ret = 0;

	for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
		if (btree_type_has_snapshots(btree)) {
			r.btree = btree;

			ret = for_each_btree_key(trans, iter, btree, POS_MIN,
					BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({
				get_snapshot_trees(c, &r, k.k->p);
			}));
			if (ret)
				goto err;

			snapshot_tree_reconstruct_next(c, &r);
		}
	}

	darray_for_each(r.trees, t) {
		printbuf_reset(&buf);
		snapshot_id_list_to_text(&buf, t);

		darray_for_each(*t, id) {
			if (fsck_err_on(bch2_snapshot_id_state(c, *id) == SNAPSHOT_ID_empty,
					trans, snapshot_node_missing,
					"snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) {
				if (t->nr > 1) {
					bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
					ret = bch_err_throw(c, fsck_repair_unimplemented);
					goto err;
				}

				ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
						check_snapshot_exists(trans, *id));
				if (ret)
					goto err;
			}
		}
	}
fsck_err:
err:
	bch2_trans_put(trans);
	snapshot_tree_reconstruct_exit(&r);
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}

int __bch2_check_key_has_snapshot(struct btree_trans *trans,
				  struct btree_iter *iter,
				  struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = 0;
	enum snapshot_id_state state = bch2_snapshot_id_state(c, k.k->p.snapshot);

	/* Snapshot was definitively deleted; this error is marked autofix */
	if (fsck_err_on(state == SNAPSHOT_ID_deleted,
			trans, bkey_in_deleted_snapshot,
			"key in deleted snapshot %s, delete?",
			(bch2_btree_id_to_text(&buf, iter->btree_id),
			 prt_char(&buf, ' '),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
		ret = bch2_btree_delete_at(trans, iter,
					   BTREE_UPDATE_internal_snapshot_node) ?: 1;

	if (state == SNAPSHOT_ID_empty) {
		/*
		 * Snapshot missing: we should have caught this with btree_lost_data and
		 * kicked off reconstruct_snapshots, so if we end up here we have no
		 * idea what happened.
		 *
		 * Do not delete unless we know that subvolumes and snapshots
		 * are consistent:
		 *
		 * XXX:
		 *
		 * We could be smarter here, and instead of using the generic
		 * recovery pass ratelimiting, track if there have been any
		 * changes to the snapshots or inodes btrees since those passes
		 * last ran.
		 */
		ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_snapshots) ?: ret;
		ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_subvols) ?: ret;

		if (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots))
			ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret;

		unsigned repair_flags = FSCK_CAN_IGNORE | (!ret ? FSCK_CAN_FIX : 0);

		if (__fsck_err(trans, repair_flags, bkey_in_missing_snapshot,
			     "key in missing snapshot %s, delete?",
			     (bch2_btree_id_to_text(&buf, iter->btree_id),
			      prt_char(&buf, ' '),
			      bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
			ret = bch2_btree_delete_at(trans, iter,
						   BTREE_UPDATE_internal_snapshot_node) ?: 1;
		}
	}
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

int __bch2_get_snapshot_overwrites(struct btree_trans *trans,
				   enum btree_id btree, struct bpos pos,
				   snapshot_id_list *s)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	for_each_btree_key_reverse_norestart(trans, iter, btree, bpos_predecessor(pos),
					     BTREE_ITER_all_snapshots, k, ret) {
		if (!bkey_eq(k.k->p, pos))
			break;

		if (!bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) ||
		    snapshot_list_has_ancestor(c, s, k.k->p.snapshot))
			continue;

		ret = snapshot_list_add(c, s, k.k->p.snapshot);
		if (ret)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);
	if (ret)
		darray_exit(s);

	return ret;
}

/*
 * Mark a snapshot as deleted, for future cleanup:
 */
int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
{
	struct btree_iter iter;
	struct bkey_i_snapshot *s =
		bch2_bkey_get_mut_typed(trans, &iter,
				    BTREE_ID_snapshots, POS(0, id),
				    0, snapshot);
	int ret = PTR_ERR_OR_ZERO(s);
	if (unlikely(ret)) {
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
					trans->c, "missing snapshot %u", id);
		return ret;
	}

	/* already marked for deletion? */
	if (BCH_SNAPSHOT_WILL_DELETE(&s->v))
		goto err;

	SET_BCH_SNAPSHOT_WILL_DELETE(&s->v, true);
	SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
	s->v.subvol = 0;
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
{
	if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1]))
		swap(s->children[0], s->children[1]);
}

static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter, p_iter = {};
	struct btree_iter c_iter = {};
	struct btree_iter tree_iter = {};
	u32 parent_id, child_id;
	unsigned i;
	int ret = 0;

	struct bkey_i_snapshot *s =
		bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
					BTREE_ITER_intent, snapshot);
	ret = PTR_ERR_OR_ZERO(s);
	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
				"missing snapshot %u", id);

	if (ret)
		goto err;

	BUG_ON(BCH_SNAPSHOT_DELETED(&s->v));
	BUG_ON(s->v.children[1]);

	parent_id = le32_to_cpu(s->v.parent);
	child_id = le32_to_cpu(s->v.children[0]);

	if (parent_id) {
		struct bkey_i_snapshot *parent;

		parent = bch2_bkey_get_mut_typed(trans, &p_iter,
				     BTREE_ID_snapshots, POS(0, parent_id),
				     0, snapshot);
		ret = PTR_ERR_OR_ZERO(parent);
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
					"missing snapshot %u", parent_id);
		if (unlikely(ret))
			goto err;

		/* find entry in parent->children for node being deleted */
		for (i = 0; i < 2; i++)
			if (le32_to_cpu(parent->v.children[i]) == id)
				break;

		if (bch2_fs_inconsistent_on(i == 2, c,
					"snapshot %u missing child pointer to %u",
					parent_id, id))
			goto err;

		parent->v.children[i] = cpu_to_le32(child_id);

		normalize_snapshot_child_pointers(&parent->v);
	}

	if (child_id) {
		struct bkey_i_snapshot *child;

		child = bch2_bkey_get_mut_typed(trans, &c_iter,
				     BTREE_ID_snapshots, POS(0, child_id),
				     0, snapshot);
		ret = PTR_ERR_OR_ZERO(child);
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
					"missing snapshot %u", child_id);
		if (unlikely(ret))
			goto err;

		child->v.parent = cpu_to_le32(parent_id);

		if (!child->v.parent) {
			child->v.skip[0] = 0;
			child->v.skip[1] = 0;
			child->v.skip[2] = 0;
		}
	}

	if (!parent_id) {
		/*
		 * We're deleting the root of a snapshot tree: update the
		 * snapshot_tree entry to point to the new root, or delete it if
		 * this is the last snapshot ID in this tree:
		 */
		struct bkey_i_snapshot_tree *s_t;

		BUG_ON(s->v.children[1]);

		s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
				BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s->v.tree)),
				0, snapshot_tree);
		ret = PTR_ERR_OR_ZERO(s_t);
		if (ret)
			goto err;

		if (s->v.children[0]) {
			s_t->v.root_snapshot = s->v.children[0];
		} else {
			s_t->k.type = KEY_TYPE_deleted;
			set_bkey_val_u64s(&s_t->k, 0);
		}
	}

	if (!bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)) {
		SET_BCH_SNAPSHOT_DELETED(&s->v, true);
		s->v.parent		= 0;
		s->v.children[0]	= 0;
		s->v.children[1]	= 0;
		s->v.subvol		= 0;
		s->v.tree		= 0;
		s->v.depth		= 0;
		s->v.skip[0]		= 0;
		s->v.skip[1]		= 0;
		s->v.skip[2]		= 0;
	} else {
		s->k.type = KEY_TYPE_deleted;
		set_bkey_val_u64s(&s->k, 0);
	}
err:
	bch2_trans_iter_exit(trans, &tree_iter);
	bch2_trans_iter_exit(trans, &p_iter);
	bch2_trans_iter_exit(trans, &c_iter);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
			  u32 *new_snapids,
			  u32 *snapshot_subvols,
			  unsigned nr_snapids)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_i_snapshot *n;
	struct bkey_s_c k;
	unsigned i, j;
	u32 depth = bch2_snapshot_depth(c, parent);
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
			     POS_MIN, BTREE_ITER_intent);
	k = bch2_btree_iter_peek(trans, &iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	for (i = 0; i < nr_snapids; i++) {
		k = bch2_btree_iter_prev_slot(trans, &iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		if (!k.k || !k.k->p.offset) {
			ret = bch_err_throw(c, ENOSPC_snapshot_create);
			goto err;
		}

		n = bch2_bkey_alloc(trans, &iter, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(n);
		if (ret)
			goto err;

		n->v.flags	= 0;
		n->v.parent	= cpu_to_le32(parent);
		n->v.subvol	= cpu_to_le32(snapshot_subvols[i]);
		n->v.tree	= cpu_to_le32(tree);
		n->v.depth	= cpu_to_le32(depth);
		n->v.btime.lo	= cpu_to_le64(bch2_current_time(c));
		n->v.btime.hi	= 0;

		for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
			n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));

		bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
		SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);

		ret = __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
					 bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
		if (ret)
			goto err;

		new_snapids[i]	= iter.pos.offset;
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Create new snapshot IDs as children of an existing snapshot ID:
 */
static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent,
			      u32 *new_snapids,
			      u32 *snapshot_subvols,
			      unsigned nr_snapids)
{
	struct btree_iter iter;
	struct bkey_i_snapshot *n_parent;
	int ret = 0;

	n_parent = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_snapshots, POS(0, parent),
			0, snapshot);
	ret = PTR_ERR_OR_ZERO(n_parent);
	if (unlikely(ret)) {
		if (bch2_err_matches(ret, ENOENT))
			bch_err(trans->c, "snapshot %u not found", parent);
		return ret;
	}

	if (n_parent->v.children[0] || n_parent->v.children[1]) {
		bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
		ret = -EINVAL;
		goto err;
	}

	ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree),
			     new_snapids, snapshot_subvols, nr_snapids);
	if (ret)
		goto err;

	n_parent->v.children[0] = cpu_to_le32(new_snapids[0]);
	n_parent->v.children[1] = cpu_to_le32(new_snapids[1]);
	n_parent->v.subvol = 0;
	SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Create a snapshot node that is the root of a new tree:
 */
static int bch2_snapshot_node_create_tree(struct btree_trans *trans,
			      u32 *new_snapids,
			      u32 *snapshot_subvols,
			      unsigned nr_snapids)
{
	struct bkey_i_snapshot_tree *n_tree;
	int ret;

	n_tree = __bch2_snapshot_tree_create(trans);
	ret =   PTR_ERR_OR_ZERO(n_tree) ?:
		create_snapids(trans, 0, n_tree->k.p.offset,
			     new_snapids, snapshot_subvols, nr_snapids);
	if (ret)
		return ret;

	n_tree->v.master_subvol	= cpu_to_le32(snapshot_subvols[0]);
	n_tree->v.root_snapshot	= cpu_to_le32(new_snapids[0]);
	return 0;
}

int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
			      u32 *new_snapids,
			      u32 *snapshot_subvols,
			      unsigned nr_snapids)
{
	BUG_ON((parent == 0) != (nr_snapids == 1));
	BUG_ON((parent != 0) != (nr_snapids == 2));

	return parent
		? bch2_snapshot_node_create_children(trans, parent,
				new_snapids, snapshot_subvols, nr_snapids)
		: bch2_snapshot_node_create_tree(trans,
				new_snapids, snapshot_subvols, nr_snapids);
}
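
/*
 * Usage sketch (hypothetical caller; the subvolume code is the real
 * user):
 *
 *	u32 snapids[2], subvols[2];
 *
 *	creating a new subvolume, i.e. a new tree with a single root
 *	snapshot ID:
 *		bch2_snapshot_node_create(trans, 0, snapids, subvols, 1);
 *
 *	snapshotting an existing subvolume: @parent gets two children,
 *	one for the existing subvolume and one for the new snapshot:
 *		bch2_snapshot_node_create(trans, parent, snapids, subvols, 2);
 */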

/*
 * If we have an unlinked inode in an internal snapshot node, and the inode
 * really has been deleted in all child snapshots, how does this get cleaned up?
 *
 * first there is the problem of how keys that have been overwritten in all
 * child snapshots get deleted (unimplemented?), but inodes may perhaps be
 * special?
 *
 * also: unlinked inode in internal snapshot appears to not be getting deleted
 * correctly if inode doesn't exist in leaf snapshots
 *
 * solution:
 *
 * for a key in an interior snapshot node that needs work to be done that
 * requires it to be mutated: iterate over all descendant leaf nodes and copy
 * that key to snapshot leaf nodes, where we can mutate it
 */

static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id)
{
	struct snapshot_interior_delete *i = darray_find_p(*l, i, i->id == id);
	return i ? i->live_child : 0;
}

static unsigned __live_child(struct snapshot_table *t, u32 id,
			     snapshot_id_list *delete_leaves,
			     interior_delete_list *delete_interior)
{
	struct snapshot_t *s = __snapshot_t(t, id);
	if (!s)
		return 0;

	for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++)
		if (s->children[i] &&
		    !snapshot_list_has_id(delete_leaves, s->children[i]) &&
		    !interior_delete_has_id(delete_interior, s->children[i]))
			return s->children[i];

	for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) {
		u32 live_child = s->children[i]
			? __live_child(t, s->children[i], delete_leaves, delete_interior)
			: 0;
		if (live_child)
			return live_child;
	}

	return 0;
}
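
/*
 * Depth first search for a surviving descendant: prefer a direct child
 * that isn't being deleted, otherwise recurse; the ID returned is where
 * keys from a dying interior node will be moved to.
 */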

static unsigned live_child(struct bch_fs *c, u32 id)
{
	struct snapshot_delete *d = &c->snapshot_delete;

	guard(rcu)();
	return __live_child(rcu_dereference(c->snapshots), id,
			    &d->delete_leaves, &d->delete_interior);
}

static bool snapshot_id_dying(struct snapshot_delete *d, unsigned id)
{
	return snapshot_list_has_id(&d->delete_leaves, id) ||
		interior_delete_has_id(&d->delete_interior, id) != 0;
}

static int delete_dead_snapshots_process_key(struct btree_trans *trans,
					     struct btree_iter *iter,
					     struct bkey_s_c k)
{
	struct snapshot_delete *d = &trans->c->snapshot_delete;

	if (snapshot_list_has_id(&d->delete_leaves, k.k->p.snapshot))
		return bch2_btree_delete_at(trans, iter,
					    BTREE_UPDATE_internal_snapshot_node);

	u32 live_child = interior_delete_has_id(&d->delete_interior, k.k->p.snapshot);
	if (live_child) {
		struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
		int ret = PTR_ERR_OR_ZERO(new);
		if (ret)
			return ret;

		new->k.p.snapshot = live_child;

		struct btree_iter dst_iter;
		struct bkey_s_c dst_k = bch2_bkey_get_iter(trans, &dst_iter,
							   iter->btree_id, new->k.p,
							   BTREE_ITER_all_snapshots|
							   BTREE_ITER_intent);
		ret = bkey_err(dst_k);
		if (ret)
			return ret;

		ret =   (bkey_deleted(dst_k.k)
			 ? bch2_trans_update(trans, &dst_iter, new,
					     BTREE_UPDATE_internal_snapshot_node)
			 : 0) ?:
			bch2_btree_delete_at(trans, iter,
					     BTREE_UPDATE_internal_snapshot_node);
		bch2_trans_iter_exit(trans, &dst_iter);
		return ret;
	}

	return 0;
}
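
/*
 * Example of the transform above, assuming interior node 3 is dying
 * with live_child == 4: a key at (inode, offset, snapshot=3) is copied
 * to (inode, offset, snapshot=4) unless 4 already has its own newer
 * version there, then the original is deleted; every surviving
 * descendant of 3 is also a descendant of 4, so visibility is
 * preserved.
 */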

static bool skip_unrelated_snapshot_tree(struct btree_trans *trans, struct btree_iter *iter, u64 *prev_inum)
{
	struct bch_fs *c = trans->c;
	struct snapshot_delete *d = &c->snapshot_delete;

	u64 inum = iter->btree_id != BTREE_ID_inodes
		? iter->pos.inode
		: iter->pos.offset;

	if (*prev_inum == inum)
		return false;

	*prev_inum = inum;

	bool ret = !snapshot_list_has_id(&d->deleting_from_trees,
					 bch2_snapshot_tree(c, iter->pos.snapshot));
	if (unlikely(ret)) {
		struct bpos pos = iter->pos;
		pos.snapshot = 0;
		if (iter->btree_id != BTREE_ID_inodes)
			pos.offset = U64_MAX;
		bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(pos));
	}

	return ret;
}

static int delete_dead_snapshot_keys_v1(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct snapshot_delete *d = &c->snapshot_delete;

	for (d->pos.btree = 0; d->pos.btree < BTREE_ID_NR; d->pos.btree++) {
		struct disk_reservation res = { 0 };
		u64 prev_inum = 0;

		d->pos.pos = POS_MIN;

		if (!btree_type_has_snapshots(d->pos.btree))
			continue;

		int ret = for_each_btree_key_commit(trans, iter,
				d->pos.btree, POS_MIN,
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				&res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
			d->pos.pos = iter.pos;

			if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
				continue;

			delete_dead_snapshots_process_key(trans, &iter, k);
		}));

		bch2_disk_reservation_put(c, &res);

		if (ret)
			return ret;
	}

	return 0;
}

static int delete_dead_snapshot_keys_range(struct btree_trans *trans, enum btree_id btree,
					   struct bpos start, struct bpos end)
{
	struct bch_fs *c = trans->c;
	struct snapshot_delete *d = &c->snapshot_delete;
	struct disk_reservation res = { 0 };

	d->pos.btree	= btree;
	d->pos.pos	= POS_MIN;

	int ret = for_each_btree_key_max_commit(trans, iter,
			btree, start, end,
			BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
			&res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
		d->pos.pos = iter.pos;
		delete_dead_snapshots_process_key(trans, &iter, k);
	}));

	bch2_disk_reservation_put(c, &res);
	return ret;
}

static int delete_dead_snapshot_keys_v2(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct snapshot_delete *d = &c->snapshot_delete;
	struct disk_reservation res = { 0 };
	u64 prev_inum = 0;
	int ret = 0;

	struct btree_iter iter;
	bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, POS_MIN,
			     BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);

	while (1) {
		struct bkey_s_c k;
		ret = lockrestart_do(trans,
				bkey_err(k = bch2_btree_iter_peek(trans, &iter)));
		if (ret)
			break;

		if (!k.k)
			break;

		d->pos.btree	= iter.btree_id;
		d->pos.pos	= iter.pos;

		if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
			continue;

		if (snapshot_id_dying(d, k.k->p.snapshot)) {
			struct bpos start	= POS(k.k->p.offset, 0);
			struct bpos end		= POS(k.k->p.offset, U64_MAX);

			ret   = delete_dead_snapshot_keys_range(trans, BTREE_ID_extents, start, end) ?:
				delete_dead_snapshot_keys_range(trans, BTREE_ID_dirents, start, end) ?:
				delete_dead_snapshot_keys_range(trans, BTREE_ID_xattrs, start, end);
			if (ret)
				break;

			bch2_btree_iter_set_pos(trans, &iter, POS(0, k.k->p.offset + 1));
		} else {
			bch2_btree_iter_advance(trans, &iter);
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		goto err;

	prev_inum = 0;
	ret = for_each_btree_key_commit(trans, iter,
			BTREE_ID_inodes, POS_MIN,
			BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
			&res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
		d->pos.btree	= iter.btree_id;
		d->pos.pos	= iter.pos;

		if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
			continue;

		delete_dead_snapshots_process_key(trans, &iter, k);
	}));
err:
	bch2_disk_reservation_put(c, &res);
	return ret;
}

/*
 * For a given snapshot, if it doesn't have a subvolume that points to it, and
 * it doesn't have child snapshot nodes - it's now redundant and we can mark it
 * as deleted.
 */
static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k)
{
	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	struct bch_fs *c = trans->c;
	struct snapshot_delete *d = &c->snapshot_delete;
	struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
	unsigned live_children = 0;
	int ret = 0;

	if (BCH_SNAPSHOT_SUBVOL(s.v))
		return 0;

	if (BCH_SNAPSHOT_DELETED(s.v))
		return 0;

	mutex_lock(&d->progress_lock);
	for (unsigned i = 0; i < 2; i++) {
		u32 child = le32_to_cpu(s.v->children[i]);

		live_children += child &&
			!snapshot_list_has_id(&d->delete_leaves, child);
	}

	u32 tree = bch2_snapshot_tree(c, s.k->p.offset);

	if (live_children == 0) {
		ret =   snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?:
			snapshot_list_add(c, &d->delete_leaves, s.k->p.offset);
	} else if (live_children == 1) {
		struct snapshot_interior_delete n = {
			.id		= s.k->p.offset,
			.live_child	= live_child(c, s.k->p.offset),
		};

		if (!n.live_child) {
			bch_err(c, "error finding live child of snapshot %u", n.id);
			ret = -EINVAL;
		} else {
			ret =   snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?:
				darray_push(&d->delete_interior, n);
		}
	}
	mutex_unlock(&d->progress_lock);

	return ret;
}

static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
						interior_delete_list *skip)
{
	guard(rcu)();
	while (interior_delete_has_id(skip, id))
		id = __bch2_snapshot_parent(c, id);

	while (n--) {
		do {
			id = __bch2_snapshot_parent(c, id);
		} while (interior_delete_has_id(skip, id));
	}

	return id;
}
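
/*
 * Example, assuming parent(2) == 4, parent(4) == 6, parent(6) == 8 and
 * nodes 4 and 6 in @skip: bch2_snapshot_nth_parent_skip(c, 2, 1, skip)
 * returns 8, i.e. ancestors being deleted are transparent to the depth
 * arithmetic below.
 */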

static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
					      struct btree_iter *iter, struct bkey_s_c k,
					      interior_delete_list *deleted)
{
	struct bch_fs *c = trans->c;
	u32 nr_deleted_ancestors = 0;
	struct bkey_i_snapshot *s;
	int ret;

	if (!bch2_snapshot_exists(c, k.k->p.offset))
		return 0;

	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	if (interior_delete_has_id(deleted, k.k->p.offset))
		return 0;

	s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot);
	ret = PTR_ERR_OR_ZERO(s);
	if (ret)
		return ret;

	darray_for_each(*deleted, i)
		nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, i->id);

	if (!nr_deleted_ancestors)
		return 0;

	le32_add_cpu(&s->v.depth, -nr_deleted_ancestors);

	if (!s->v.depth) {
		s->v.skip[0] = 0;
		s->v.skip[1] = 0;
		s->v.skip[2] = 0;
	} else {
		u32 depth = le32_to_cpu(s->v.depth);
		u32 parent = bch2_snapshot_parent(c, s->k.p.offset);

		for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) {
			u32 id = le32_to_cpu(s->v.skip[j]);

			if (interior_delete_has_id(deleted, id)) {
				id = bch2_snapshot_nth_parent_skip(c,
							parent,
							depth > 1
							? get_random_u32_below(depth - 1)
							: 0,
							deleted);
				s->v.skip[j] = cpu_to_le32(id);
			}
		}

		bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32);
	}

	return bch2_trans_update(trans, iter, &s->k_i, 0);
}

static void bch2_snapshot_delete_nodes_to_text(struct printbuf *out, struct snapshot_delete *d)
{
	prt_printf(out, "deleting from trees");
	darray_for_each(d->deleting_from_trees, i)
		prt_printf(out, " %u", *i);
	prt_newline(out);

	prt_printf(out, "deleting leaves");
	darray_for_each(d->delete_leaves, i)
		prt_printf(out, " %u", *i);
	prt_newline(out);

	prt_printf(out, "interior");
	darray_for_each(d->delete_interior, i)
		prt_printf(out, " %u->%u", i->id, i->live_child);
	prt_newline(out);
}

int __bch2_delete_dead_snapshots(struct bch_fs *c)
{
	struct snapshot_delete *d = &c->snapshot_delete;
	int ret = 0;

	if (!mutex_trylock(&d->lock))
		return 0;

	if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
		goto out_unlock;

	struct btree_trans *trans = bch2_trans_get(c);

	/*
	 * For every snapshot node: If we have no live children and it's not
	 * pointed to by a subvolume, delete it:
	 */
	d->running = true;
	d->pos = BBPOS_MIN;

	ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k,
		check_should_delete_snapshot(trans, k));
	if (!bch2_err_matches(ret, EROFS))
		bch_err_msg(c, ret, "walking snapshots");
	if (ret)
		goto err;

	if (!d->delete_leaves.nr && !d->delete_interior.nr)
		goto err;

	{
		struct printbuf buf = PRINTBUF;
		bch2_snapshot_delete_nodes_to_text(&buf, d);

		ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf));
		printbuf_exit(&buf);
		if (ret)
			goto err;
	}

	ret = !bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)
		? delete_dead_snapshot_keys_v2(trans)
		: delete_dead_snapshot_keys_v1(trans);
	if (!bch2_err_matches(ret, EROFS))
		bch_err_msg(c, ret, "deleting keys from dying snapshots");
	if (ret)
		goto err;

	darray_for_each(d->delete_leaves, i) {
		ret = commit_do(trans, NULL, NULL, 0,
			bch2_snapshot_node_delete(trans, *i));
		if (!bch2_err_matches(ret, EROFS))
			bch_err_msg(c, ret, "deleting snapshot %u", *i);
		if (ret)
			goto err;
	}

	/*
	 * Fixing children of deleted snapshots can't be done completely
	 * atomically; if we crash between here and when we delete the interior
	 * nodes, some depth fields will be off:
	 */
	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
				  BTREE_ITER_intent, k,
				  NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &d->delete_interior));
	if (ret)
		goto err;

	darray_for_each(d->delete_interior, i) {
		ret = commit_do(trans, NULL, NULL, 0,
			bch2_snapshot_node_delete(trans, i->id));
		if (!bch2_err_matches(ret, EROFS))
			bch_err_msg(c, ret, "deleting snapshot %u", i->id);
		if (ret)
			goto err;
	}
err:
	mutex_lock(&d->progress_lock);
	darray_exit(&d->deleting_from_trees);
	darray_exit(&d->delete_interior);
	darray_exit(&d->delete_leaves);
	d->running = false;
	mutex_unlock(&d->progress_lock);
	bch2_trans_put(trans);

	bch2_recovery_pass_set_no_ratelimit(c, BCH_RECOVERY_PASS_check_snapshots);
out_unlock:
	mutex_unlock(&d->lock);
	if (!bch2_err_matches(ret, EROFS))
		bch_err_fn(c, ret);
	return ret;
}

int bch2_delete_dead_snapshots(struct bch_fs *c)
{
	if (!c->opts.auto_snapshot_deletion)
		return 0;

	return __bch2_delete_dead_snapshots(c);
}

void bch2_delete_dead_snapshots_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete.work);

	set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name);

	bch2_delete_dead_snapshots(c);
	enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots);
}

void bch2_delete_dead_snapshots_async(struct bch_fs *c)
{
	if (!c->opts.auto_snapshot_deletion)
		return;

	if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_delete_dead_snapshots))
		return;

	BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));

	if (!queue_work(system_long_wq, &c->snapshot_delete.work))
		enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots);
}

void bch2_snapshot_delete_status_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct snapshot_delete *d = &c->snapshot_delete;

	if (!d->running) {
		prt_str(out, "(not running)");
		return;
	}

	mutex_lock(&d->progress_lock);
	bch2_snapshot_delete_nodes_to_text(out, d);

	bch2_bbpos_to_text(out, d->pos);
	mutex_unlock(&d->progress_lock);
}

int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
				       enum btree_id id,
				       struct bpos pos)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	for_each_btree_key_reverse_norestart(trans, iter, id, bpos_predecessor(pos),
					     BTREE_ITER_not_extents|
					     BTREE_ITER_all_snapshots,
					     k, ret) {
		if (!bkey_eq(pos, k.k->p))
			break;

		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
			ret = 1;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}

static bool interior_snapshot_needs_delete(struct bkey_s_c_snapshot snap)
{
	/* If there's one child, it's redundant and keys will be moved to the child */
	return !!snap.v->children[0] + !!snap.v->children[1] == 1;
}

static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
{
	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k);
	if (BCH_SNAPSHOT_WILL_DELETE(snap.v) ||
	    interior_snapshot_needs_delete(snap))
		set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags);

	return 0;
}

int bch2_snapshots_read(struct bch_fs *c)
{
	/*
	 * Initializing the is_ancestor bitmaps requires ancestors to already be
	 * initialized - so mark in reverse:
	 */
	int ret = bch2_trans_run(c,
		for_each_btree_key_reverse(trans, iter, BTREE_ID_snapshots,
				   POS_MAX, 0, k,
			__bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
			bch2_check_snapshot_needs_deletion(trans, k)));
	bch_err_fn(c, ret);

	/*
	 * It's important that we check if we need to reconstruct snapshots
	 * before going RW, so we mark that pass as required in the superblock -
	 * otherwise, we could end up deleting keys whose snapshot nodes are
	 * merely missing instead of reconstructing them.
	 */
	BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) &&
	       test_bit(BCH_FS_may_go_rw, &c->flags));

	return ret;
}

void bch2_fs_snapshots_exit(struct bch_fs *c)
{
	kvfree(rcu_dereference_protected(c->snapshots, true));
}

void bch2_fs_snapshots_init_early(struct bch_fs *c)
{
	INIT_WORK(&c->snapshot_delete.work, bch2_delete_dead_snapshots_work);
	mutex_init(&c->snapshot_delete.lock);
	mutex_init(&c->snapshot_delete.progress_lock);
	mutex_init(&c->snapshots_unlinked_lock);
}