xref: /linux/fs/bcachefs/snapshot.c (revision e814f3fd16acfb7f9966773953de8f740a1e3202)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_cache.h"
6 #include "btree_key_cache.h"
7 #include "btree_update.h"
8 #include "buckets.h"
9 #include "errcode.h"
10 #include "error.h"
11 #include "fs.h"
12 #include "recovery_passes.h"
13 #include "snapshot.h"
14 
15 #include <linux/random.h>
16 
17 /*
18  * Snapshot trees:
19  *
20  * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they
21  * exist to provide a stable identifier for the whole lifetime of a snapshot
22  * tree.
23  */
24 
25 void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
26 				struct bkey_s_c k)
27 {
28 	struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k);
29 
30 	prt_printf(out, "subvol %u root snapshot %u",
31 		   le32_to_cpu(t.v->master_subvol),
32 		   le32_to_cpu(t.v->root_snapshot));
33 }
34 
/*
 * Validate a snapshot_tree key: snapshot tree ids live at inode 0,
 * offsets 1..U32_MAX.  bkey_fsck_err_on() jumps to the fsck_err label
 * (setting ret) when the condition holds.
 */
int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k,
				struct bkey_validate_context from)
{
	int ret = 0;

	/* valid positions are POS(0, 1) through POS(0, U32_MAX) */
	bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
			 bkey_lt(k.k->p, POS(0, 1)),
			 c, snapshot_tree_pos_bad,
			 "bad pos");
fsck_err:
	return ret;
}
47 
48 int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
49 			      struct bch_snapshot_tree *s)
50 {
51 	int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
52 					  BTREE_ITER_with_updates, snapshot_tree, s);
53 
54 	if (bch2_err_matches(ret, ENOENT))
55 		ret = -BCH_ERR_ENOENT_snapshot_tree;
56 	return ret;
57 }
58 
/*
 * Allocate a new snapshot_tree key in an empty slot, searching down from
 * U32_MAX.  Returns the new bkey (allocated in the transaction's update
 * buffer) or an ERR_PTR; the generic "no slot" error is remapped to the
 * snapshot-tree-specific one.
 */
struct bkey_i_snapshot_tree *
__bch2_snapshot_tree_create(struct btree_trans *trans)
{
	struct btree_iter iter;
	int ret = bch2_bkey_get_empty_slot(trans, &iter,
			BTREE_ID_snapshot_trees, POS(0, U32_MAX));
	struct bkey_i_snapshot_tree *s_t;

	if (ret == -BCH_ERR_ENOSPC_btree_slot)
		ret = -BCH_ERR_ENOSPC_snapshot_tree;
	if (ret)
		return ERR_PTR(ret);

	/* the update is queued in the transaction; the iter can be dropped */
	s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree);
	ret = PTR_ERR_OR_ZERO(s_t);
	bch2_trans_iter_exit(trans, &iter);
	return ret ? ERR_PTR(ret) : s_t;
}
77 
78 static int bch2_snapshot_tree_create(struct btree_trans *trans,
79 				u32 root_id, u32 subvol_id, u32 *tree_id)
80 {
81 	struct bkey_i_snapshot_tree *n_tree =
82 		__bch2_snapshot_tree_create(trans);
83 
84 	if (IS_ERR(n_tree))
85 		return PTR_ERR(n_tree);
86 
87 	n_tree->v.master_subvol	= cpu_to_le32(subvol_id);
88 	n_tree->v.root_snapshot	= cpu_to_le32(root_id);
89 	*tree_id = n_tree->k.p.offset;
90 	return 0;
91 }
92 
93 /* Snapshot nodes: */
94 
95 static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor)
96 {
97 	while (id && id < ancestor) {
98 		const struct snapshot_t *s = __snapshot_t(t, id);
99 		id = s ? s->parent : 0;
100 	}
101 	return id == ancestor;
102 }
103 
104 static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
105 {
106 	rcu_read_lock();
107 	bool ret = __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
108 	rcu_read_unlock();
109 
110 	return ret;
111 }
112 
113 static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
114 {
115 	const struct snapshot_t *s = __snapshot_t(t, id);
116 	if (!s)
117 		return 0;
118 
119 	if (s->skip[2] <= ancestor)
120 		return s->skip[2];
121 	if (s->skip[1] <= ancestor)
122 		return s->skip[1];
123 	if (s->skip[0] <= ancestor)
124 		return s->skip[0];
125 	return s->parent;
126 }
127 
128 static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor)
129 {
130 	const struct snapshot_t *s = __snapshot_t(t, id);
131 	if (!s)
132 		return false;
133 
134 	return test_bit(ancestor - id - 1, s->is_ancestor);
135 }
136 
/*
 * Is @ancestor an ancestor of @id?
 *
 * Fast path: jump towards @ancestor via skiplist pointers, then finish with
 * the per-node ancestor bitmap.  Before check_snapshots has run, the
 * skiplists/bitmaps can't be trusted, so fall back to walking parents.
 */
bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
{
	bool ret;

	rcu_read_lock();
	struct snapshot_table *t = rcu_dereference(c->snapshots);

	if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) {
		ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor);
		goto out;
	}

	/* skip ahead until @ancestor is within bitmap range of @id */
	while (id && id < ancestor - IS_ANCESTOR_BITMAP)
		id = get_ancestor_below(t, id, ancestor);

	ret = id && id < ancestor
		? test_ancestor_bitmap(t, id, ancestor)
		: id == ancestor;

	/* debug builds: cross-check fast path against the slow walk */
	EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
out:
	rcu_read_unlock();

	return ret;
}
162 
/*
 * Slow path for snapshot_t_mut(): grow c->snapshots so it has a slot for @id.
 *
 * The table is indexed from the top down (idx = U32_MAX - id), since snapshot
 * ids are allocated downwards from U32_MAX.  Caller holds
 * c->snapshot_table_lock; concurrent readers are protected by RCU.
 * Returns NULL on allocation failure.
 */
static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
{
	size_t idx = U32_MAX - id;
	struct snapshot_table *new, *old;

	/* round up to what the allocator will give us, and use all of it */
	size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1));
	size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]);

	if (unlikely(new_bytes > INT_MAX))
		return NULL;

	new = kvzalloc(new_bytes, GFP_KERNEL);
	if (!new)
		return NULL;

	new->nr = new_size;

	old = rcu_dereference_protected(c->snapshots, true);
	if (old)
		memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr);

	/* publish the new table, then free the old after a grace period */
	rcu_assign_pointer(c->snapshots, new);
	kvfree_rcu(old, rcu);

	return &rcu_dereference_protected(c->snapshots,
				lockdep_is_held(&c->snapshot_table_lock))->s[idx];
}
190 
/*
 * Return a mutable in-memory snapshot_t for @id, growing the table if
 * necessary.  Caller must hold c->snapshot_table_lock.  Returns NULL on
 * allocation failure.
 */
static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
{
	size_t idx = U32_MAX - id;
	struct snapshot_table *table =
		rcu_dereference_protected(c->snapshots,
				lockdep_is_held(&c->snapshot_table_lock));

	lockdep_assert_held(&c->snapshot_table_lock);

	if (likely(table && idx < table->nr))
		return &table->s[idx];

	return __snapshot_t_mut(c, id);
}
205 
/* Print a snapshot key's fields for debugging/fsck output. */
void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
			   struct bkey_s_c k)
{
	struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);

	prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u",
	       BCH_SNAPSHOT_SUBVOL(s.v),
	       BCH_SNAPSHOT_DELETED(s.v),
	       le32_to_cpu(s.v->parent),
	       le32_to_cpu(s.v->children[0]),
	       le32_to_cpu(s.v->children[1]),
	       le32_to_cpu(s.v->subvol),
	       le32_to_cpu(s.v->tree));

	/* depth/skiplist fields were added later; only print if present */
	if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth))
		prt_printf(out, " depth %u skiplist %u %u %u",
			   le32_to_cpu(s.v->depth),
			   le32_to_cpu(s.v->skip[0]),
			   le32_to_cpu(s.v->skip[1]),
			   le32_to_cpu(s.v->skip[2]));
}
227 
/*
 * Validate a snapshot key's internal invariants: position range, parent id
 * greater than own id (ids are allocated downwards from the root), children
 * normalized (descending, no duplicates) with ids smaller than our own, and
 * skiplist entries normalized and no closer than the direct parent.
 * bkey_fsck_err_on() jumps to fsck_err, setting ret.
 */
int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k,
			   struct bkey_validate_context from)
{
	struct bkey_s_c_snapshot s;
	u32 i, id;
	int ret = 0;

	bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
			 bkey_lt(k.k->p, POS(0, 1)),
			 c, snapshot_pos_bad,
			 "bad pos");

	s = bkey_s_c_to_snapshot(k);

	/* parents are always closer to the root, i.e. numerically larger */
	id = le32_to_cpu(s.v->parent);
	bkey_fsck_err_on(id && id <= k.k->p.offset,
			 c, snapshot_parent_bad,
			 "bad parent node (%u <= %llu)",
			 id, k.k->p.offset);

	/* children are kept in descending order */
	bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]),
			 c, snapshot_children_not_normalized,
			 "children not normalized");

	bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1],
			 c, snapshot_child_duplicate,
			 "duplicate child nodes");

	for (i = 0; i < 2; i++) {
		id = le32_to_cpu(s.v->children[i]);

		bkey_fsck_err_on(id >= k.k->p.offset,
				 c, snapshot_child_bad,
				 "bad child node (%u >= %llu)",
				 id, k.k->p.offset);
	}

	/* skiplist fields only exist in newer, larger snapshot values */
	if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
		bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
				 le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]),
				 c, snapshot_skiplist_not_normalized,
				 "skiplist not normalized");

		for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
			id = le32_to_cpu(s.v->skip[i]);

			/* a skiplist entry must be an ancestor, so >= parent */
			bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent),
					 c, snapshot_skiplist_bad,
					 "bad skiplist node %u", id);
		}
	}
fsck_err:
	return ret;
}
282 
/*
 * Update the in-memory snapshot table to match a snapshot key being written
 * (or a snapshot key being deleted, for other key types).  Also sets the
 * is_ancestor bitmap for nearby ancestors and kicks off dead-snapshot
 * cleanup when a deleted node is seen.
 */
static int __bch2_mark_snapshot(struct btree_trans *trans,
		       enum btree_id btree, unsigned level,
		       struct bkey_s_c old, struct bkey_s_c new,
		       enum btree_iter_update_trigger_flags flags)
{
	struct bch_fs *c = trans->c;
	struct snapshot_t *t;
	u32 id = new.k->p.offset;
	int ret = 0;

	mutex_lock(&c->snapshot_table_lock);

	t = snapshot_t_mut(c, id);
	if (!t) {
		ret = -BCH_ERR_ENOMEM_mark_snapshot;
		goto err;
	}

	if (new.k->type == KEY_TYPE_snapshot) {
		struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);

		t->live		= true;
		t->parent	= le32_to_cpu(s.v->parent);
		t->children[0]	= le32_to_cpu(s.v->children[0]);
		t->children[1]	= le32_to_cpu(s.v->children[1]);
		t->subvol	= BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
		t->tree		= le32_to_cpu(s.v->tree);

		/* depth/skiplist only present in newer, larger values */
		if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) {
			t->depth	= le32_to_cpu(s.v->depth);
			t->skip[0]	= le32_to_cpu(s.v->skip[0]);
			t->skip[1]	= le32_to_cpu(s.v->skip[1]);
			t->skip[2]	= le32_to_cpu(s.v->skip[2]);
		} else {
			t->depth	= 0;
			t->skip[0]	= 0;
			t->skip[1]	= 0;
			t->skip[2]	= 0;
		}

		/* record ancestors within bitmap range in is_ancestor */
		u32 parent = id;

		while ((parent = bch2_snapshot_parent_early(c, parent)) &&
		       parent - id - 1 < IS_ANCESTOR_BITMAP)
			__set_bit(parent - id - 1, t->is_ancestor);

		if (BCH_SNAPSHOT_DELETED(s.v)) {
			set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
			if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots)
				bch2_delete_dead_snapshots_async(c);
		}
	} else {
		/* key deleted/overwritten: clear the table entry */
		memset(t, 0, sizeof(*t));
	}
err:
	mutex_unlock(&c->snapshot_table_lock);
	return ret;
}
341 
/* Trigger for KEY_TYPE_snapshot: keeps the in-memory snapshot table in sync. */
int bch2_mark_snapshot(struct btree_trans *trans,
		       enum btree_id btree, unsigned level,
		       struct bkey_s_c old, struct bkey_s new,
		       enum btree_iter_update_trigger_flags flags)
{
	return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags);
}
349 
/* Read the snapshot key for @id into @s (also sees uncommitted updates). */
int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
			 struct bch_snapshot *s)
{
	return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
				       BTREE_ITER_with_updates, snapshot, s);
}
356 
357 /* fsck: */
358 
/* Child id @child (0 or 1) of @id from the in-memory table; caller holds rcu. */
static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
{
	return snapshot_t(c, id)->children[child];
}
363 
/* First (left) child of @id, or 0 if none. */
static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id)
{
	return bch2_snapshot_child(c, id, 0);
}
368 
/* Second (right) child of @id, or 0 if none. */
static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id)
{
	return bch2_snapshot_child(c, id, 1);
}
373 
374 static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
375 {
376 	u32 n, parent;
377 
378 	n = bch2_snapshot_left_child(c, id);
379 	if (n)
380 		return n;
381 
382 	while ((parent = bch2_snapshot_parent(c, id))) {
383 		n = bch2_snapshot_right_child(c, parent);
384 		if (n && n != id)
385 			return n;
386 		id = parent;
387 	}
388 
389 	return 0;
390 }
391 
392 static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
393 {
394 	u32 id = snapshot_root;
395 	u32 subvol = 0, s;
396 
397 	rcu_read_lock();
398 	while (id) {
399 		s = snapshot_t(c, id)->subvol;
400 
401 		if (s && (!subvol || s < subvol))
402 			subvol = s;
403 
404 		id = bch2_snapshot_tree_next(c, id);
405 	}
406 	rcu_read_unlock();
407 
408 	return subvol;
409 }
410 
/*
 * Find a master (non-snapshot) subvolume belonging to the tree rooted at
 * @snapshot_root, returning its id via @subvol_id.  If none exists, pick the
 * oldest subvolume in the tree and clear its snapshot flag so it becomes the
 * master.
 */
static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
					    u32 snapshot_root, u32 *subvol_id)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	bool found = false;
	int ret;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
				     0, k, ret) {
		if (k.k->type != KEY_TYPE_subvolume)
			continue;

		/* only consider subvolumes whose snapshot is in this tree */
		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
		if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
			continue;
		if (!BCH_SUBVOLUME_SNAP(s.v)) {
			*subvol_id = s.k->p.offset;
			found = true;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	/* no non-snapshot subvolume: promote the oldest one to master */
	if (!ret && !found) {
		struct bkey_i_subvolume *u;

		*subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root);

		u = bch2_bkey_get_mut_typed(trans, &iter,
					    BTREE_ID_subvolumes, POS(0, *subvol_id),
					    0, subvolume);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			return ret;

		SET_BCH_SUBVOLUME_SNAP(&u->v, false);
	}

	return ret;
}
453 
/*
 * fsck one snapshot_tree key: verify it points at the root of a snapshot
 * tree that points back at it (else delete it), and that its master_subvol
 * references a valid, non-snapshot subvolume within the tree (else repair
 * it via bch2_snapshot_tree_master_subvol()).
 */
static int check_snapshot_tree(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c_snapshot_tree st;
	struct bch_snapshot s;
	struct bch_subvolume subvol;
	struct printbuf buf = PRINTBUF;
	struct btree_iter snapshot_iter = {};
	u32 root_id;
	int ret;

	if (k.k->type != KEY_TYPE_snapshot_tree)
		return 0;

	st = bkey_s_c_to_snapshot_tree(k);
	root_id = le32_to_cpu(st.v->root_snapshot);

	struct bkey_s_c_snapshot snapshot_k =
		bch2_bkey_get_iter_typed(trans, &snapshot_iter, BTREE_ID_snapshots,
					 POS(0, root_id), 0, snapshot);
	ret = bkey_err(snapshot_k);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	if (!ret)
		bkey_val_copy(&s, snapshot_k);

	/* missing root snapshot, non-root id, or back-pointer mismatch: delete */
	if (fsck_err_on(ret ||
			root_id != bch2_snapshot_root(c, root_id) ||
			st.k->p.offset != le32_to_cpu(s.tree),
			trans, snapshot_tree_to_missing_snapshot,
			"snapshot tree points to missing/incorrect snapshot:\n  %s",
			(bch2_bkey_val_to_text(&buf, c, st.s_c),
			 prt_newline(&buf),
			 ret
			 ? prt_printf(&buf, "(%s)", bch2_err_str(ret))
			 : bch2_bkey_val_to_text(&buf, c, snapshot_k.s_c),
			 buf.buf))) {
		ret = bch2_btree_delete_at(trans, iter, 0);
		goto err;
	}

	if (!st.v->master_subvol)
		goto out;

	ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), false, &subvol);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	/* master_subvol missing, outside this tree, or a snapshot: repair */
	if (fsck_err_on(ret,
			trans, snapshot_tree_to_missing_subvol,
			"snapshot tree points to missing subvolume:\n  %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
	    fsck_err_on(!bch2_snapshot_is_ancestor(c,
						le32_to_cpu(subvol.snapshot),
						root_id),
			trans, snapshot_tree_to_wrong_subvol,
			"snapshot tree points to subvolume that does not point to snapshot in this tree:\n  %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
	    fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol),
			trans, snapshot_tree_to_snapshot_subvol,
			"snapshot tree points to snapshot subvolume:\n  %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
		struct bkey_i_snapshot_tree *u;
		u32 subvol_id;

		ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
		bch_err_fn(c, ret);

		if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */
			ret = 0;
			goto err;
		}

		if (ret)
			goto err;

		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.master_subvol = cpu_to_le32(subvol_id);
		st = snapshot_tree_i_to_s_c(u);
	}
out:
err:
fsck_err:
	bch2_trans_iter_exit(trans, &snapshot_iter);
	printbuf_exit(&buf);
	return ret;
}
551 
552 /*
553  * For each snapshot_tree, make sure it points to the root of a snapshot tree
554  * and that snapshot entry points back to it, or delete it.
555  *
556  * And, make sure it points to a subvolume within that snapshot tree, or correct
557  * it to point to the oldest subvolume within that snapshot tree.
558  */
/* Run check_snapshot_tree() on every snapshot_tree key, committing as we go. */
int bch2_check_snapshot_trees(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
			BTREE_ID_snapshot_trees, POS_MIN,
			BTREE_ITER_prefetch, k,
			NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		check_snapshot_tree(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}
570 
571 /*
572  * Look up snapshot tree for @tree_id and find root,
573  * make sure @snap_id is a descendent:
574  */
575 static int snapshot_tree_ptr_good(struct btree_trans *trans,
576 				  u32 snap_id, u32 tree_id)
577 {
578 	struct bch_snapshot_tree s_t;
579 	int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
580 
581 	if (bch2_err_matches(ret, ENOENT))
582 		return 0;
583 	if (ret)
584 		return ret;
585 
586 	return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot));
587 }
588 
/*
 * Pick a random ancestor of @id for use as a skiplist entry: the nth parent
 * with n uniform in [0, depth).  Returns @id unchanged if it has no parent,
 * and 0 for id == 0.
 */
u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id)
{
	const struct snapshot_t *s;

	if (!id)
		return 0;

	rcu_read_lock();
	s = snapshot_t(c, id);
	if (s->parent)
		id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth));
	rcu_read_unlock();

	return id;
}
604 
605 static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s)
606 {
607 	unsigned i;
608 
609 	for (i = 0; i < 3; i++)
610 		if (!s.parent) {
611 			if (s.skip[i])
612 				return false;
613 		} else {
614 			if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i])))
615 				return false;
616 		}
617 
618 	return true;
619 }
620 
621 /*
622  * snapshot_tree pointer was incorrect: look up root snapshot node, make sure
623  * its snapshot_tree pointer is correct (allocate new one if necessary), then
624  * update this node's pointer to root node's pointer:
625  */
/*
 * Repair a bad snapshot->tree pointer: look up this node's root snapshot,
 * fix the root's own tree pointer first (creating a new snapshot_tree entry
 * if needed), then copy the root's tree id into this node.  Updates *s with
 * the new value of whichever key matches k.
 */
static int snapshot_tree_ptr_repair(struct btree_trans *trans,
				    struct btree_iter *iter,
				    struct bkey_s_c k,
				    struct bch_snapshot *s)
{
	struct bch_fs *c = trans->c;
	struct btree_iter root_iter;
	struct bch_snapshot_tree s_t;
	struct bkey_s_c_snapshot root;
	struct bkey_i_snapshot *u;
	u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id;
	int ret;

	root = bch2_bkey_get_iter_typed(trans, &root_iter,
			       BTREE_ID_snapshots, POS(0, root_id),
			       BTREE_ITER_with_updates, snapshot);
	ret = bkey_err(root);
	if (ret)
		goto err;

	tree_id = le32_to_cpu(root.v->tree);

	ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	/* root's tree pointer is itself bad: create a fresh snapshot_tree */
	if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) {
		u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
		ret =   PTR_ERR_OR_ZERO(u) ?:
			bch2_snapshot_tree_create(trans, root_id,
				bch2_snapshot_tree_oldest_subvol(c, root_id),
				&tree_id);
		if (ret)
			goto err;

		u->v.tree = cpu_to_le32(tree_id);
		if (k.k->p.offset == root_id)
			*s = u->v;
	}

	/* non-root node: copy the (now correct) tree id from the root */
	if (k.k->p.offset != root_id) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.tree = cpu_to_le32(tree_id);
		*s = u->v;
	}
err:
	bch2_trans_iter_exit(trans, &root_iter);
	return ret;
}
679 
/*
 * fsck one snapshot key: verify parent/child pointers are mutually
 * consistent, the subvol pointer matches the SUBVOL/DELETED flags, and the
 * tree pointer, depth and skiplist fields are correct (repairing the last
 * three where possible).  Iterated in reverse so parents are checked before
 * children (depth repair depends on the parent's depth being correct).
 */
static int check_snapshot(struct btree_trans *trans,
			  struct btree_iter *iter,
			  struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bch_snapshot s;
	struct bch_subvolume subvol;
	struct bch_snapshot v;
	struct bkey_i_snapshot *u;
	u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
	u32 real_depth;
	struct printbuf buf = PRINTBUF;
	u32 i, id;
	int ret = 0;

	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	/* copy the value, zero-padding fields older keys don't have */
	memset(&s, 0, sizeof(s));
	memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k)));

	/* parent must exist and list us as one of its children */
	id = le32_to_cpu(s.parent);
	if (id) {
		ret = bch2_snapshot_lookup(trans, id, &v);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot with nonexistent parent:\n  %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		if (ret)
			goto err;

		if (le32_to_cpu(v.children[0]) != k.k->p.offset &&
		    le32_to_cpu(v.children[1]) != k.k->p.offset) {
			bch_err(c, "snapshot parent %u missing pointer to child %llu",
				id, k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	}

	/* children must exist and point back at us as their parent */
	for (i = 0; i < 2 && s.children[i]; i++) {
		id = le32_to_cpu(s.children[i]);

		ret = bch2_snapshot_lookup(trans, id, &v);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot node %llu has nonexistent child %u",
				k.k->p.offset, id);
		if (ret)
			goto err;

		if (le32_to_cpu(v.parent) != k.k->p.offset) {
			bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
				id, le32_to_cpu(v.parent), k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	}

	bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
		!BCH_SNAPSHOT_DELETED(&s);

	if (should_have_subvol) {
		id = le32_to_cpu(s.subvol);
		ret = bch2_subvolume_get(trans, id, false, &subvol);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot points to nonexistent subvolume:\n  %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		if (ret)
			goto err;

		if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) {
			bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
				k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	} else {
		/* deleted/non-subvol nodes must not point at a subvolume */
		if (fsck_err_on(s.subvol,
				trans, snapshot_should_not_have_subvol,
				"snapshot should not point to subvol:\n  %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
			u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
			ret = PTR_ERR_OR_ZERO(u);
			if (ret)
				goto err;

			u->v.subvol = 0;
			s = u->v;
		}
	}

	/* tree pointer: > 0 good, 0 bad (repairable), < 0 error */
	ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree));
	if (ret < 0)
		goto err;

	if (fsck_err_on(!ret,
			trans, snapshot_to_bad_snapshot_tree,
			"snapshot points to missing/incorrect tree:\n  %s",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
		if (ret)
			goto err;
	}
	ret = 0;

	/* depth must be parent's depth + 1 (parent already checked) */
	real_depth = bch2_snapshot_depth(c, parent_id);

	if (fsck_err_on(le32_to_cpu(s.depth) != real_depth,
			trans, snapshot_bad_depth,
			"snapshot with incorrect depth field, should be %u:\n  %s",
			real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.depth = cpu_to_le32(real_depth);
		s = u->v;
	}

	ret = snapshot_skiplist_good(trans, k.k->p.offset, s);
	if (ret < 0)
		goto err;

	/* rebuild the skiplist from random ancestors if it was bad */
	if (fsck_err_on(!ret,
			trans, snapshot_bad_skiplist,
			"snapshot with bad skiplist field:\n  %s",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		for (i = 0; i < ARRAY_SIZE(u->v.skip); i++)
			u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id));

		bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32);
		s = u->v;
	}
	ret = 0;
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}
824 
/* Run check_snapshot() on every snapshot key, committing as we go. */
int bch2_check_snapshots(struct bch_fs *c)
{
	/*
	 * We iterate backwards as checking/fixing the depth field requires that
	 * the parent's depth already be correct:
	 */
	int ret = bch2_trans_run(c,
		for_each_btree_key_reverse_commit(trans, iter,
				BTREE_ID_snapshots, POS_MAX,
				BTREE_ITER_prefetch, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_snapshot(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}
840 
/*
 * Recreate a missing snapshot node @id: find (or create) the snapshot_tree
 * entry whose root is @id, link any subvolume that points at @id, then
 * insert the new key and update the in-memory table.
 */
static int check_snapshot_exists(struct btree_trans *trans, u32 id)
{
	struct bch_fs *c = trans->c;

	if (bch2_snapshot_exists(c, id))
		return 0;

	/* Do we need to reconstruct the snapshot_tree entry as well? */
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;
	u32 tree_id = 0;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN,
				     0, k, ret) {
		if (le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) {
			tree_id = k.k->p.offset;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		return ret;

	if (!tree_id) {
		ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
		if (ret)
			return ret;
	}

	struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot));
	ret = PTR_ERR_OR_ZERO(snapshot);
	if (ret)
		return ret;

	bkey_snapshot_init(&snapshot->k_i);
	snapshot->k.p		= POS(0, id);
	snapshot->v.tree	= cpu_to_le32(tree_id);
	snapshot->v.btime.lo	= cpu_to_le64(bch2_current_time(c));

	/* link the first subvolume found that points at this snapshot */
	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
				     0, k, ret) {
		if (le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) {
			snapshot->v.subvol = cpu_to_le32(k.k->p.offset);
			SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true);
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	return  bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?:
		bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
				   bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0);
}
896 
/* Figure out which snapshot nodes belong in the same tree: */
struct snapshot_tree_reconstruct {
	enum btree_id			btree;		/* btree currently being scanned */
	struct bpos			cur_pos;	/* position of the current key run */
	snapshot_id_list		cur_ids;	/* snapshot ids seen at cur_pos */
	DARRAY(snapshot_id_list)	trees;		/* disjoint sets of related snapshot ids */
};
904 
905 static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r)
906 {
907 	darray_for_each(r->trees, i)
908 		darray_exit(i);
909 	darray_exit(&r->trees);
910 	darray_exit(&r->cur_ids);
911 }
912 
913 static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos)
914 {
915 	return r->btree == BTREE_ID_inodes
916 		? r->cur_pos.offset == pos.offset
917 		: r->cur_pos.inode == pos.inode;
918 }
919 
920 static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r)
921 {
922 	darray_for_each(*l, i)
923 		if (snapshot_list_has_id(r, *i))
924 			return true;
925 	return false;
926 }
927 
928 static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s)
929 {
930 	bool first = true;
931 	darray_for_each(*s, i) {
932 		if (!first)
933 			prt_char(out, ' ');
934 		first = false;
935 		prt_printf(out, "%u", *i);
936 	}
937 }
938 
939 static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r)
940 {
941 	if (r->cur_ids.nr) {
942 		darray_for_each(r->trees, i)
943 			if (snapshot_id_lists_have_common(i, &r->cur_ids)) {
944 				int ret = snapshot_list_merge(c, i, &r->cur_ids);
945 				if (ret)
946 					return ret;
947 				goto out;
948 			}
949 		darray_push(&r->trees, r->cur_ids);
950 		darray_init(&r->cur_ids);
951 	}
952 out:
953 	r->cur_ids.nr = 0;
954 	return 0;
955 }
956 
957 static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos)
958 {
959 	if (!same_snapshot(r, pos))
960 		snapshot_tree_reconstruct_next(c, r);
961 	r->cur_pos = pos;
962 	return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot);
963 }
964 
/*
 * Scan every snapshots-capable btree, group the snapshot ids seen into
 * disjoint candidate trees, then recreate any snapshot node that is
 * referenced by keys but missing from the snapshots btree.  Only
 * single-node trees can currently be reconstructed.
 */
int bch2_reconstruct_snapshots(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct printbuf buf = PRINTBUF;
	struct snapshot_tree_reconstruct r = {};
	int ret = 0;

	for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
		if (btree_type_has_snapshots(btree)) {
			r.btree = btree;

			ret = for_each_btree_key(trans, iter, btree, POS_MIN,
					BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({
				get_snapshot_trees(c, &r, k.k->p);
			}));
			if (ret)
				goto err;

			/*
			 * NOTE(review): return value ignored here — an -ENOMEM
			 * from the merge path would be dropped; confirm intent.
			 */
			snapshot_tree_reconstruct_next(c, &r);
		}
	}

	darray_for_each(r.trees, t) {
		printbuf_reset(&buf);
		snapshot_id_list_to_text(&buf, t);

		darray_for_each(*t, id) {
			if (fsck_err_on(!bch2_snapshot_exists(c, *id),
					trans, snapshot_node_missing,
					"snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) {
				if (t->nr > 1) {
					bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
					ret = -BCH_ERR_fsck_repair_unimplemented;
					goto err;
				}

				ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
						check_snapshot_exists(trans, *id));
				if (ret)
					goto err;
			}
		}
	}
fsck_err:
err:
	bch2_trans_put(trans);
	snapshot_tree_reconstruct_exit(&r);
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}
1016 
/*
 * Verify @k's snapshot field points at an existing snapshot node; if not,
 * prompt to delete the key.  Returns 1 if the key was deleted, 0 if it was
 * fine, negative error code on failure.
 */
int bch2_check_key_has_snapshot(struct btree_trans *trans,
				struct btree_iter *iter,
				struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (fsck_err_on(!bch2_snapshot_exists(c, k.k->p.snapshot),
			trans, bkey_in_missing_snapshot,
			"key in missing snapshot %s, delete?",
			(bch2_btree_id_to_text(&buf, iter->btree_id),
			 prt_char(&buf, ' '),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
		ret = bch2_btree_delete_at(trans, iter,
					    BTREE_UPDATE_internal_snapshot_node) ?: 1;
fsck_err:
	printbuf_exit(&buf);
	return ret;
}
1037 
/*
 * Mark a snapshot as deleted, for future cleanup:
 *
 * Clears the SUBVOL flag and subvol pointer as well, since a deleted node no
 * longer belongs to any subvolume.  A no-op if already marked deleted.
 */
int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
{
	struct btree_iter iter;
	struct bkey_i_snapshot *s =
		bch2_bkey_get_mut_typed(trans, &iter,
				    BTREE_ID_snapshots, POS(0, id),
				    0, snapshot);
	int ret = PTR_ERR_OR_ZERO(s);
	if (unlikely(ret)) {
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
					trans->c, "missing snapshot %u", id);
		return ret;
	}

	/* already deleted? */
	if (BCH_SNAPSHOT_DELETED(&s->v))
		goto err;

	SET_BCH_SNAPSHOT_DELETED(&s->v, true);
	SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
	s->v.subvol = 0;
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
1066 
/*
 * Keep child pointers in normalized (descending) order, matching the
 * snapshot_children_not_normalized check in bch2_snapshot_validate().
 */
static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
{
	if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1]))
		swap(s->children[0], s->children[1]);
}
1072 
/*
 * Delete the snapshot node @id, stitching the snapshot tree back together
 * around it:
 *
 *  - the parent's child pointer to @id is replaced with @id's sole child (or
 *    cleared, if @id has none),
 *  - the child's parent pointer is redirected to @id's parent,
 *  - if @id was the root of its tree, the snapshot_tree entry is updated to
 *    point at the new root, or deleted if @id was the last node in the tree.
 *
 * @id must have at most one child (BUG otherwise).
 */
static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
	struct btree_iter c_iter = (struct btree_iter) { NULL };
	struct btree_iter tree_iter = (struct btree_iter) { NULL };
	struct bkey_s_c_snapshot s;
	u32 parent_id, child_id;
	unsigned i;
	int ret = 0;

	s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
				     BTREE_ITER_intent, snapshot);
	ret = bkey_err(s);
	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
				"missing snapshot %u", id);

	if (ret)
		goto err;

	/* Only nodes with at most one child may be deleted: */
	BUG_ON(s.v->children[1]);

	parent_id = le32_to_cpu(s.v->parent);
	child_id = le32_to_cpu(s.v->children[0]);

	if (parent_id) {
		struct bkey_i_snapshot *parent;

		parent = bch2_bkey_get_mut_typed(trans, &p_iter,
				     BTREE_ID_snapshots, POS(0, parent_id),
				     0, snapshot);
		ret = PTR_ERR_OR_ZERO(parent);
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
					"missing snapshot %u", parent_id);
		if (unlikely(ret))
			goto err;

		/* find entry in parent->children for node being deleted */
		for (i = 0; i < 2; i++)
			if (le32_to_cpu(parent->v.children[i]) == id)
				break;

		if (bch2_fs_inconsistent_on(i == 2, c,
					"snapshot %u missing child pointer to %u",
					parent_id, id))
			goto err;

		/* Splice our sole child (possibly zero) into our slot: */
		parent->v.children[i] = cpu_to_le32(child_id);

		normalize_snapshot_child_pointers(&parent->v);
	}

	if (child_id) {
		struct bkey_i_snapshot *child;

		child = bch2_bkey_get_mut_typed(trans, &c_iter,
				     BTREE_ID_snapshots, POS(0, child_id),
				     0, snapshot);
		ret = PTR_ERR_OR_ZERO(child);
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
					"missing snapshot %u", child_id);
		if (unlikely(ret))
			goto err;

		child->v.parent = cpu_to_le32(parent_id);

		/* Child is now a root node: clear its skiplist pointers */
		if (!child->v.parent) {
			child->v.skip[0] = 0;
			child->v.skip[1] = 0;
			child->v.skip[2] = 0;
		}
	}

	if (!parent_id) {
		/*
		 * We're deleting the root of a snapshot tree: update the
		 * snapshot_tree entry to point to the new root, or delete it if
		 * this is the last snapshot ID in this tree:
		 */
		struct bkey_i_snapshot_tree *s_t;

		BUG_ON(s.v->children[1]);

		s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
				BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)),
				0, snapshot_tree);
		ret = PTR_ERR_OR_ZERO(s_t);
		if (ret)
			goto err;

		if (s.v->children[0]) {
			s_t->v.root_snapshot = s.v->children[0];
		} else {
			/* Tree is now empty: delete the snapshot_tree key */
			s_t->k.type = KEY_TYPE_deleted;
			set_bkey_val_u64s(&s_t->k, 0);
		}
	}

	ret = bch2_btree_delete_at(trans, &iter, 0);
err:
	bch2_trans_iter_exit(trans, &tree_iter);
	bch2_trans_iter_exit(trans, &p_iter);
	bch2_trans_iter_exit(trans, &c_iter);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
1179 
/*
 * Allocate @nr_snapids new snapshot nodes, all children of @parent (0 for new
 * root nodes) in snapshot tree @tree; the new IDs are returned in
 * @new_snapids[], and each node's subvol field is set from the corresponding
 * @snapshot_subvols[] entry.
 *
 * New IDs are taken from empty slots below the lowest existing snapshot key,
 * walking downward; returns -BCH_ERR_ENOSPC_snapshot_create if we run out
 * (offset 0 is not a usable snapshot ID).
 */
static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
			  u32 *new_snapids,
			  u32 *snapshot_subvols,
			  unsigned nr_snapids)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_i_snapshot *n;
	struct bkey_s_c k;
	unsigned i, j;
	u32 depth = bch2_snapshot_depth(c, parent);
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
			     POS_MIN, BTREE_ITER_intent);
	k = bch2_btree_iter_peek(&iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	for (i = 0; i < nr_snapids; i++) {
		/* Step back to the next empty slot: */
		k = bch2_btree_iter_prev_slot(&iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		if (!k.k || !k.k->p.offset) {
			ret = -BCH_ERR_ENOSPC_snapshot_create;
			goto err;
		}

		n = bch2_bkey_alloc(trans, &iter, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(n);
		if (ret)
			goto err;

		n->v.flags	= 0;
		n->v.parent	= cpu_to_le32(parent);
		n->v.subvol	= cpu_to_le32(snapshot_subvols[i]);
		n->v.tree	= cpu_to_le32(tree);
		n->v.depth	= cpu_to_le32(depth);
		n->v.btime.lo	= cpu_to_le64(bch2_current_time(c));
		n->v.btime.hi	= 0;

		for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
			n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));

		/* Skiplist entries are kept sorted (little-endian compare): */
		bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
		SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);

		/* Make the new node visible in the in-memory snapshot table: */
		ret = __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
					 bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
		if (ret)
			goto err;

		new_snapids[i]	= iter.pos.offset;
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
1241 
1242 /*
1243  * Create new snapshot IDs as children of an existing snapshot ID:
1244  */
1245 static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent,
1246 			      u32 *new_snapids,
1247 			      u32 *snapshot_subvols,
1248 			      unsigned nr_snapids)
1249 {
1250 	struct btree_iter iter;
1251 	struct bkey_i_snapshot *n_parent;
1252 	int ret = 0;
1253 
1254 	n_parent = bch2_bkey_get_mut_typed(trans, &iter,
1255 			BTREE_ID_snapshots, POS(0, parent),
1256 			0, snapshot);
1257 	ret = PTR_ERR_OR_ZERO(n_parent);
1258 	if (unlikely(ret)) {
1259 		if (bch2_err_matches(ret, ENOENT))
1260 			bch_err(trans->c, "snapshot %u not found", parent);
1261 		return ret;
1262 	}
1263 
1264 	if (n_parent->v.children[0] || n_parent->v.children[1]) {
1265 		bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
1266 		ret = -EINVAL;
1267 		goto err;
1268 	}
1269 
1270 	ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree),
1271 			     new_snapids, snapshot_subvols, nr_snapids);
1272 	if (ret)
1273 		goto err;
1274 
1275 	n_parent->v.children[0] = cpu_to_le32(new_snapids[0]);
1276 	n_parent->v.children[1] = cpu_to_le32(new_snapids[1]);
1277 	n_parent->v.subvol = 0;
1278 	SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false);
1279 err:
1280 	bch2_trans_iter_exit(trans, &iter);
1281 	return ret;
1282 }
1283 
1284 /*
1285  * Create a snapshot node that is the root of a new tree:
1286  */
1287 static int bch2_snapshot_node_create_tree(struct btree_trans *trans,
1288 			      u32 *new_snapids,
1289 			      u32 *snapshot_subvols,
1290 			      unsigned nr_snapids)
1291 {
1292 	struct bkey_i_snapshot_tree *n_tree;
1293 	int ret;
1294 
1295 	n_tree = __bch2_snapshot_tree_create(trans);
1296 	ret =   PTR_ERR_OR_ZERO(n_tree) ?:
1297 		create_snapids(trans, 0, n_tree->k.p.offset,
1298 			     new_snapids, snapshot_subvols, nr_snapids);
1299 	if (ret)
1300 		return ret;
1301 
1302 	n_tree->v.master_subvol	= cpu_to_le32(snapshot_subvols[0]);
1303 	n_tree->v.root_snapshot	= cpu_to_le32(new_snapids[0]);
1304 	return 0;
1305 }
1306 
1307 int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
1308 			      u32 *new_snapids,
1309 			      u32 *snapshot_subvols,
1310 			      unsigned nr_snapids)
1311 {
1312 	BUG_ON((parent == 0) != (nr_snapids == 1));
1313 	BUG_ON((parent != 0) != (nr_snapids == 2));
1314 
1315 	return parent
1316 		? bch2_snapshot_node_create_children(trans, parent,
1317 				new_snapids, snapshot_subvols, nr_snapids)
1318 		: bch2_snapshot_node_create_tree(trans,
1319 				new_snapids, snapshot_subvols, nr_snapids);
1320 
1321 }
1322 
1323 /*
1324  * If we have an unlinked inode in an internal snapshot node, and the inode
1325  * really has been deleted in all child snapshots, how does this get cleaned up?
1326  *
1327  * first there is the problem of how keys that have been overwritten in all
1328  * child snapshots get deleted (unimplemented?), but inodes may perhaps be
1329  * special?
1330  *
1331  * also: unlinked inode in internal snapshot appears to not be getting deleted
1332  * correctly if inode doesn't exist in leaf snapshots
1333  *
1334  * solution:
1335  *
1336  * for a key in an interior snapshot node that needs work to be done that
 * requires it to be mutated: iterate over all descendant leaf nodes and copy
1338  * that key to snapshot leaf nodes, where we can mutate it
1339  */
1340 
/*
 * An interior snapshot node scheduled for deletion: keys belonging to @id are
 * moved to @live_child, its single surviving descendant.
 */
struct snapshot_interior_delete {
	u32	id;		/* snapshot node to be deleted */
	u32	live_child;	/* where its keys get moved to */
};
typedef DARRAY(struct snapshot_interior_delete) interior_delete_list;
1346 
1347 static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id)
1348 {
1349 	darray_for_each(*l, i)
1350 		if (i->id == id)
1351 			return i->live_child;
1352 	return 0;
1353 }
1354 
/*
 * Search the descendants of @id for a snapshot that will still be live after
 * the pending deletion, i.e. one in neither @delete_leaves nor
 * @delete_interior. A direct child is preferred; otherwise we recurse.
 *
 * Returns 0 if no live descendant exists. Caller must hold rcu_read_lock()
 * (the snapshot table is RCU protected - see live_child()).
 */
static unsigned __live_child(struct snapshot_table *t, u32 id,
			     snapshot_id_list *delete_leaves,
			     interior_delete_list *delete_interior)
{
	struct snapshot_t *s = __snapshot_t(t, id);
	if (!s)
		return 0;

	/* First pass: is any direct child surviving? */
	for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++)
		if (s->children[i] &&
		    !snapshot_list_has_id(delete_leaves, s->children[i]) &&
		    !interior_delete_has_id(delete_interior, s->children[i]))
			return s->children[i];

	/* Second pass: recurse into each child's subtree: */
	for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) {
		u32 live_child = s->children[i]
			? __live_child(t, s->children[i], delete_leaves, delete_interior)
			: 0;
		if (live_child)
			return live_child;
	}

	return 0;
}
1379 
1380 static unsigned live_child(struct bch_fs *c, u32 id,
1381 			   snapshot_id_list *delete_leaves,
1382 			   interior_delete_list *delete_interior)
1383 {
1384 	rcu_read_lock();
1385 	u32 ret = __live_child(rcu_dereference(c->snapshots), id,
1386 			       delete_leaves, delete_interior);
1387 	rcu_read_unlock();
1388 	return ret;
1389 }
1390 
/*
 * Process one key while deleting dead snapshots:
 *
 *  - a key in a snapshot being fully deleted (@delete_leaves) is simply
 *    deleted;
 *  - a key in an interior snapshot being deleted (@delete_interior) is moved
 *    to that node's live child - unless the child already has its own version
 *    of the key, in which case the old key is just deleted.
 */
static int delete_dead_snapshots_process_key(struct btree_trans *trans,
					     struct btree_iter *iter,
					     struct bkey_s_c k,
					     snapshot_id_list *delete_leaves,
					     interior_delete_list *delete_interior)
{
	if (snapshot_list_has_id(delete_leaves, k.k->p.snapshot))
		return bch2_btree_delete_at(trans, iter,
					    BTREE_UPDATE_internal_snapshot_node);

	u32 live_child = interior_delete_has_id(delete_interior, k.k->p.snapshot);
	if (live_child) {
		struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
		int ret = PTR_ERR_OR_ZERO(new);
		if (ret)
			return ret;

		/* Rewrite the key into the live child snapshot: */
		new->k.p.snapshot = live_child;

		struct btree_iter dst_iter;
		struct bkey_s_c dst_k = bch2_bkey_get_iter(trans, &dst_iter,
							   iter->btree_id, new->k.p,
							   BTREE_ITER_all_snapshots|
							   BTREE_ITER_intent);
		ret = bkey_err(dst_k);
		if (ret)
			return ret;

		/*
		 * Only insert at the destination if nothing is there already -
		 * the child's version of the key, if it exists, takes
		 * precedence; either way, delete the original:
		 */
		ret =   (bkey_deleted(dst_k.k)
			 ? bch2_trans_update(trans, &dst_iter, new,
					     BTREE_UPDATE_internal_snapshot_node)
			 : 0) ?:
			bch2_btree_delete_at(trans, iter,
					     BTREE_UPDATE_internal_snapshot_node);
		bch2_trans_iter_exit(trans, &dst_iter);
		return ret;
	}

	return 0;
}
1431 
1432 /*
1433  * For a given snapshot, if it doesn't have a subvolume that points to it, and
1434  * it doesn't have child snapshot nodes - it's now redundant and we can mark it
1435  * as deleted.
1436  */
1437 static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k,
1438 					snapshot_id_list *delete_leaves,
1439 					interior_delete_list *delete_interior)
1440 {
1441 	if (k.k->type != KEY_TYPE_snapshot)
1442 		return 0;
1443 
1444 	struct bch_fs *c = trans->c;
1445 	struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
1446 	unsigned live_children = 0;
1447 
1448 	if (BCH_SNAPSHOT_SUBVOL(s.v))
1449 		return 0;
1450 
1451 	for (unsigned i = 0; i < 2; i++) {
1452 		u32 child = le32_to_cpu(s.v->children[i]);
1453 
1454 		live_children += child &&
1455 			!snapshot_list_has_id(delete_leaves, child);
1456 	}
1457 
1458 	if (live_children == 0) {
1459 		return snapshot_list_add(c, delete_leaves, s.k->p.offset);
1460 	} else if (live_children == 1) {
1461 		struct snapshot_interior_delete d = {
1462 			.id		= s.k->p.offset,
1463 			.live_child	= live_child(c, s.k->p.offset, delete_leaves, delete_interior),
1464 		};
1465 
1466 		if (!d.live_child) {
1467 			bch_err(c, "error finding live child of snapshot %u", d.id);
1468 			return -EINVAL;
1469 		}
1470 
1471 		return darray_push(delete_interior, d);
1472 	} else {
1473 		return 0;
1474 	}
1475 }
1476 
/*
 * Return the @n'th ancestor of @id, skipping over nodes in @skip (interior
 * nodes scheduled for deletion). If @id itself is in @skip, we first walk up
 * to its nearest surviving ancestor before counting.
 */
static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
						interior_delete_list *skip)
{
	rcu_read_lock();
	while (interior_delete_has_id(skip, id))
		id = __bch2_snapshot_parent(c, id);

	while (n--) {
		do {
			id = __bch2_snapshot_parent(c, id);
		} while (interior_delete_has_id(skip, id));
	}
	rcu_read_unlock();

	return id;
}
1493 
/*
 * Before interior snapshot nodes in @deleted are removed, fix up each
 * surviving node that descends from them: decrement its depth by the number
 * of deleted ancestors, and repoint any skiplist entry that references a
 * deleted node at a randomly chosen surviving ancestor.
 */
static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
					      struct btree_iter *iter, struct bkey_s_c k,
					      interior_delete_list *deleted)
{
	struct bch_fs *c = trans->c;
	u32 nr_deleted_ancestors = 0;
	struct bkey_i_snapshot *s;
	int ret;

	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	/* Nodes being deleted themselves need no fixup: */
	if (interior_delete_has_id(deleted, k.k->p.offset))
		return 0;

	s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot);
	ret = PTR_ERR_OR_ZERO(s);
	if (ret)
		return ret;

	darray_for_each(*deleted, i)
		nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, i->id);

	/* No deleted ancestors - nothing to update: */
	if (!nr_deleted_ancestors)
		return 0;

	le32_add_cpu(&s->v.depth, -nr_deleted_ancestors);

	if (!s->v.depth) {
		/* We're now a root node: no ancestors to skip to */
		s->v.skip[0] = 0;
		s->v.skip[1] = 0;
		s->v.skip[2] = 0;
	} else {
		u32 depth = le32_to_cpu(s->v.depth);
		u32 parent = bch2_snapshot_parent(c, s->k.p.offset);

		for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) {
			u32 id = le32_to_cpu(s->v.skip[j]);

			/* Replace skip entries that point at deleted nodes: */
			if (interior_delete_has_id(deleted, id)) {
				id = bch2_snapshot_nth_parent_skip(c,
							parent,
							depth > 1
							? get_random_u32_below(depth - 1)
							: 0,
							deleted);
				s->v.skip[j] = cpu_to_le32(id);
			}
		}

		bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32);
	}

	return bch2_trans_update(trans, iter, &s->k_i, 0);
}
1549 
/*
 * Main entry point for deleting dead snapshots:
 *
 *  1. Scan the snapshots btree, collecting leaf nodes to delete outright and
 *     interior nodes whose keys must first be moved to their one live child.
 *  2. Log what we're about to do in the journal.
 *  3. For every btree with snapshots, delete/move keys in dying snapshots.
 *  4. Delete the leaf nodes, fix up depth/skiplists of surviving nodes, then
 *     delete the interior nodes.
 */
int bch2_delete_dead_snapshots(struct bch_fs *c)
{
	if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
		return 0;

	struct btree_trans *trans = bch2_trans_get(c);
	snapshot_id_list delete_leaves = {};
	interior_delete_list delete_interior = {};
	int ret = 0;

	/*
	 * For every snapshot node: If we have no live children and it's not
	 * pointed to by a subvolume, delete it:
	 */
	ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k,
		check_should_delete_snapshot(trans, k, &delete_leaves, &delete_interior));
	if (!bch2_err_matches(ret, EROFS))
		bch_err_msg(c, ret, "walking snapshots");
	if (ret)
		goto err;

	if (!delete_leaves.nr && !delete_interior.nr)
		goto err;

	{
		/* Log the deletion plan to the journal before acting on it: */
		struct printbuf buf = PRINTBUF;
		prt_printf(&buf, "deleting leaves");
		darray_for_each(delete_leaves, i)
			prt_printf(&buf, " %u", *i);

		prt_printf(&buf, " interior");
		darray_for_each(delete_interior, i)
			prt_printf(&buf, " %u->%u", i->id, i->live_child);

		ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf));
		printbuf_exit(&buf);
		if (ret)
			goto err;
	}

	/* Delete or move keys belonging to dying snapshots, one btree at a time: */
	for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
		struct disk_reservation res = { 0 };

		if (!btree_type_has_snapshots(btree))
			continue;

		ret = for_each_btree_key_commit(trans, iter,
				btree, POS_MIN,
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				&res, NULL, BCH_TRANS_COMMIT_no_enospc,
			delete_dead_snapshots_process_key(trans, &iter, k,
							  &delete_leaves,
							  &delete_interior));

		bch2_disk_reservation_put(c, &res);

		if (!bch2_err_matches(ret, EROFS))
			bch_err_msg(c, ret, "deleting keys from dying snapshots");
		if (ret)
			goto err;
	}

	darray_for_each(delete_leaves, i) {
		ret = commit_do(trans, NULL, NULL, 0,
			bch2_snapshot_node_delete(trans, *i));
		if (!bch2_err_matches(ret, EROFS))
			bch_err_msg(c, ret, "deleting snapshot %u", *i);
		if (ret)
			goto err;
	}

	/*
	 * Fixing children of deleted snapshots can't be done completely
	 * atomically, if we crash between here and when we delete the interior
	 * nodes some depth fields will be off:
	 */
	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
				  BTREE_ITER_intent, k,
				  NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &delete_interior));
	if (ret)
		goto err;

	darray_for_each(delete_interior, i) {
		ret = commit_do(trans, NULL, NULL, 0,
			bch2_snapshot_node_delete(trans, i->id));
		if (!bch2_err_matches(ret, EROFS))
			bch_err_msg(c, ret, "deleting snapshot %u", i->id);
		if (ret)
			goto err;
	}
err:
	darray_exit(&delete_interior);
	darray_exit(&delete_leaves);
	bch2_trans_put(trans);
	if (!bch2_err_matches(ret, EROFS))
		bch_err_fn(c, ret);
	return ret;
}
1649 
1650 void bch2_delete_dead_snapshots_work(struct work_struct *work)
1651 {
1652 	struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
1653 
1654 	set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name);
1655 
1656 	bch2_delete_dead_snapshots(c);
1657 	bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
1658 }
1659 
/*
 * Queue dead snapshot deletion to run asynchronously: takes a write ref that
 * the work item (or this function, if queueing fails) drops.
 */
void bch2_delete_dead_snapshots_async(struct bch_fs *c)
{
	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots))
		return;

	/* Must not be queued before the filesystem is allowed to go RW: */
	BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));

	/* queue_work() returning false means the work was already queued: */
	if (!queue_work(c->write_ref_wq, &c->snapshot_delete_work))
		bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
}
1670 
/*
 * Check whether a key exists at the same position as @pos in a snapshot that
 * is an ancestor of @pos.snapshot.
 *
 * Returns 1 if such a key exists, 0 if not, or a negative error code
 * (@ret is initialized by the iteration macro).
 */
int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
				       enum btree_id id,
				       struct bpos pos)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	for_each_btree_key_reverse_norestart(trans, iter, id, bpos_predecessor(pos),
					     BTREE_ITER_not_extents|
					     BTREE_ITER_all_snapshots,
					     k, ret) {
		/* bkey_eq() compares position ignoring the snapshot field: */
		if (!bkey_eq(pos, k.k->p))
			break;

		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
			ret = 1;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}
1696 
1697 static bool interior_snapshot_needs_delete(struct bkey_s_c_snapshot snap)
1698 {
1699 	/* If there's one child, it's redundant and keys will be moved to the child */
1700 	return !!snap.v->children[0] + !!snap.v->children[1] == 1;
1701 }
1702 
/*
 * Called while reading snapshots at startup: if a node is flagged deleted, or
 * is a redundant interior node, flag the filesystem so that
 * bch2_delete_dead_snapshots() will run.
 */
static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
{
	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k);
	if (BCH_SNAPSHOT_DELETED(snap.v) ||
	    interior_snapshot_needs_delete(snap))
		set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags);

	return 0;
}
1715 
/*
 * Load the snapshots btree into the in-memory snapshot table at startup, and
 * determine whether snapshot deletion or reconstruction needs to run.
 */
int bch2_snapshots_read(struct bch_fs *c)
{
	/*
	 * Initializing the is_ancestor bitmaps requires ancestors to already be
	 * initialized - so mark in reverse:
	 */
	int ret = bch2_trans_run(c,
		for_each_btree_key_reverse(trans, iter, BTREE_ID_snapshots,
				   POS_MAX, 0, k,
			__bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
			bch2_check_snapshot_needs_deletion(trans, k)));
	bch_err_fn(c, ret);

	/*
	 * It's important that we check if we need to reconstruct snapshots
	 * before going RW, so we mark that pass as required in the superblock -
	 * otherwise, we could end up deleting keys with missing snapshot nodes
	 * instead
	 */
	BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) &&
	       test_bit(BCH_FS_may_go_rw, &c->flags));

	if (bch2_err_matches(ret, EIO) ||
	    (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)))
		ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots);

	return ret;
}
1744 
/*
 * Free the in-memory snapshot table. rcu_dereference_protected() with an
 * unconditionally-true check: at shutdown no concurrent RCU readers should
 * remain.
 */
void bch2_fs_snapshots_exit(struct bch_fs *c)
{
	kvfree(rcu_dereference_protected(c->snapshots, true));
}
1749