xref: /linux/fs/bcachefs/snapshot.c (revision 031fba65fc202abf1f193e321be7a2c274fd88ba)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_key_cache.h"
6 #include "btree_update.h"
7 #include "buckets.h"
8 #include "errcode.h"
9 #include "error.h"
10 #include "fs.h"
11 #include "snapshot.h"
12 
13 #include <linux/random.h>
14 
15 /*
16  * Snapshot trees:
17  *
18  * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they
19  * exist to provide a stable identifier for the whole lifetime of a snapshot
20  * tree.
21  */
22 
23 void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
24 				struct bkey_s_c k)
25 {
26 	struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k);
27 
28 	prt_printf(out, "subvol %u root snapshot %u",
29 		   le32_to_cpu(t.v->master_subvol),
30 		   le32_to_cpu(t.v->root_snapshot));
31 }
32 
33 int bch2_snapshot_tree_invalid(const struct bch_fs *c, struct bkey_s_c k,
34 			       enum bkey_invalid_flags flags,
35 			       struct printbuf *err)
36 {
37 	if (bkey_gt(k.k->p, POS(0, U32_MAX)) ||
38 	    bkey_lt(k.k->p, POS(0, 1))) {
39 		prt_printf(err, "bad pos");
40 		return -BCH_ERR_invalid_bkey;
41 	}
42 
43 	return 0;
44 }
45 
46 int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
47 			      struct bch_snapshot_tree *s)
48 {
49 	int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
50 					  BTREE_ITER_WITH_UPDATES, snapshot_tree, s);
51 
52 	if (bch2_err_matches(ret, ENOENT))
53 		ret = -BCH_ERR_ENOENT_snapshot_tree;
54 	return ret;
55 }
56 
/*
 * Allocate a new snapshot_tree key in an empty slot (searching down from
 * U32_MAX) and return the value for the caller to fill in, or an ERR_PTR.
 */
struct bkey_i_snapshot_tree *
__bch2_snapshot_tree_create(struct btree_trans *trans)
{
	struct btree_iter iter;
	int ret = bch2_bkey_get_empty_slot(trans, &iter,
			BTREE_ID_snapshot_trees, POS(0, U32_MAX));
	struct bkey_i_snapshot_tree *s_t;

	/* remap the generic "no free slot" error to the snapshot_tree one */
	if (ret == -BCH_ERR_ENOSPC_btree_slot)
		ret = -BCH_ERR_ENOSPC_snapshot_tree;
	if (ret)
		return ERR_PTR(ret);

	s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree);
	ret = PTR_ERR_OR_ZERO(s_t);
	/* the allocated update lives in the transaction; the iter can go */
	bch2_trans_iter_exit(trans, &iter);
	return ret ? ERR_PTR(ret) : s_t;
}
75 
76 static int bch2_snapshot_tree_create(struct btree_trans *trans,
77 				u32 root_id, u32 subvol_id, u32 *tree_id)
78 {
79 	struct bkey_i_snapshot_tree *n_tree =
80 		__bch2_snapshot_tree_create(trans);
81 
82 	if (IS_ERR(n_tree))
83 		return PTR_ERR(n_tree);
84 
85 	n_tree->v.master_subvol	= cpu_to_le32(subvol_id);
86 	n_tree->v.root_snapshot	= cpu_to_le32(root_id);
87 	*tree_id = n_tree->k.p.offset;
88 	return 0;
89 }
90 
91 /* Snapshot nodes: */
92 
93 static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
94 {
95 	struct snapshot_table *t;
96 
97 	rcu_read_lock();
98 	t = rcu_dereference(c->snapshots);
99 
100 	while (id && id < ancestor)
101 		id = __snapshot_t(t, id)->parent;
102 	rcu_read_unlock();
103 
104 	return id == ancestor;
105 }
106 
107 static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
108 {
109 	const struct snapshot_t *s = __snapshot_t(t, id);
110 
111 	if (s->skip[2] <= ancestor)
112 		return s->skip[2];
113 	if (s->skip[1] <= ancestor)
114 		return s->skip[1];
115 	if (s->skip[0] <= ancestor)
116 		return s->skip[0];
117 	return s->parent;
118 }
119 
/*
 * Is @ancestor an ancestor of @id?  Fast path: skiplist pointers get us
 * within IS_ANCESTOR_BITMAP ids of @ancestor, then a single bitmap test
 * finishes the job.  Only valid after the check_snapshots recovery pass
 * (hence the EBUG_ON).
 */
bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
{
	struct snapshot_table *t;
	bool ret;

	EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots);

	rcu_read_lock();
	t = rcu_dereference(c->snapshots);

	/* climb via skiplist until we're within bitmap range of @ancestor */
	while (id && id < ancestor - IS_ANCESTOR_BITMAP)
		id = get_ancestor_below(t, id, ancestor);

	if (id && id < ancestor) {
		/* is_ancestor bit n covers ancestor id + n + 1 */
		ret = test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor);

		/* debug cross-check against the slow parent-pointer walk */
		EBUG_ON(ret != bch2_snapshot_is_ancestor_early(c, id, ancestor));
	} else {
		ret = id == ancestor;
	}

	rcu_read_unlock();

	return ret;
}
145 
146 static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
147 {
148 	size_t idx = U32_MAX - id;
149 	size_t new_size;
150 	struct snapshot_table *new, *old;
151 
152 	new_size = max(16UL, roundup_pow_of_two(idx + 1));
153 
154 	new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL);
155 	if (!new)
156 		return NULL;
157 
158 	old = rcu_dereference_protected(c->snapshots, true);
159 	if (old)
160 		memcpy(new->s,
161 		       rcu_dereference_protected(c->snapshots, true)->s,
162 		       sizeof(new->s[0]) * c->snapshot_table_size);
163 
164 	rcu_assign_pointer(c->snapshots, new);
165 	c->snapshot_table_size = new_size;
166 	kvfree_rcu_mightsleep(old);
167 
168 	return &rcu_dereference_protected(c->snapshots, true)->s[idx];
169 }
170 
/*
 * Return a mutable pointer to the in-memory snapshot table entry for @id,
 * growing the table if needed.  Caller must hold snapshot_table_lock.
 * Returns NULL on allocation failure.
 */
static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
{
	size_t idx = U32_MAX - id;	/* table is indexed from U32_MAX down */

	lockdep_assert_held(&c->snapshot_table_lock);

	if (likely(idx < c->snapshot_table_size))
		return &rcu_dereference_protected(c->snapshots, true)->s[idx];

	return __snapshot_t_mut(c, id);
}
182 
/* Print a snapshot key; depth/skiplist only if present in the value */
void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
			   struct bkey_s_c k)
{
	struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);

	prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u",
	       BCH_SNAPSHOT_SUBVOL(s.v),
	       BCH_SNAPSHOT_DELETED(s.v),
	       le32_to_cpu(s.v->parent),
	       le32_to_cpu(s.v->children[0]),
	       le32_to_cpu(s.v->children[1]),
	       le32_to_cpu(s.v->subvol),
	       le32_to_cpu(s.v->tree));

	/* smaller (presumably older-format) values lack the depth/skip fields */
	if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth))
		prt_printf(out, " depth %u skiplist %u %u %u",
			   le32_to_cpu(s.v->depth),
			   le32_to_cpu(s.v->skip[0]),
			   le32_to_cpu(s.v->skip[1]),
			   le32_to_cpu(s.v->skip[2]));
}
204 
/* Validate a snapshot key's position and internal invariants */
int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k,
			  enum bkey_invalid_flags flags,
			  struct printbuf *err)
{
	struct bkey_s_c_snapshot s;
	u32 i, id;

	/* snapshot keys live at inode 0, offsets 1..U32_MAX */
	if (bkey_gt(k.k->p, POS(0, U32_MAX)) ||
	    bkey_lt(k.k->p, POS(0, 1))) {
		prt_printf(err, "bad pos");
		return -BCH_ERR_invalid_bkey;
	}

	s = bkey_s_c_to_snapshot(k);

	/* parent ids are always strictly greater than their children's */
	id = le32_to_cpu(s.v->parent);
	if (id && id <= k.k->p.offset) {
		prt_printf(err, "bad parent node (%u <= %llu)",
		       id, k.k->p.offset);
		return -BCH_ERR_invalid_bkey;
	}

	/* child pointers must be sorted descending */
	if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) {
		prt_printf(err, "children not normalized");
		return -BCH_ERR_invalid_bkey;
	}

	/* equality is endian-invariant, so raw __le32 compare is fine here */
	if (s.v->children[0] &&
	    s.v->children[0] == s.v->children[1]) {
		prt_printf(err, "duplicate child nodes");
		return -BCH_ERR_invalid_bkey;
	}

	/* child ids must be strictly less than this node's id */
	for (i = 0; i < 2; i++) {
		id = le32_to_cpu(s.v->children[i]);

		if (id >= k.k->p.offset) {
			prt_printf(err, "bad child node (%u >= %llu)",
			       id, k.k->p.offset);
			return -BCH_ERR_invalid_bkey;
		}
	}

	/* skiplist fields are only present in larger (newer-format) values */
	if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
		/* skiplist entries must be sorted ascending */
		if (le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
		    le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2])) {
			prt_printf(err, "skiplist not normalized");
			return -BCH_ERR_invalid_bkey;
		}

		/* skiplist entries point at ancestors: nonzero implies a parent,
		 * and must be greater than this node's id */
		for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
			id = le32_to_cpu(s.v->skip[i]);

			if ((id && !s.v->parent) ||
			    (id && id <= k.k->p.offset)) {
				prt_printf(err, "bad skiplist node %u", id);
				return -BCH_ERR_invalid_bkey;
			}
		}
	}

	return 0;
}
268 
269 static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
270 {
271 	struct snapshot_t *t = snapshot_t_mut(c, id);
272 	u32 parent = id;
273 
274 	while ((parent = bch2_snapshot_parent_early(c, parent)) &&
275 	       parent - id - 1 < IS_ANCESTOR_BITMAP)
276 		__set_bit(parent - id - 1, t->is_ancestor);
277 }
278 
/* Lock-taking wrapper around __set_is_ancestor_bitmap() */
static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
{
	mutex_lock(&c->snapshot_table_lock);
	__set_is_ancestor_bitmap(c, id);
	mutex_unlock(&c->snapshot_table_lock);
}
285 
/*
 * Trigger for snapshot keys: mirrors the new key into the in-memory snapshot
 * table (c->snapshots) and, for deleted snapshots, schedules the
 * delete_dead_snapshots recovery pass.
 */
int bch2_mark_snapshot(struct btree_trans *trans,
		       enum btree_id btree, unsigned level,
		       struct bkey_s_c old, struct bkey_s_c new,
		       unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct snapshot_t *t;
	u32 id = new.k->p.offset;
	int ret = 0;

	mutex_lock(&c->snapshot_table_lock);

	t = snapshot_t_mut(c, id);
	if (!t) {
		ret = -BCH_ERR_ENOMEM_mark_snapshot;
		goto err;
	}

	if (new.k->type == KEY_TYPE_snapshot) {
		struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);

		t->parent	= le32_to_cpu(s.v->parent);
		t->children[0]	= le32_to_cpu(s.v->children[0]);
		t->children[1]	= le32_to_cpu(s.v->children[1]);
		t->subvol	= BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
		t->tree		= le32_to_cpu(s.v->tree);

		/* smaller (presumably older-format) values lack depth/skiplist */
		if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) {
			t->depth	= le32_to_cpu(s.v->depth);
			t->skip[0]	= le32_to_cpu(s.v->skip[0]);
			t->skip[1]	= le32_to_cpu(s.v->skip[1]);
			t->skip[2]	= le32_to_cpu(s.v->skip[2]);
		} else {
			t->depth	= 0;
			t->skip[0]	= 0;
			t->skip[1]	= 0;
			t->skip[2]	= 0;
		}

		__set_is_ancestor_bitmap(c, id);

		if (BCH_SNAPSHOT_DELETED(s.v)) {
			set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
			c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_delete_dead_snapshots);
		}
	} else {
		/* key deleted or overwritten with a non-snapshot: clear entry */
		memset(t, 0, sizeof(*t));
	}
err:
	mutex_unlock(&c->snapshot_table_lock);
	return ret;
}
338 
/* Read the snapshot value for @id into @s (sees pending trans updates) */
int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
			 struct bch_snapshot *s)
{
	return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
				       BTREE_ITER_WITH_UPDATES, snapshot, s);
}
345 
346 static int bch2_snapshot_live(struct btree_trans *trans, u32 id)
347 {
348 	struct bch_snapshot v;
349 	int ret;
350 
351 	if (!id)
352 		return 0;
353 
354 	ret = bch2_snapshot_lookup(trans, id, &v);
355 	if (bch2_err_matches(ret, ENOENT))
356 		bch_err(trans->c, "snapshot node %u not found", id);
357 	if (ret)
358 		return ret;
359 
360 	return !BCH_SNAPSHOT_DELETED(&v);
361 }
362 
363 /*
364  * If @k is a snapshot with just one live child, it's part of a linear chain,
365  * which we consider to be an equivalence class: and then after snapshot
366  * deletion cleanup, there should only be a single key at a given position in
367  * this equivalence class.
368  *
369  * This sets the equivalence class of @k to be the child's equivalence class, if
370  * it's part of such a linear chain: this correctly sets equivalence classes on
371  * startup if we run leaf to root (i.e. in natural key order).
372  */
static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	unsigned i, nr_live = 0, live_idx = 0;
	struct bkey_s_c_snapshot snap;
	u32 id = k.k->p.offset, child[2];

	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	snap = bkey_s_c_to_snapshot(k);

	child[0] = le32_to_cpu(snap.v->children[0]);
	child[1] = le32_to_cpu(snap.v->children[1]);

	/* count live children, remembering which one if there's exactly one */
	for (i = 0; i < 2; i++) {
		int ret = bch2_snapshot_live(trans, child[i]);

		if (ret < 0)
			return ret;

		if (ret)
			live_idx = i;
		nr_live += ret;
	}

	mutex_lock(&c->snapshot_table_lock);

	/*
	 * One live child -> linear chain: inherit that child's equivalence
	 * class (correct when run leaf to root); otherwise we're our own:
	 */
	snapshot_t_mut(c, id)->equiv = nr_live == 1
		? snapshot_t_mut(c, child[live_idx])->equiv
		: id;

	mutex_unlock(&c->snapshot_table_lock);

	return 0;
}
409 
410 /* fsck: */
411 
/* Child @child (0 or 1) of @id from the in-memory snapshot table, or 0 */
static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
{
	return snapshot_t(c, id)->children[child];
}
416 
/* First (left) child of @id, or 0 if none */
static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id)
{
	return bch2_snapshot_child(c, id, 0);
}
421 
/* Second (right) child of @id, or 0 if none */
static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id)
{
	return bch2_snapshot_child(c, id, 1);
}
426 
/*
 * Depth-first traversal of a snapshot tree: return the node after @id, or 0
 * when the traversal is complete.  Descend to the left child when there is
 * one; otherwise climb toward the root, taking the first right child we
 * haven't already visited.
 */
static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
{
	u32 n, parent;

	n = bch2_snapshot_left_child(c, id);
	if (n)
		return n;

	while ((parent = bch2_snapshot_parent(c, id))) {
		n = bch2_snapshot_right_child(c, parent);
		/* n == id means we just came back up from that right child */
		if (n && n != id)
			return n;
		id = parent;
	}

	return 0;
}
444 
445 static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
446 {
447 	u32 id = snapshot_root;
448 	u32 subvol = 0, s;
449 
450 	while (id) {
451 		s = snapshot_t(c, id)->subvol;
452 
453 		if (s && (!subvol || s < subvol))
454 			subvol = s;
455 
456 		id = bch2_snapshot_tree_next(c, id);
457 	}
458 
459 	return subvol;
460 }
461 
/*
 * Find a master (non-snapshot) subvolume whose snapshot lives in the tree
 * rooted at @snapshot_root; if none exists, pick the tree's oldest subvolume
 * and promote it by clearing its SNAP flag.
 */
static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
					    u32 snapshot_root, u32 *subvol_id)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_s_c_subvolume s;
	bool found = false;
	int ret;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
				     0, k, ret) {
		if (k.k->type != KEY_TYPE_subvolume)
			continue;

		/* only consider subvolumes inside this snapshot tree */
		s = bkey_s_c_to_subvolume(k);
		if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
			continue;
		if (!BCH_SUBVOLUME_SNAP(s.v)) {
			*subvol_id = s.k->p.offset;
			found = true;
			break;
		}
	}

	bch2_trans_iter_exit(trans, &iter);

	if (!ret && !found) {
		struct bkey_i_subvolume *u;

		/*
		 * NOTE(review): if the tree has no subvolumes at all this
		 * returns 0, and the lookup below is presumably expected to
		 * fail with ENOENT -- confirm callers handle that case.
		 */
		*subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root);

		/* iter was exited above; it's reinitialized by get_mut here */
		u = bch2_bkey_get_mut_typed(trans, &iter,
					    BTREE_ID_subvolumes, POS(0, *subvol_id),
					    0, subvolume);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			return ret;

		SET_BCH_SUBVOLUME_SNAP(&u->v, false);
	}

	return ret;
}
506 
/*
 * fsck one snapshot_tree key: verify it points at a root snapshot that
 * points back at it (else delete it), and that its master_subvol is a
 * non-snapshot subvolume inside this tree (else repair it).
 */
static int check_snapshot_tree(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c_snapshot_tree st;
	struct bch_snapshot s;
	struct bch_subvolume subvol;
	struct printbuf buf = PRINTBUF;
	u32 root_id;
	int ret;

	if (k.k->type != KEY_TYPE_snapshot_tree)
		return 0;

	st = bkey_s_c_to_snapshot_tree(k);
	root_id = le32_to_cpu(st.v->root_snapshot);

	ret = bch2_snapshot_lookup(trans, root_id, &s);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	/* root snapshot must exist, be a root, and point back at this tree */
	if (fsck_err_on(ret ||
			root_id != bch2_snapshot_root(c, root_id) ||
			st.k->p.offset != le32_to_cpu(s.tree),
			c,
			"snapshot tree points to missing/incorrect snapshot:\n  %s",
			(bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
		ret = bch2_btree_delete_at(trans, iter, 0);
		goto err;
	}

	ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol),
				 false, 0, &subvol);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	/*
	 * master_subvol must exist, point into this tree, and not itself be a
	 * snapshot; otherwise replace it with a valid master subvolume:
	 */
	if (fsck_err_on(ret, c,
			"snapshot tree points to missing subvolume:\n  %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
	    fsck_err_on(!bch2_snapshot_is_ancestor_early(c,
						le32_to_cpu(subvol.snapshot),
						root_id), c,
			"snapshot tree points to subvolume that does not point to snapshot in this tree:\n  %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
	    fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), c,
			"snapshot tree points to snapshot subvolume:\n  %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
		struct bkey_i_snapshot_tree *u;
		u32 subvol_id;

		ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
		if (ret)
			goto err;

		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.master_subvol = cpu_to_le32(subvol_id);
		st = snapshot_tree_i_to_s_c(u);
	}
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}
578 
579 /*
580  * For each snapshot_tree, make sure it points to the root of a snapshot tree
581  * and that snapshot entry points back to it, or delete it.
582  *
583  * And, make sure it points to a subvolume within that snapshot tree, or correct
584  * it to point to the oldest subvolume within that snapshot tree.
585  */
586 int bch2_check_snapshot_trees(struct bch_fs *c)
587 {
588 	struct btree_iter iter;
589 	struct bkey_s_c k;
590 	int ret;
591 
592 	ret = bch2_trans_run(c,
593 		for_each_btree_key_commit(trans, iter,
594 			BTREE_ID_snapshot_trees, POS_MIN,
595 			BTREE_ITER_PREFETCH, k,
596 			NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
597 		check_snapshot_tree(trans, &iter, k)));
598 
599 	if (ret)
600 		bch_err(c, "error %i checking snapshot trees", ret);
601 	return ret;
602 }
603 
604 /*
605  * Look up snapshot tree for @tree_id and find root,
606  * make sure @snap_id is a descendent:
607  */
608 static int snapshot_tree_ptr_good(struct btree_trans *trans,
609 				  u32 snap_id, u32 tree_id)
610 {
611 	struct bch_snapshot_tree s_t;
612 	int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
613 
614 	if (bch2_err_matches(ret, ENOENT))
615 		return 0;
616 	if (ret)
617 		return ret;
618 
619 	return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot));
620 }
621 
/*
 * Pick a uniformly random ancestor of @id, for building skiplist entries;
 * returns @id unchanged for a root node, 0 for id 0.
 */
u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id)
{
	const struct snapshot_t *s;

	if (!id)
		return 0;

	rcu_read_lock();
	s = snapshot_t(c, id);
	/* assumes depth >= 1 whenever parent is nonzero -- TODO confirm */
	if (s->parent)
		id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth));
	rcu_read_unlock();

	return id;
}
637 
638 static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s)
639 {
640 	unsigned i;
641 
642 	for (i = 0; i < 3; i++)
643 		if (!s.parent) {
644 			if (s.skip[i])
645 				return false;
646 		} else {
647 			if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i])))
648 				return false;
649 		}
650 
651 	return true;
652 }
653 
654 /*
655  * snapshot_tree pointer was incorrect: look up root snapshot node, make sure
656  * its snapshot_tree pointer is correct (allocate new one if necessary), then
657  * update this node's pointer to root node's pointer:
658  */
static int snapshot_tree_ptr_repair(struct btree_trans *trans,
				    struct btree_iter *iter,
				    struct bkey_s_c k,
				    struct bch_snapshot *s)
{
	struct bch_fs *c = trans->c;
	struct btree_iter root_iter;
	struct bch_snapshot_tree s_t;
	struct bkey_s_c_snapshot root;
	struct bkey_i_snapshot *u;
	u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id;
	int ret;

	root = bch2_bkey_get_iter_typed(trans, &root_iter,
			       BTREE_ID_snapshots, POS(0, root_id),
			       BTREE_ITER_WITH_UPDATES, snapshot);
	ret = bkey_err(root);
	if (ret)
		goto err;

	tree_id = le32_to_cpu(root.v->tree);

	ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	/* root's own tree pointer is also bad: allocate a fresh snapshot_tree */
	if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) {
		u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
		/* ?: short-circuits: tree_create only runs if u is valid */
		ret =   PTR_ERR_OR_ZERO(u) ?:
			bch2_snapshot_tree_create(trans, root_id,
				bch2_snapshot_tree_oldest_subvol(c, root_id),
				&tree_id);
		if (ret)
			goto err;

		u->v.tree = cpu_to_le32(tree_id);
		if (k.k->p.offset == root_id)
			*s = u->v;
	}

	/* non-root node: copy the (now correct) tree pointer from the root */
	if (k.k->p.offset != root_id) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.tree = cpu_to_le32(tree_id);
		*s = u->v;
	}
err:
	bch2_trans_iter_exit(trans, &root_iter);
	return ret;
}
712 
713 static int check_snapshot(struct btree_trans *trans,
714 			  struct btree_iter *iter,
715 			  struct bkey_s_c k)
716 {
717 	struct bch_fs *c = trans->c;
718 	struct bch_snapshot s;
719 	struct bch_subvolume subvol;
720 	struct bch_snapshot v;
721 	struct bkey_i_snapshot *u;
722 	u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
723 	u32 real_depth;
724 	struct printbuf buf = PRINTBUF;
725 	bool should_have_subvol;
726 	u32 i, id;
727 	int ret = 0;
728 
729 	if (k.k->type != KEY_TYPE_snapshot)
730 		return 0;
731 
732 	memset(&s, 0, sizeof(s));
733 	memcpy(&s, k.v, bkey_val_bytes(k.k));
734 
735 	id = le32_to_cpu(s.parent);
736 	if (id) {
737 		ret = bch2_snapshot_lookup(trans, id, &v);
738 		if (bch2_err_matches(ret, ENOENT))
739 			bch_err(c, "snapshot with nonexistent parent:\n  %s",
740 				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
741 		if (ret)
742 			goto err;
743 
744 		if (le32_to_cpu(v.children[0]) != k.k->p.offset &&
745 		    le32_to_cpu(v.children[1]) != k.k->p.offset) {
746 			bch_err(c, "snapshot parent %u missing pointer to child %llu",
747 				id, k.k->p.offset);
748 			ret = -EINVAL;
749 			goto err;
750 		}
751 	}
752 
753 	for (i = 0; i < 2 && s.children[i]; i++) {
754 		id = le32_to_cpu(s.children[i]);
755 
756 		ret = bch2_snapshot_lookup(trans, id, &v);
757 		if (bch2_err_matches(ret, ENOENT))
758 			bch_err(c, "snapshot node %llu has nonexistent child %u",
759 				k.k->p.offset, id);
760 		if (ret)
761 			goto err;
762 
763 		if (le32_to_cpu(v.parent) != k.k->p.offset) {
764 			bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
765 				id, le32_to_cpu(v.parent), k.k->p.offset);
766 			ret = -EINVAL;
767 			goto err;
768 		}
769 	}
770 
771 	should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
772 		!BCH_SNAPSHOT_DELETED(&s);
773 
774 	if (should_have_subvol) {
775 		id = le32_to_cpu(s.subvol);
776 		ret = bch2_subvolume_get(trans, id, 0, false, &subvol);
777 		if (bch2_err_matches(ret, ENOENT))
778 			bch_err(c, "snapshot points to nonexistent subvolume:\n  %s",
779 				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
780 		if (ret)
781 			goto err;
782 
783 		if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) {
784 			bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
785 				k.k->p.offset);
786 			ret = -EINVAL;
787 			goto err;
788 		}
789 	} else {
790 		if (fsck_err_on(s.subvol, c, "snapshot should not point to subvol:\n  %s",
791 				(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
792 			u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
793 			ret = PTR_ERR_OR_ZERO(u);
794 			if (ret)
795 				goto err;
796 
797 			u->v.subvol = 0;
798 			s = u->v;
799 		}
800 	}
801 
802 	ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree));
803 	if (ret < 0)
804 		goto err;
805 
806 	if (fsck_err_on(!ret, c, "snapshot points to missing/incorrect tree:\n  %s",
807 			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
808 		ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
809 		if (ret)
810 			goto err;
811 	}
812 	ret = 0;
813 
814 	real_depth = bch2_snapshot_depth(c, parent_id);
815 
816 	if (le32_to_cpu(s.depth) != real_depth &&
817 	    (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists ||
818 	     fsck_err(c, "snapshot with incorrect depth field, should be %u:\n  %s",
819 		      real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) {
820 		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
821 		ret = PTR_ERR_OR_ZERO(u);
822 		if (ret)
823 			goto err;
824 
825 		u->v.depth = cpu_to_le32(real_depth);
826 		s = u->v;
827 	}
828 
829 	ret = snapshot_skiplist_good(trans, k.k->p.offset, s);
830 	if (ret < 0)
831 		goto err;
832 
833 	if (!ret &&
834 	    (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists ||
835 	     fsck_err(c, "snapshot with bad skiplist field:\n  %s",
836 		      (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) {
837 		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
838 		ret = PTR_ERR_OR_ZERO(u);
839 		if (ret)
840 			goto err;
841 
842 		for (i = 0; i < ARRAY_SIZE(u->v.skip); i++)
843 			u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id));
844 
845 		bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32);
846 		s = u->v;
847 	}
848 	ret = 0;
849 err:
850 fsck_err:
851 	printbuf_exit(&buf);
852 	return ret;
853 }
854 
/* fsck: check and repair every key in BTREE_ID_snapshots */
int bch2_check_snapshots(struct bch_fs *c)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	/*
	 * We iterate backwards as checking/fixing the depth field requires that
	 * the parent's depth already be correct:
	 */
	ret = bch2_trans_run(c,
		for_each_btree_key_reverse_commit(trans, iter,
			BTREE_ID_snapshots, POS_MAX,
			BTREE_ITER_PREFETCH, k,
			NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
		check_snapshot(trans, &iter, k)));
	if (ret)
		bch_err_fn(c, ret);
	return ret;
}
875 
876 /*
877  * Mark a snapshot as deleted, for future cleanup:
878  */
int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
{
	struct btree_iter iter;
	struct bkey_i_snapshot *s;
	int ret = 0;

	s = bch2_bkey_get_mut_typed(trans, &iter,
				    BTREE_ID_snapshots, POS(0, id),
				    0, snapshot);
	ret = PTR_ERR_OR_ZERO(s);
	if (unlikely(ret)) {
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
					trans->c, "missing snapshot %u", id);
		return ret;
	}

	/* already deleted? */
	if (BCH_SNAPSHOT_DELETED(&s->v))
		goto err;

	/* a deleted snapshot no longer points at a subvolume */
	SET_BCH_SNAPSHOT_DELETED(&s->v, true);
	SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
	s->v.subvol = 0;
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
906 
/* Keep child pointers sorted descending, as bch2_snapshot_invalid() requires */
static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
{
	if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1]))
		swap(s->children[0], s->children[1]);
}
912 
913 static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
914 {
915 	struct bch_fs *c = trans->c;
916 	struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
917 	struct btree_iter c_iter = (struct btree_iter) { NULL };
918 	struct btree_iter tree_iter = (struct btree_iter) { NULL };
919 	struct bkey_s_c_snapshot s;
920 	u32 parent_id, child_id;
921 	unsigned i;
922 	int ret = 0;
923 
924 	s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
925 				     BTREE_ITER_INTENT, snapshot);
926 	ret = bkey_err(s);
927 	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
928 				"missing snapshot %u", id);
929 
930 	if (ret)
931 		goto err;
932 
933 	BUG_ON(s.v->children[1]);
934 
935 	parent_id = le32_to_cpu(s.v->parent);
936 	child_id = le32_to_cpu(s.v->children[0]);
937 
938 	if (parent_id) {
939 		struct bkey_i_snapshot *parent;
940 
941 		parent = bch2_bkey_get_mut_typed(trans, &p_iter,
942 				     BTREE_ID_snapshots, POS(0, parent_id),
943 				     0, snapshot);
944 		ret = PTR_ERR_OR_ZERO(parent);
945 		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
946 					"missing snapshot %u", parent_id);
947 		if (unlikely(ret))
948 			goto err;
949 
950 		/* find entry in parent->children for node being deleted */
951 		for (i = 0; i < 2; i++)
952 			if (le32_to_cpu(parent->v.children[i]) == id)
953 				break;
954 
955 		if (bch2_fs_inconsistent_on(i == 2, c,
956 					"snapshot %u missing child pointer to %u",
957 					parent_id, id))
958 			goto err;
959 
960 		parent->v.children[i] = le32_to_cpu(child_id);
961 
962 		normalize_snapshot_child_pointers(&parent->v);
963 	}
964 
965 	if (child_id) {
966 		struct bkey_i_snapshot *child;
967 
968 		child = bch2_bkey_get_mut_typed(trans, &c_iter,
969 				     BTREE_ID_snapshots, POS(0, child_id),
970 				     0, snapshot);
971 		ret = PTR_ERR_OR_ZERO(child);
972 		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
973 					"missing snapshot %u", child_id);
974 		if (unlikely(ret))
975 			goto err;
976 
977 		child->v.parent = cpu_to_le32(parent_id);
978 
979 		if (!child->v.parent) {
980 			child->v.skip[0] = 0;
981 			child->v.skip[1] = 0;
982 			child->v.skip[2] = 0;
983 		}
984 	}
985 
986 	if (!parent_id) {
987 		/*
988 		 * We're deleting the root of a snapshot tree: update the
989 		 * snapshot_tree entry to point to the new root, or delete it if
990 		 * this is the last snapshot ID in this tree:
991 		 */
992 		struct bkey_i_snapshot_tree *s_t;
993 
994 		BUG_ON(s.v->children[1]);
995 
996 		s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
997 				BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)),
998 				0, snapshot_tree);
999 		ret = PTR_ERR_OR_ZERO(s_t);
1000 		if (ret)
1001 			goto err;
1002 
1003 		if (s.v->children[0]) {
1004 			s_t->v.root_snapshot = s.v->children[0];
1005 		} else {
1006 			s_t->k.type = KEY_TYPE_deleted;
1007 			set_bkey_val_u64s(&s_t->k, 0);
1008 		}
1009 	}
1010 
1011 	ret = bch2_btree_delete_at(trans, &iter, 0);
1012 err:
1013 	bch2_trans_iter_exit(trans, &tree_iter);
1014 	bch2_trans_iter_exit(trans, &p_iter);
1015 	bch2_trans_iter_exit(trans, &c_iter);
1016 	bch2_trans_iter_exit(trans, &iter);
1017 	return ret;
1018 }
1019 
/*
 * Allocate @nr_snapids new snapshot keys as children of @parent (0 when
 * creating a new root) in tree @tree; new ids are returned via
 * new_snapids[], and snapshot_subvols[] supplies each node's subvolume.
 */
static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
			  u32 *new_snapids,
			  u32 *snapshot_subvols,
			  unsigned nr_snapids)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_i_snapshot *n;
	struct bkey_s_c k;
	unsigned i, j;
	u32 depth = bch2_snapshot_depth(c, parent);
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
			     POS_MIN, BTREE_ITER_INTENT);
	k = bch2_btree_iter_peek(&iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	for (i = 0; i < nr_snapids; i++) {
		/* step back one slot per iteration to find unused positions */
		k = bch2_btree_iter_prev_slot(&iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		/* offset 0 is not a valid snapshot id: we're out of ids */
		if (!k.k || !k.k->p.offset) {
			ret = -BCH_ERR_ENOSPC_snapshot_create;
			goto err;
		}

		n = bch2_bkey_alloc(trans, &iter, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(n);
		if (ret)
			goto err;

		n->v.flags	= 0;
		n->v.parent	= cpu_to_le32(parent);
		n->v.subvol	= cpu_to_le32(snapshot_subvols[i]);
		n->v.tree	= cpu_to_le32(tree);
		n->v.depth	= cpu_to_le32(depth);

		/* skiplist: random ancestors of @parent, kept sorted */
		for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
			n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));

		bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
		SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);

		/* update the in-memory snapshot table right away */
		ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
					 bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
		if (ret)
			goto err;

		new_snapids[i]	= iter.pos.offset;

		/* a brand-new leaf is its own equivalence class */
		mutex_lock(&c->snapshot_table_lock);
		snapshot_t_mut(c, new_snapids[i])->equiv = new_snapids[i];
		mutex_unlock(&c->snapshot_table_lock);
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
1083 
1084 /*
1085  * Create new snapshot IDs as children of an existing snapshot ID:
1086  */
static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent,
			      u32 *new_snapids,
			      u32 *snapshot_subvols,
			      unsigned nr_snapids)
{
	struct btree_iter iter;
	struct bkey_i_snapshot *n_parent;
	int ret = 0;

	n_parent = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_snapshots, POS(0, parent),
			0, snapshot);
	ret = PTR_ERR_OR_ZERO(n_parent);
	if (unlikely(ret)) {
		if (bch2_err_matches(ret, ENOENT))
			bch_err(trans->c, "snapshot %u not found", parent);
		return ret;
	}

	/* only leaf nodes may be given children */
	if (n_parent->v.children[0] || n_parent->v.children[1]) {
		bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
		ret = -EINVAL;
		goto err;
	}

	ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree),
			     new_snapids, snapshot_subvols, nr_snapids);
	if (ret)
		goto err;

	/*
	 * The parent becomes an interior node and loses its subvolume.
	 * NOTE(review): new_snapids[1] is read unconditionally, which assumes
	 * nr_snapids == 2 here -- confirm against callers.
	 */
	n_parent->v.children[0] = cpu_to_le32(new_snapids[0]);
	n_parent->v.children[1] = cpu_to_le32(new_snapids[1]);
	n_parent->v.subvol = 0;
	SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
1125 
1126 /*
1127  * Create a snapshot node that is the root of a new tree:
1128  */
static int bch2_snapshot_node_create_tree(struct btree_trans *trans,
			      u32 *new_snapids,
			      u32 *snapshot_subvols,
			      unsigned nr_snapids)
{
	struct bkey_i_snapshot_tree *n_tree;
	int ret;

	/*
	 * Allocate a new snapshot_tree key; its offset is the new tree's ID.
	 * Note: create_snapids() is only evaluated (and n_tree only
	 * dereferenced) if __bch2_snapshot_tree_create() succeeded.
	 */
	n_tree = __bch2_snapshot_tree_create(trans);
	ret =   PTR_ERR_OR_ZERO(n_tree) ?:
		create_snapids(trans, 0, n_tree->k.p.offset,
			     new_snapids, snapshot_subvols, nr_snapids);
	if (ret)
		return ret;

	/* Point the tree at its root snapshot and master subvolume: */
	n_tree->v.master_subvol	= cpu_to_le32(snapshot_subvols[0]);
	n_tree->v.root_snapshot	= cpu_to_le32(new_snapids[0]);
	return 0;
}
1148 
1149 int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
1150 			      u32 *new_snapids,
1151 			      u32 *snapshot_subvols,
1152 			      unsigned nr_snapids)
1153 {
1154 	BUG_ON((parent == 0) != (nr_snapids == 1));
1155 	BUG_ON((parent != 0) != (nr_snapids == 2));
1156 
1157 	return parent
1158 		? bch2_snapshot_node_create_children(trans, parent,
1159 				new_snapids, snapshot_subvols, nr_snapids)
1160 		: bch2_snapshot_node_create_tree(trans,
1161 				new_snapids, snapshot_subvols, nr_snapids);
1162 
1163 }
1164 
1165 /*
1166  * If we have an unlinked inode in an internal snapshot node, and the inode
1167  * really has been deleted in all child snapshots, how does this get cleaned up?
1168  *
1169  * first there is the problem of how keys that have been overwritten in all
1170  * child snapshots get deleted (unimplemented?), but inodes may perhaps be
1171  * special?
1172  *
1173  * also: unlinked inode in internal snapshot appears to not be getting deleted
1174  * correctly if inode doesn't exist in leaf snapshots
1175  *
1176  * solution:
1177  *
1178  * for a key in an interior snapshot node that needs work to be done that
1179  * requires it to be mutated: iterate over all descendent leaf nodes and copy
1180  * that key to snapshot leaf nodes, where we can mutate it
1181  */
1182 
/*
 * Delete @k if its snapshot is on the deletion list, or if we've already kept
 * a key at this position from the same snapshot equivalence class (in which
 * case this one has been overwritten and is redundant).
 */
static int snapshot_delete_key(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct bkey_s_c k,
			       snapshot_id_list *deleted,
			       snapshot_id_list *equiv_seen,
			       struct bpos *last_pos)
{
	struct bch_fs *c = trans->c;
	u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);

	/* Moved to a new position: the set of classes seen starts over */
	if (!bkey_eq(k.k->p, *last_pos))
		equiv_seen->nr = 0;
	*last_pos = k.k->p;

	if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
	    snapshot_list_has_id(equiv_seen, equiv)) {
		return bch2_btree_delete_at(trans, iter,
					    BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
	} else {
		/* First key seen for this class at this position - keep it */
		return snapshot_list_add(c, equiv_seen, equiv);
	}
}
1205 
/*
 * Rewrite @k so that it lives at its snapshot equivalence class ID, deleting
 * the copy at the old (collapsed) snapshot ID.
 */
static int move_key_to_correct_snapshot(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);

	/*
	 * When we have a linear chain of snapshot nodes, we consider
	 * those to form an equivalence class: we're going to collapse
	 * them all down to a single node, and keep the leaf-most node -
	 * which has the same id as the equivalence class id.
	 *
	 * If there are multiple keys in different snapshots at the same
	 * position, we're only going to keep the one in the newest
	 * snapshot - the rest have been overwritten and are redundant,
	 * and for the key we're going to keep we need to move it to the
	 * equivalence class ID if it's not there already.
	 */
	if (equiv != k.k->p.snapshot) {
		struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
		struct btree_iter new_iter;
		int ret;

		ret = PTR_ERR_OR_ZERO(new);
		if (ret)
			return ret;

		/* Reinsert at the equivalence class's snapshot ID... */
		new->k.p.snapshot = equiv;

		bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p,
				     BTREE_ITER_ALL_SNAPSHOTS|
				     BTREE_ITER_CACHED|
				     BTREE_ITER_INTENT);

		/* ...and delete the copy at the old snapshot ID: */
		ret =   bch2_btree_iter_traverse(&new_iter) ?:
			bch2_trans_update(trans, &new_iter, new,
					BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
			bch2_btree_delete_at(trans, iter,
					BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
		bch2_trans_iter_exit(trans, &new_iter);
		if (ret)
			return ret;
	}

	return 0;
}
1253 
1254 /*
1255  * For a given snapshot, if it doesn't have a subvolume that points to it, and
1256  * it doesn't have child snapshot nodes - it's now redundant and we can mark it
1257  * as deleted.
1258  */
static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter,
					  struct bkey_s_c k)
{
	struct bkey_s_c_snapshot snap;
	u32 children[2];
	int ret;

	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	/* Already deleted, or still referenced by a subvolume: nothing to do */
	snap = bkey_s_c_to_snapshot(k);
	if (BCH_SNAPSHOT_DELETED(snap.v) ||
	    BCH_SNAPSHOT_SUBVOL(snap.v))
		return 0;

	children[0] = le32_to_cpu(snap.v->children[0]);
	children[1] = le32_to_cpu(snap.v->children[1]);

	/*
	 * "is either child live?" - NOTE(review): this relies on
	 * bch2_snapshot_live() returning a positive value for a live
	 * snapshot, 0 for dead, negative on error - confirm at definition.
	 */
	ret   = bch2_snapshot_live(trans, children[0]) ?:
		bch2_snapshot_live(trans, children[1]);
	if (ret < 0)
		return ret;

	/* No live children, no subvolume: mark the node deleted */
	if (!ret)
		return bch2_snapshot_node_set_deleted(trans, k.k->p.offset);
	return 0;
}
1286 
1287 static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
1288 						snapshot_id_list *skip)
1289 {
1290 	rcu_read_lock();
1291 	while (snapshot_list_has_id(skip, id))
1292 		id = __bch2_snapshot_parent(c, id);
1293 
1294 	while (n--) {
1295 		do {
1296 			id = __bch2_snapshot_parent(c, id);
1297 		} while (snapshot_list_has_id(skip, id));
1298 	}
1299 	rcu_read_unlock();
1300 
1301 	return id;
1302 }
1303 
/*
 * For a surviving snapshot node: subtract the number of its ancestors being
 * deleted from its depth, and repoint skiplist entries that referenced
 * now-deleted nodes at surviving ancestors.
 */
static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
					      struct btree_iter *iter, struct bkey_s_c k,
					      snapshot_id_list *deleted)
{
	struct bch_fs *c = trans->c;
	u32 nr_deleted_ancestors = 0;
	struct bkey_i_snapshot *s;
	u32 *i;
	int ret;

	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	/* Nodes that are themselves being deleted don't need fixing up: */
	if (snapshot_list_has_id(deleted, k.k->p.offset))
		return 0;

	s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot);
	ret = PTR_ERR_OR_ZERO(s);
	if (ret)
		return ret;

	darray_for_each(*deleted, i)
		nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, *i);

	if (!nr_deleted_ancestors)
		return 0;

	le32_add_cpu(&s->v.depth, -nr_deleted_ancestors);

	if (!s->v.depth) {
		/* This node is now a tree root; it has no ancestors to skip to */
		s->v.skip[0] = 0;
		s->v.skip[1] = 0;
		s->v.skip[2] = 0;
	} else {
		u32 depth = le32_to_cpu(s->v.depth);
		u32 parent = bch2_snapshot_parent(c, s->k.p.offset);

		for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) {
			u32 id = le32_to_cpu(s->v.skip[j]);

			/*
			 * Replace a skiplist entry that pointed at a deleted
			 * node with a randomly chosen surviving ancestor:
			 */
			if (snapshot_list_has_id(deleted, id)) {
				id = depth > 1
					? bch2_snapshot_nth_parent_skip(c,
							parent,
							get_random_u32_below(depth - 1),
							deleted)
					: parent;
				s->v.skip[j] = cpu_to_le32(id);
			}
		}

		bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32);
	}

	return bch2_trans_update(trans, iter, &s->k_i, 0);
}
1360 
1361 int bch2_delete_dead_snapshots(struct bch_fs *c)
1362 {
1363 	struct btree_trans *trans;
1364 	struct btree_iter iter;
1365 	struct bkey_s_c k;
1366 	struct bkey_s_c_snapshot snap;
1367 	snapshot_id_list deleted = { 0 };
1368 	snapshot_id_list deleted_interior = { 0 };
1369 	u32 *i, id;
1370 	int ret = 0;
1371 
1372 	if (!test_bit(BCH_FS_STARTED, &c->flags)) {
1373 		ret = bch2_fs_read_write_early(c);
1374 		if (ret) {
1375 			bch_err_msg(c, ret, "deleting dead snapshots: error going rw");
1376 			return ret;
1377 		}
1378 	}
1379 
1380 	trans = bch2_trans_get(c);
1381 
1382 	/*
1383 	 * For every snapshot node: If we have no live children and it's not
1384 	 * pointed to by a subvolume, delete it:
1385 	 */
1386 	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots,
1387 			POS_MIN, 0, k,
1388 			NULL, NULL, 0,
1389 		bch2_delete_redundant_snapshot(trans, &iter, k));
1390 	if (ret) {
1391 		bch_err_msg(c, ret, "deleting redundant snapshots");
1392 		goto err;
1393 	}
1394 
1395 	ret = for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
1396 				  POS_MIN, 0, k,
1397 		bch2_snapshot_set_equiv(trans, k));
1398 	if (ret) {
1399 		bch_err_msg(c, ret, "in bch2_snapshots_set_equiv");
1400 		goto err;
1401 	}
1402 
1403 	for_each_btree_key(trans, iter, BTREE_ID_snapshots,
1404 			   POS_MIN, 0, k, ret) {
1405 		if (k.k->type != KEY_TYPE_snapshot)
1406 			continue;
1407 
1408 		snap = bkey_s_c_to_snapshot(k);
1409 		if (BCH_SNAPSHOT_DELETED(snap.v)) {
1410 			ret = snapshot_list_add(c, &deleted, k.k->p.offset);
1411 			if (ret)
1412 				break;
1413 		}
1414 	}
1415 	bch2_trans_iter_exit(trans, &iter);
1416 
1417 	if (ret) {
1418 		bch_err_msg(c, ret, "walking snapshots");
1419 		goto err;
1420 	}
1421 
1422 	for (id = 0; id < BTREE_ID_NR; id++) {
1423 		struct bpos last_pos = POS_MIN;
1424 		snapshot_id_list equiv_seen = { 0 };
1425 		struct disk_reservation res = { 0 };
1426 
1427 		if (!btree_type_has_snapshots(id))
1428 			continue;
1429 
1430 		ret = for_each_btree_key_commit(trans, iter,
1431 				id, POS_MIN,
1432 				BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
1433 				&res, NULL, BTREE_INSERT_NOFAIL,
1434 			snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?:
1435 		      for_each_btree_key_commit(trans, iter,
1436 				id, POS_MIN,
1437 				BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
1438 				&res, NULL, BTREE_INSERT_NOFAIL,
1439 			move_key_to_correct_snapshot(trans, &iter, k));
1440 
1441 		bch2_disk_reservation_put(c, &res);
1442 		darray_exit(&equiv_seen);
1443 
1444 		if (ret) {
1445 			bch_err_msg(c, ret, "deleting keys from dying snapshots");
1446 			goto err;
1447 		}
1448 	}
1449 
1450 	down_write(&c->snapshot_create_lock);
1451 
1452 	for_each_btree_key(trans, iter, BTREE_ID_snapshots,
1453 			   POS_MIN, 0, k, ret) {
1454 		u32 snapshot = k.k->p.offset;
1455 		u32 equiv = bch2_snapshot_equiv(c, snapshot);
1456 
1457 		if (equiv != snapshot)
1458 			snapshot_list_add(c, &deleted_interior, snapshot);
1459 	}
1460 	bch2_trans_iter_exit(trans, &iter);
1461 
1462 	if (ret)
1463 		goto err_create_lock;
1464 
1465 	/*
1466 	 * Fixing children of deleted snapshots can't be done completely
1467 	 * atomically, if we crash between here and when we delete the interior
1468 	 * nodes some depth fields will be off:
1469 	 */
1470 	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
1471 				  BTREE_ITER_INTENT, k,
1472 				  NULL, NULL, BTREE_INSERT_NOFAIL,
1473 		bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior));
1474 	if (ret)
1475 		goto err_create_lock;
1476 
1477 	darray_for_each(deleted, i) {
1478 		ret = commit_do(trans, NULL, NULL, 0,
1479 			bch2_snapshot_node_delete(trans, *i));
1480 		if (ret) {
1481 			bch_err_msg(c, ret, "deleting snapshot %u", *i);
1482 			goto err_create_lock;
1483 		}
1484 	}
1485 
1486 	darray_for_each(deleted_interior, i) {
1487 		ret = commit_do(trans, NULL, NULL, 0,
1488 			bch2_snapshot_node_delete(trans, *i));
1489 		if (ret) {
1490 			bch_err_msg(c, ret, "deleting snapshot %u", *i);
1491 			goto err_create_lock;
1492 		}
1493 	}
1494 
1495 	clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
1496 err_create_lock:
1497 	up_write(&c->snapshot_create_lock);
1498 err:
1499 	darray_exit(&deleted_interior);
1500 	darray_exit(&deleted);
1501 	bch2_trans_put(trans);
1502 	if (ret)
1503 		bch_err_fn(c, ret);
1504 	return ret;
1505 }
1506 
1507 void bch2_delete_dead_snapshots_work(struct work_struct *work)
1508 {
1509 	struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
1510 
1511 	if (test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags))
1512 		bch2_delete_dead_snapshots(c);
1513 	bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
1514 }
1515 
1516 void bch2_delete_dead_snapshots_async(struct bch_fs *c)
1517 {
1518 	if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) &&
1519 	    !queue_work(c->write_ref_wq, &c->snapshot_delete_work))
1520 		bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
1521 }
1522 
1523 int bch2_delete_dead_snapshots_hook(struct btree_trans *trans,
1524 				    struct btree_trans_commit_hook *h)
1525 {
1526 	struct bch_fs *c = trans->c;
1527 
1528 	set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
1529 
1530 	if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_delete_dead_snapshots)
1531 		return 0;
1532 
1533 	bch2_delete_dead_snapshots_async(c);
1534 	return 0;
1535 }
1536 
1537 int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
1538 				       enum btree_id id,
1539 				       struct bpos pos)
1540 {
1541 	struct bch_fs *c = trans->c;
1542 	struct btree_iter iter;
1543 	struct bkey_s_c k;
1544 	int ret;
1545 
1546 	bch2_trans_iter_init(trans, &iter, id, pos,
1547 			     BTREE_ITER_NOT_EXTENTS|
1548 			     BTREE_ITER_ALL_SNAPSHOTS);
1549 	while (1) {
1550 		k = bch2_btree_iter_prev(&iter);
1551 		ret = bkey_err(k);
1552 		if (ret)
1553 			break;
1554 
1555 		if (!k.k)
1556 			break;
1557 
1558 		if (!bkey_eq(pos, k.k->p))
1559 			break;
1560 
1561 		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
1562 			ret = 1;
1563 			break;
1564 		}
1565 	}
1566 	bch2_trans_iter_exit(trans, &iter);
1567 
1568 	return ret;
1569 }
1570 
1571 static u32 bch2_snapshot_smallest_child(struct bch_fs *c, u32 id)
1572 {
1573 	const struct snapshot_t *s = snapshot_t(c, id);
1574 
1575 	return s->children[1] ?: s->children[0];
1576 }
1577 
1578 static u32 bch2_snapshot_smallest_descendent(struct bch_fs *c, u32 id)
1579 {
1580 	u32 child;
1581 
1582 	while ((child = bch2_snapshot_smallest_child(c, id)))
1583 		id = child;
1584 	return id;
1585 }
1586 
/*
 * Copy @interior_k to the same position in leaf snapshot @leaf_id, unless
 * that snapshot already has its own version of the key.
 */
static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans,
					       enum btree_id btree,
					       struct bkey_s_c interior_k,
					       u32 leaf_id, struct bpos *new_min_pos)
{
	struct btree_iter iter;
	struct bpos pos = interior_k.k->p;
	struct bkey_s_c k;
	struct bkey_i *new;
	int ret;

	pos.snapshot = leaf_id;

	bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT);
	k = bch2_btree_iter_peek_slot(&iter);
	ret = bkey_err(k);
	if (ret)
		goto out;

	/* key already overwritten in this snapshot? */
	if (k.k->p.snapshot != interior_k.k->p.snapshot)
		goto out;

	/* Record the first position we copy to, for the caller: */
	if (bpos_eq(*new_min_pos, POS_MIN)) {
		*new_min_pos = k.k->p;
		new_min_pos->snapshot = leaf_id;
	}

	new = bch2_bkey_make_mut_noupdate(trans, interior_k);
	ret = PTR_ERR_OR_ZERO(new);
	if (ret)
		goto out;

	new->k.p.snapshot = leaf_id;
	ret = bch2_trans_update(trans, &iter, new, 0);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
1626 
/*
 * Copy a key in an interior snapshot node to every descendant leaf snapshot
 * where it hasn't been overwritten, committing as we go, so the key can be
 * mutated at the leaves. *new_min_pos is set to the first position copied to
 * (or POS_MIN if none).
 */
int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans,
					  enum btree_id btree,
					  struct bkey_s_c k,
					  struct bpos *new_min_pos)
{
	struct bch_fs *c = trans->c;
	struct bkey_buf sk;
	u32 restart_count = trans->restart_count;
	int ret = 0;

	/* Take a stable copy: @k won't survive transaction commits/restarts */
	bch2_bkey_buf_init(&sk);
	bch2_bkey_buf_reassemble(&sk, c, k);
	k = bkey_i_to_s_c(sk.k);

	*new_min_pos = POS_MIN;

	/*
	 * Descendants have smaller IDs than their ancestors (IDs are
	 * allocated downwards), so scan the range below our snapshot:
	 */
	for (u32 id = bch2_snapshot_smallest_descendent(c, k.k->p.snapshot);
	     id < k.k->p.snapshot;
	     id++) {
		if (!bch2_snapshot_is_ancestor(c, id, k.k->p.snapshot) ||
		    !bch2_snapshot_is_leaf(c, id))
			continue;
again:
		ret =   btree_trans_too_many_iters(trans) ?:
			bch2_propagate_key_to_snapshot_leaf(trans, btree, k, id, new_min_pos) ?:
			bch2_trans_commit(trans, NULL, NULL, 0);
		if (ret && bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
			bch2_trans_begin(trans);
			goto again;
		}

		if (ret)
			break;
	}

	bch2_bkey_buf_exit(&sk, c);

	return ret ?: trans_was_restarted(trans, restart_count);
}
1666 
/*
 * Populate the in-memory snapshot table from the snapshots btree at startup.
 */
int bch2_snapshots_read(struct bch_fs *c)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	/*
	 * Two passes: first add every node to the table and set its
	 * equivalence class, then fill in the is_ancestor bitmaps - which
	 * need all nodes present first.
	 */
	ret = bch2_trans_run(c,
		for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
			   POS_MIN, 0, k,
			bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
			bch2_snapshot_set_equiv(trans, k)) ?:
		for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
			   POS_MIN, 0, k,
			   (set_is_ancestor_bitmap(c, k.k->p.offset), 0)));
	if (ret)
		bch_err_fn(c, ret);
	return ret;
}
1685 
/* Free the in-memory snapshot table at filesystem shutdown. */
void bch2_fs_snapshots_exit(struct bch_fs *c)
{
	/* No concurrent readers can exist at shutdown, so a plain free is safe */
	kfree(rcu_dereference_protected(c->snapshots, true));
}
1690