/* xref: /linux/fs/bcachefs/snapshot.c (revision 4a4b30ea80d8cb5e8c4c62bb86201f4ea0d9b030) */
// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_key_cache.h"
#include "btree_update.h"
#include "buckets.h"
#include "errcode.h"
#include "error.h"
#include "fs.h"
#include "recovery_passes.h"
#include "snapshot.h"

#include <linux/random.h>

/*
 * Snapshot trees:
 *
 * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they
 * exist to provide a stable identifier for the whole lifetime of a snapshot
 * tree.
 */

void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
				struct bkey_s_c k)
{
	struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k);

	prt_printf(out, "subvol %u root snapshot %u",
		   le32_to_cpu(t.v->master_subvol),
		   le32_to_cpu(t.v->root_snapshot));
}

int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k,
				struct bkey_validate_context from)
{
	int ret = 0;

	bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
			 bkey_lt(k.k->p, POS(0, 1)),
			 c, snapshot_tree_pos_bad,
			 "bad pos");
fsck_err:
	return ret;
}

int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
			      struct bch_snapshot_tree *s)
{
	int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
					  BTREE_ITER_with_updates, snapshot_tree, s);

	if (bch2_err_matches(ret, ENOENT))
		ret = -BCH_ERR_ENOENT_snapshot_tree;
	return ret;
}

struct bkey_i_snapshot_tree *
__bch2_snapshot_tree_create(struct btree_trans *trans)
{
	struct btree_iter iter;
	int ret = bch2_bkey_get_empty_slot(trans, &iter,
			BTREE_ID_snapshot_trees, POS(0, U32_MAX));
	struct bkey_i_snapshot_tree *s_t;

	if (ret == -BCH_ERR_ENOSPC_btree_slot)
		ret = -BCH_ERR_ENOSPC_snapshot_tree;
	if (ret)
		return ERR_PTR(ret);

	s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree);
	ret = PTR_ERR_OR_ZERO(s_t);
	bch2_trans_iter_exit(trans, &iter);
	return ret ? ERR_PTR(ret) : s_t;
}

static int bch2_snapshot_tree_create(struct btree_trans *trans,
				u32 root_id, u32 subvol_id, u32 *tree_id)
{
	struct bkey_i_snapshot_tree *n_tree =
		__bch2_snapshot_tree_create(trans);

	if (IS_ERR(n_tree))
		return PTR_ERR(n_tree);

	n_tree->v.master_subvol	= cpu_to_le32(subvol_id);
	n_tree->v.root_snapshot	= cpu_to_le32(root_id);
	*tree_id = n_tree->k.p.offset;
	return 0;
}

/* Snapshot nodes: */

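/*
 * Snapshot IDs are allocated growing downwards from U32_MAX, so a node's
 * parent always has a higher ID and its children lower IDs (this is the
 * invariant bch2_snapshot_validate() checks, and create_snapids() below
 * maintains). E.g. a root might sit at U32_MAX with children at U32_MAX - 1
 * and U32_MAX - 2. The ancestor helpers below rely on this: walking towards
 * an ancestor only ever increases the ID.
 */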
static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor)
{
	while (id && id < ancestor) {
		const struct snapshot_t *s = __snapshot_t(t, id);
		id = s ? s->parent : 0;
	}
	return id == ancestor;
}

static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
{
	rcu_read_lock();
	bool ret = __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
	rcu_read_unlock();

	return ret;
}

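/*
 * Take one step from @id towards @ancestor, using the largest skip-list
 * pointer that doesn't overshoot. skip[] is kept sorted ascending (see
 * bch2_snapshot_validate()), so skip[2] is the biggest jump and is tried
 * first, falling back to the plain parent pointer:
 */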
static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
{
	const struct snapshot_t *s = __snapshot_t(t, id);
	if (!s)
		return 0;

	if (s->skip[2] <= ancestor)
		return s->skip[2];
	if (s->skip[1] <= ancestor)
		return s->skip[1];
	if (s->skip[0] <= ancestor)
		return s->skip[0];
	return s->parent;
}

static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor)
{
	const struct snapshot_t *s = __snapshot_t(t, id);
	if (!s)
		return false;

	return test_bit(ancestor - id - 1, s->is_ancestor);
}

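/*
 * Is @ancestor an ancestor of @id?
 *
 * Two phase lookup: first a skip-list walk upwards until @id is within
 * IS_ANCESTOR_BITMAP of @ancestor, then a single test against the
 * is_ancestor bitmap, which has one bit per ID in the window of
 * IS_ANCESTOR_BITMAP IDs above each node. E.g. if IS_ANCESTOR_BITMAP is
 * 128, any ancestor within 128 IDs is answered by one test_bit().
 */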
bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
{
	bool ret;

	rcu_read_lock();
	struct snapshot_table *t = rcu_dereference(c->snapshots);

	if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) {
		ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor);
		goto out;
	}

	if (likely(ancestor >= IS_ANCESTOR_BITMAP))
		while (id && id < ancestor - IS_ANCESTOR_BITMAP)
			id = get_ancestor_below(t, id, ancestor);

	ret = id && id < ancestor
		? test_ancestor_bitmap(t, id, ancestor)
		: id == ancestor;

	EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
out:
	rcu_read_unlock();

	return ret;
}

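/*
 * Grow the RCU-protected in-memory snapshot table so it has a slot for @id.
 * Since IDs count down from U32_MAX, the table is indexed by U32_MAX - id;
 * e.g. id U32_MAX - 10 lives at index 10. Readers see either the old or the
 * new table, never a partially copied one.
 */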
static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
{
	size_t idx = U32_MAX - id;
	struct snapshot_table *new, *old;

	size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1));
	size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]);

	if (unlikely(new_bytes > INT_MAX))
		return NULL;

	new = kvzalloc(new_bytes, GFP_KERNEL);
	if (!new)
		return NULL;

	new->nr = new_size;

	old = rcu_dereference_protected(c->snapshots, true);
	if (old)
		memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr);

	rcu_assign_pointer(c->snapshots, new);
	kvfree_rcu(old, rcu);

	return &rcu_dereference_protected(c->snapshots,
				lockdep_is_held(&c->snapshot_table_lock))->s[idx];
}

static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
{
	size_t idx = U32_MAX - id;
	struct snapshot_table *table =
		rcu_dereference_protected(c->snapshots,
				lockdep_is_held(&c->snapshot_table_lock));

	lockdep_assert_held(&c->snapshot_table_lock);

	if (likely(table && idx < table->nr))
		return &table->s[idx];

	return __snapshot_t_mut(c, id);
}

void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
			   struct bkey_s_c k)
{
	struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);

	prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u",
	       BCH_SNAPSHOT_SUBVOL(s.v),
	       BCH_SNAPSHOT_DELETED(s.v),
	       le32_to_cpu(s.v->parent),
	       le32_to_cpu(s.v->children[0]),
	       le32_to_cpu(s.v->children[1]),
	       le32_to_cpu(s.v->subvol),
	       le32_to_cpu(s.v->tree));

	if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth))
		prt_printf(out, " depth %u skiplist %u %u %u",
			   le32_to_cpu(s.v->depth),
			   le32_to_cpu(s.v->skip[0]),
			   le32_to_cpu(s.v->skip[1]),
			   le32_to_cpu(s.v->skip[2]));
}

int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k,
			   struct bkey_validate_context from)
{
	struct bkey_s_c_snapshot s;
	u32 i, id;
	int ret = 0;

	bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
			 bkey_lt(k.k->p, POS(0, 1)),
			 c, snapshot_pos_bad,
			 "bad pos");

	s = bkey_s_c_to_snapshot(k);

	id = le32_to_cpu(s.v->parent);
	bkey_fsck_err_on(id && id <= k.k->p.offset,
			 c, snapshot_parent_bad,
			 "bad parent node (%u <= %llu)",
			 id, k.k->p.offset);

	bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]),
			 c, snapshot_children_not_normalized,
			 "children not normalized");

	bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1],
			 c, snapshot_child_duplicate,
			 "duplicate child nodes");

	for (i = 0; i < 2; i++) {
		id = le32_to_cpu(s.v->children[i]);

		bkey_fsck_err_on(id >= k.k->p.offset,
				 c, snapshot_child_bad,
				 "bad child node (%u >= %llu)",
				 id, k.k->p.offset);
	}

	if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
		bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
				 le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]),
				 c, snapshot_skiplist_not_normalized,
				 "skiplist not normalized");

		for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
			id = le32_to_cpu(s.v->skip[i]);

			bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent),
					 c, snapshot_skiplist_bad,
					 "bad skiplist node %u", id);
		}
	}
fsck_err:
	return ret;
}

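/*
 * Btree trigger for snapshot keys: mirrors the key into the in-memory
 * snapshot table, fills in the is_ancestor bitmap from the parent chain,
 * and flags the filesystem for dead snapshot deletion when the node is
 * marked deleted:
 */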
static int __bch2_mark_snapshot(struct btree_trans *trans,
		       enum btree_id btree, unsigned level,
		       struct bkey_s_c old, struct bkey_s_c new,
		       enum btree_iter_update_trigger_flags flags)
{
	struct bch_fs *c = trans->c;
	struct snapshot_t *t;
	u32 id = new.k->p.offset;
	int ret = 0;

	mutex_lock(&c->snapshot_table_lock);

	t = snapshot_t_mut(c, id);
	if (!t) {
		ret = -BCH_ERR_ENOMEM_mark_snapshot;
		goto err;
	}

	if (new.k->type == KEY_TYPE_snapshot) {
		struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);

		t->live		= true;
		t->parent	= le32_to_cpu(s.v->parent);
		t->children[0]	= le32_to_cpu(s.v->children[0]);
		t->children[1]	= le32_to_cpu(s.v->children[1]);
		t->subvol	= BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
		t->tree		= le32_to_cpu(s.v->tree);

		if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) {
			t->depth	= le32_to_cpu(s.v->depth);
			t->skip[0]	= le32_to_cpu(s.v->skip[0]);
			t->skip[1]	= le32_to_cpu(s.v->skip[1]);
			t->skip[2]	= le32_to_cpu(s.v->skip[2]);
		} else {
			t->depth	= 0;
			t->skip[0]	= 0;
			t->skip[1]	= 0;
			t->skip[2]	= 0;
		}

		u32 parent = id;

		while ((parent = bch2_snapshot_parent_early(c, parent)) &&
		       parent - id - 1 < IS_ANCESTOR_BITMAP)
			__set_bit(parent - id - 1, t->is_ancestor);

		if (BCH_SNAPSHOT_DELETED(s.v)) {
			set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
			if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots)
				bch2_delete_dead_snapshots_async(c);
		}
	} else {
		memset(t, 0, sizeof(*t));
	}
err:
	mutex_unlock(&c->snapshot_table_lock);
	return ret;
}

int bch2_mark_snapshot(struct btree_trans *trans,
		       enum btree_id btree, unsigned level,
		       struct bkey_s_c old, struct bkey_s new,
		       enum btree_iter_update_trigger_flags flags)
{
	return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags);
}

int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
			 struct bch_snapshot *s)
{
	return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
				       BTREE_ITER_with_updates, snapshot, s);
}

/* fsck: */

static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
{
	return snapshot_t(c, id)->children[child];
}

static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id)
{
	return bch2_snapshot_child(c, id, 0);
}

static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id)
{
	return bch2_snapshot_child(c, id, 1);
}

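/*
 * Depth-first traversal within one snapshot tree: descend to the left child
 * if there is one, otherwise climb until we can step to a right sibling;
 * returns 0 once the whole tree has been visited:
 */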
static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
{
	u32 n, parent;

	n = bch2_snapshot_left_child(c, id);
	if (n)
		return n;

	while ((parent = bch2_snapshot_parent(c, id))) {
		n = bch2_snapshot_right_child(c, parent);
		if (n && n != id)
			return n;
		id = parent;
	}

	return 0;
}

u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
{
	u32 id = snapshot_root;
	u32 subvol = 0, s;

	rcu_read_lock();
	while (id) {
		s = snapshot_t(c, id)->subvol;

		if (s && (!subvol || s < subvol))
			subvol = s;

		id = bch2_snapshot_tree_next(c, id);
	}
	rcu_read_unlock();

	return subvol;
}

static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
					    u32 snapshot_root, u32 *subvol_id)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	bool found = false;
	int ret;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
				     0, k, ret) {
		if (k.k->type != KEY_TYPE_subvolume)
			continue;

		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
		if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
			continue;
		if (!BCH_SUBVOLUME_SNAP(s.v)) {
			*subvol_id = s.k->p.offset;
			found = true;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	if (!ret && !found) {
		struct bkey_i_subvolume *u;

		*subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root);

		u = bch2_bkey_get_mut_typed(trans, &iter,
					    BTREE_ID_subvolumes, POS(0, *subvol_id),
					    0, subvolume);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			return ret;

		SET_BCH_SUBVOLUME_SNAP(&u->v, false);
	}

	return ret;
}

static int check_snapshot_tree(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c_snapshot_tree st;
	struct bch_snapshot s;
	struct bch_subvolume subvol;
	struct printbuf buf = PRINTBUF;
	struct btree_iter snapshot_iter = {};
	u32 root_id;
	int ret;

	if (k.k->type != KEY_TYPE_snapshot_tree)
		return 0;

	st = bkey_s_c_to_snapshot_tree(k);
	root_id = le32_to_cpu(st.v->root_snapshot);

	struct bkey_s_c_snapshot snapshot_k =
		bch2_bkey_get_iter_typed(trans, &snapshot_iter, BTREE_ID_snapshots,
					 POS(0, root_id), 0, snapshot);
	ret = bkey_err(snapshot_k);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	if (!ret)
		bkey_val_copy(&s, snapshot_k);

	if (fsck_err_on(ret ||
			root_id != bch2_snapshot_root(c, root_id) ||
			st.k->p.offset != le32_to_cpu(s.tree),
			trans, snapshot_tree_to_missing_snapshot,
			"snapshot tree points to missing/incorrect snapshot:\n  %s",
			(bch2_bkey_val_to_text(&buf, c, st.s_c),
			 prt_newline(&buf),
			 ret
			 ? prt_printf(&buf, "(%s)", bch2_err_str(ret))
			 : bch2_bkey_val_to_text(&buf, c, snapshot_k.s_c),
			 buf.buf))) {
		ret = bch2_btree_delete_at(trans, iter, 0);
		goto err;
	}

	if (!st.v->master_subvol)
		goto out;

	ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), false, &subvol);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	if (fsck_err_on(ret,
			trans, snapshot_tree_to_missing_subvol,
			"snapshot tree points to missing subvolume:\n  %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
	    fsck_err_on(!bch2_snapshot_is_ancestor(c,
						le32_to_cpu(subvol.snapshot),
						root_id),
			trans, snapshot_tree_to_wrong_subvol,
			"snapshot tree points to subvolume that does not point to snapshot in this tree:\n  %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
	    fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol),
			trans, snapshot_tree_to_snapshot_subvol,
			"snapshot tree points to snapshot subvolume:\n  %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
		struct bkey_i_snapshot_tree *u;
		u32 subvol_id;

		ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
		bch_err_fn(c, ret);

		if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */
			ret = 0;
			goto err;
		}

		if (ret)
			goto err;

		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.master_subvol = cpu_to_le32(subvol_id);
		st = snapshot_tree_i_to_s_c(u);
	}
out:
err:
fsck_err:
	bch2_trans_iter_exit(trans, &snapshot_iter);
	printbuf_exit(&buf);
	return ret;
}

/*
 * For each snapshot_tree, make sure it points to the root of a snapshot tree
 * and that the snapshot entry points back to it, or delete it.
 *
 * And, make sure it points to a subvolume within that snapshot tree, or correct
 * it to point to the oldest subvolume within that snapshot tree.
 */
int bch2_check_snapshot_trees(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
			BTREE_ID_snapshot_trees, POS_MIN,
			BTREE_ITER_prefetch, k,
			NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		check_snapshot_tree(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Look up the snapshot tree for @tree_id, find its root, and make sure
 * @snap_id is a descendant:
 */
static int snapshot_tree_ptr_good(struct btree_trans *trans,
				  u32 snap_id, u32 tree_id)
{
	struct bch_snapshot_tree s_t;
	int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);

	if (bch2_err_matches(ret, ENOENT))
		return 0;
	if (ret)
		return ret;

	return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot));
}

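/*
 * Pick a skip-list entry for a new child of @id: a randomly chosen ancestor
 * of @id (possibly @id itself). Randomized skip pointers are what keep the
 * skip-list walk in get_ancestor_below() short:
 */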
u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id)
{
	const struct snapshot_t *s;

	if (!id)
		return 0;

	rcu_read_lock();
	s = snapshot_t(c, id);
	if (s->parent)
		id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth));
	rcu_read_unlock();

	return id;
}

static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s)
{
	unsigned i;

	for (i = 0; i < 3; i++)
		if (!s.parent) {
			if (s.skip[i])
				return false;
		} else {
			if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i])))
				return false;
		}

	return true;
}

/*
 * snapshot_tree pointer was incorrect: look up root snapshot node, make sure
 * its snapshot_tree pointer is correct (allocate new one if necessary), then
 * update this node's pointer to root node's pointer:
 */
static int snapshot_tree_ptr_repair(struct btree_trans *trans,
				    struct btree_iter *iter,
				    struct bkey_s_c k,
				    struct bch_snapshot *s)
{
	struct bch_fs *c = trans->c;
	struct btree_iter root_iter;
	struct bch_snapshot_tree s_t;
	struct bkey_s_c_snapshot root;
	struct bkey_i_snapshot *u;
	u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id;
	int ret;

	root = bch2_bkey_get_iter_typed(trans, &root_iter,
			       BTREE_ID_snapshots, POS(0, root_id),
			       BTREE_ITER_with_updates, snapshot);
	ret = bkey_err(root);
	if (ret)
		goto err;

	tree_id = le32_to_cpu(root.v->tree);

	ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) {
		u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
		ret =   PTR_ERR_OR_ZERO(u) ?:
			bch2_snapshot_tree_create(trans, root_id,
				bch2_snapshot_tree_oldest_subvol(c, root_id),
				&tree_id);
		if (ret)
			goto err;

		u->v.tree = cpu_to_le32(tree_id);
		if (k.k->p.offset == root_id)
			*s = u->v;
	}

	if (k.k->p.offset != root_id) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.tree = cpu_to_le32(tree_id);
		*s = u->v;
	}
err:
	bch2_trans_iter_exit(trans, &root_iter);
	return ret;
}

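/*
 * Check one snapshot node: parent and child back-pointers, the subvol
 * pointer and BCH_SNAPSHOT_SUBVOL flag, the snapshot_tree pointer, and the
 * depth and skip-list fields (which are repaired from the parent - hence
 * the reverse iteration in bch2_check_snapshots() below):
 */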
static int check_snapshot(struct btree_trans *trans,
			  struct btree_iter *iter,
			  struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bch_snapshot s;
	struct bch_subvolume subvol;
	struct bch_snapshot v;
	struct bkey_i_snapshot *u;
	u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
	u32 real_depth;
	struct printbuf buf = PRINTBUF;
	u32 i, id;
	int ret = 0;

	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	memset(&s, 0, sizeof(s));
	memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k)));

	id = le32_to_cpu(s.parent);
	if (id) {
		ret = bch2_snapshot_lookup(trans, id, &v);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot with nonexistent parent:\n  %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		if (ret)
			goto err;

		if (le32_to_cpu(v.children[0]) != k.k->p.offset &&
		    le32_to_cpu(v.children[1]) != k.k->p.offset) {
			bch_err(c, "snapshot parent %u missing pointer to child %llu",
				id, k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	}

	for (i = 0; i < 2 && s.children[i]; i++) {
		id = le32_to_cpu(s.children[i]);

		ret = bch2_snapshot_lookup(trans, id, &v);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot node %llu has nonexistent child %u",
				k.k->p.offset, id);
		if (ret)
			goto err;

		if (le32_to_cpu(v.parent) != k.k->p.offset) {
			bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
				id, le32_to_cpu(v.parent), k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	}

	bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
		!BCH_SNAPSHOT_DELETED(&s);

	if (should_have_subvol) {
		id = le32_to_cpu(s.subvol);
		ret = bch2_subvolume_get(trans, id, false, &subvol);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot points to nonexistent subvolume:\n  %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		if (ret)
			goto err;

		if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) {
			bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
				k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	} else {
		if (fsck_err_on(s.subvol,
				trans, snapshot_should_not_have_subvol,
				"snapshot should not point to subvol:\n  %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
			u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
			ret = PTR_ERR_OR_ZERO(u);
			if (ret)
				goto err;

			u->v.subvol = 0;
			s = u->v;
		}
	}

	ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree));
	if (ret < 0)
		goto err;

	if (fsck_err_on(!ret,
			trans, snapshot_to_bad_snapshot_tree,
			"snapshot points to missing/incorrect tree:\n  %s",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
		if (ret)
			goto err;
	}
	ret = 0;

	real_depth = bch2_snapshot_depth(c, parent_id);

	if (fsck_err_on(le32_to_cpu(s.depth) != real_depth,
			trans, snapshot_bad_depth,
			"snapshot with incorrect depth field, should be %u:\n  %s",
			real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.depth = cpu_to_le32(real_depth);
		s = u->v;
	}

	ret = snapshot_skiplist_good(trans, k.k->p.offset, s);
	if (ret < 0)
		goto err;

	if (fsck_err_on(!ret,
			trans, snapshot_bad_skiplist,
			"snapshot with bad skiplist field:\n  %s",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		for (i = 0; i < ARRAY_SIZE(u->v.skip); i++)
			u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id));

		bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32);
		s = u->v;
	}
	ret = 0;
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

int bch2_check_snapshots(struct bch_fs *c)
{
	/*
	 * We iterate backwards as checking/fixing the depth field requires that
	 * the parent's depth already be correct:
	 */
	int ret = bch2_trans_run(c,
		for_each_btree_key_reverse_commit(trans, iter,
				BTREE_ID_snapshots, POS_MAX,
				BTREE_ITER_prefetch, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_snapshot(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}

static int check_snapshot_exists(struct btree_trans *trans, u32 id)
{
	struct bch_fs *c = trans->c;

	if (bch2_snapshot_exists(c, id))
		return 0;

	/* Do we need to reconstruct the snapshot_tree entry as well? */
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;
	u32 tree_id = 0;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN,
				     0, k, ret) {
		if (le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) {
			tree_id = k.k->p.offset;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		return ret;

	if (!tree_id) {
		ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
		if (ret)
			return ret;
	}

	struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot));
	ret = PTR_ERR_OR_ZERO(snapshot);
	if (ret)
		return ret;

	bkey_snapshot_init(&snapshot->k_i);
	snapshot->k.p		= POS(0, id);
	snapshot->v.tree	= cpu_to_le32(tree_id);
	snapshot->v.btime.lo	= cpu_to_le64(bch2_current_time(c));

	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
				     0, k, ret) {
		if (le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) {
			snapshot->v.subvol = cpu_to_le32(k.k->p.offset);
			SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true);
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	return  bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?:
		bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
				   bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0);
}

/* Figure out which snapshot nodes belong in the same tree: */
struct snapshot_tree_reconstruct {
	enum btree_id			btree;
	struct bpos			cur_pos;
	snapshot_id_list		cur_ids;
	DARRAY(snapshot_id_list)	trees;
};

static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r)
{
	darray_for_each(r->trees, i)
		darray_exit(i);
	darray_exit(&r->trees);
	darray_exit(&r->cur_ids);
}

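/*
 * In the inodes btree the inode number lives in the offset field of the
 * key; in the other snapshot-aware btrees it lives in the inode field. Two
 * keys refer to versions of the same object if those match, regardless of
 * the snapshot field:
 */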
static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos)
{
	return r->btree == BTREE_ID_inodes
		? r->cur_pos.offset == pos.offset
		: r->cur_pos.inode == pos.inode;
}

static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r)
{
	darray_for_each(*l, i)
		if (snapshot_list_has_id(r, *i))
			return true;
	return false;
}

static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s)
{
	bool first = true;
	darray_for_each(*s, i) {
		if (!first)
			prt_char(out, ' ');
		first = false;
		prt_printf(out, "%u", *i);
	}
}

static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r)
{
	if (r->cur_ids.nr) {
		darray_for_each(r->trees, i)
			if (snapshot_id_lists_have_common(i, &r->cur_ids)) {
				int ret = snapshot_list_merge(c, i, &r->cur_ids);
				if (ret)
					return ret;
				goto out;
			}
		darray_push(&r->trees, r->cur_ids);
		darray_init(&r->cur_ids);
	}
out:
	r->cur_ids.nr = 0;
	return 0;
}

static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos)
{
	if (!same_snapshot(r, pos))
		snapshot_tree_reconstruct_next(c, r);
	r->cur_pos = pos;
	return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot);
}

int bch2_reconstruct_snapshots(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct printbuf buf = PRINTBUF;
	struct snapshot_tree_reconstruct r = {};
	int ret = 0;

	for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
		if (btree_type_has_snapshots(btree)) {
			r.btree = btree;

			ret = for_each_btree_key(trans, iter, btree, POS_MIN,
					BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({
				get_snapshot_trees(c, &r, k.k->p);
			}));
			if (ret)
				goto err;

			snapshot_tree_reconstruct_next(c, &r);
		}
	}

	darray_for_each(r.trees, t) {
		printbuf_reset(&buf);
		snapshot_id_list_to_text(&buf, t);

		darray_for_each(*t, id) {
			if (fsck_err_on(!bch2_snapshot_exists(c, *id),
					trans, snapshot_node_missing,
					"snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) {
				if (t->nr > 1) {
					bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
					ret = -BCH_ERR_fsck_repair_unimplemented;
					goto err;
				}

				ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
						check_snapshot_exists(trans, *id));
				if (ret)
					goto err;
			}
		}
	}
fsck_err:
err:
	bch2_trans_put(trans);
	snapshot_tree_reconstruct_exit(&r);
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}

int bch2_check_key_has_snapshot(struct btree_trans *trans,
				struct btree_iter *iter,
				struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (fsck_err_on(!bch2_snapshot_exists(c, k.k->p.snapshot),
			trans, bkey_in_missing_snapshot,
			"key in missing snapshot %s, delete?",
			(bch2_btree_id_to_text(&buf, iter->btree_id),
			 prt_char(&buf, ' '),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
		ret = bch2_btree_delete_at(trans, iter,
					    BTREE_UPDATE_internal_snapshot_node) ?: 1;
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

/*
 * Mark a snapshot as deleted, for future cleanup:
 */
int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
{
	struct btree_iter iter;
	struct bkey_i_snapshot *s =
		bch2_bkey_get_mut_typed(trans, &iter,
				    BTREE_ID_snapshots, POS(0, id),
				    0, snapshot);
	int ret = PTR_ERR_OR_ZERO(s);
	if (unlikely(ret)) {
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
					trans->c, "missing snapshot %u", id);
		return ret;
	}

	/* already deleted? */
	if (BCH_SNAPSHOT_DELETED(&s->v))
		goto err;

	SET_BCH_SNAPSHOT_DELETED(&s->v, true);
	SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
	s->v.subvol = 0;
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

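/*
 * Child pointers are kept sorted descending - children[0] >= children[1] -
 * which is the "normalized" form bch2_snapshot_validate() checks for:
 */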
static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
{
	if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1]))
		swap(s->children[0], s->children[1]);
}

static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
	struct btree_iter c_iter = (struct btree_iter) { NULL };
	struct btree_iter tree_iter = (struct btree_iter) { NULL };
	struct bkey_s_c_snapshot s;
	u32 parent_id, child_id;
	unsigned i;
	int ret = 0;

	s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
				     BTREE_ITER_intent, snapshot);
	ret = bkey_err(s);
	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
				"missing snapshot %u", id);

	if (ret)
		goto err;

	BUG_ON(s.v->children[1]);

	parent_id = le32_to_cpu(s.v->parent);
	child_id = le32_to_cpu(s.v->children[0]);

	if (parent_id) {
		struct bkey_i_snapshot *parent;

		parent = bch2_bkey_get_mut_typed(trans, &p_iter,
				     BTREE_ID_snapshots, POS(0, parent_id),
				     0, snapshot);
		ret = PTR_ERR_OR_ZERO(parent);
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
					"missing snapshot %u", parent_id);
		if (unlikely(ret))
			goto err;

		/* find entry in parent->children for node being deleted */
		for (i = 0; i < 2; i++)
			if (le32_to_cpu(parent->v.children[i]) == id)
				break;

		if (bch2_fs_inconsistent_on(i == 2, c,
					"snapshot %u missing child pointer to %u",
					parent_id, id))
			goto err;

		parent->v.children[i] = cpu_to_le32(child_id);

		normalize_snapshot_child_pointers(&parent->v);
	}

	if (child_id) {
		struct bkey_i_snapshot *child;

		child = bch2_bkey_get_mut_typed(trans, &c_iter,
				     BTREE_ID_snapshots, POS(0, child_id),
				     0, snapshot);
		ret = PTR_ERR_OR_ZERO(child);
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
					"missing snapshot %u", child_id);
		if (unlikely(ret))
			goto err;

		child->v.parent = cpu_to_le32(parent_id);

		if (!child->v.parent) {
			child->v.skip[0] = 0;
			child->v.skip[1] = 0;
			child->v.skip[2] = 0;
		}
	}

	if (!parent_id) {
		/*
		 * We're deleting the root of a snapshot tree: update the
		 * snapshot_tree entry to point to the new root, or delete it if
		 * this is the last snapshot ID in this tree:
		 */
		struct bkey_i_snapshot_tree *s_t;

		BUG_ON(s.v->children[1]);

		s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
				BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)),
				0, snapshot_tree);
		ret = PTR_ERR_OR_ZERO(s_t);
		if (ret)
			goto err;

		if (s.v->children[0]) {
			s_t->v.root_snapshot = s.v->children[0];
		} else {
			s_t->k.type = KEY_TYPE_deleted;
			set_bkey_val_u64s(&s_t->k, 0);
		}
	}

	ret = bch2_btree_delete_at(trans, &iter, 0);
err:
	bch2_trans_iter_exit(trans, &tree_iter);
	bch2_trans_iter_exit(trans, &p_iter);
	bch2_trans_iter_exit(trans, &c_iter);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

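/*
 * Allocate @nr_snapids new snapshot nodes as children of @parent: new IDs
 * are taken from the empty slots just below the lowest existing snapshot
 * key, preserving the invariant that children always have lower IDs than
 * their parents:
 */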
static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
			  u32 *new_snapids,
			  u32 *snapshot_subvols,
			  unsigned nr_snapids)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_i_snapshot *n;
	struct bkey_s_c k;
	unsigned i, j;
	u32 depth = bch2_snapshot_depth(c, parent);
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
			     POS_MIN, BTREE_ITER_intent);
	k = bch2_btree_iter_peek(&iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	for (i = 0; i < nr_snapids; i++) {
		k = bch2_btree_iter_prev_slot(&iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		if (!k.k || !k.k->p.offset) {
			ret = -BCH_ERR_ENOSPC_snapshot_create;
			goto err;
		}

		n = bch2_bkey_alloc(trans, &iter, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(n);
		if (ret)
			goto err;

		n->v.flags	= 0;
		n->v.parent	= cpu_to_le32(parent);
		n->v.subvol	= cpu_to_le32(snapshot_subvols[i]);
		n->v.tree	= cpu_to_le32(tree);
		n->v.depth	= cpu_to_le32(depth);
		n->v.btime.lo	= cpu_to_le64(bch2_current_time(c));
		n->v.btime.hi	= 0;

		for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
			n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));

		bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
		SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);

		ret = __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
					 bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
		if (ret)
			goto err;

		new_snapids[i]	= iter.pos.offset;
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Create new snapshot IDs as children of an existing snapshot ID:
 */
static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent,
			      u32 *new_snapids,
			      u32 *snapshot_subvols,
			      unsigned nr_snapids)
{
	struct btree_iter iter;
	struct bkey_i_snapshot *n_parent;
	int ret = 0;

	n_parent = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_snapshots, POS(0, parent),
			0, snapshot);
	ret = PTR_ERR_OR_ZERO(n_parent);
	if (unlikely(ret)) {
		if (bch2_err_matches(ret, ENOENT))
			bch_err(trans->c, "snapshot %u not found", parent);
		return ret;
	}

	if (n_parent->v.children[0] || n_parent->v.children[1]) {
		bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
		ret = -EINVAL;
		goto err;
	}

	ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree),
			     new_snapids, snapshot_subvols, nr_snapids);
	if (ret)
		goto err;

	n_parent->v.children[0] = cpu_to_le32(new_snapids[0]);
	n_parent->v.children[1] = cpu_to_le32(new_snapids[1]);
	n_parent->v.subvol = 0;
	SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Create a snapshot node that is the root of a new tree:
 */
static int bch2_snapshot_node_create_tree(struct btree_trans *trans,
			      u32 *new_snapids,
			      u32 *snapshot_subvols,
			      unsigned nr_snapids)
{
	struct bkey_i_snapshot_tree *n_tree;
	int ret;

	n_tree = __bch2_snapshot_tree_create(trans);
	ret =   PTR_ERR_OR_ZERO(n_tree) ?:
		create_snapids(trans, 0, n_tree->k.p.offset,
			     new_snapids, snapshot_subvols, nr_snapids);
	if (ret)
		return ret;

	n_tree->v.master_subvol	= cpu_to_le32(snapshot_subvols[0]);
	n_tree->v.root_snapshot	= cpu_to_le32(new_snapids[0]);
	return 0;
}

int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
			      u32 *new_snapids,
			      u32 *snapshot_subvols,
			      unsigned nr_snapids)
{
	BUG_ON((parent == 0) != (nr_snapids == 1));
	BUG_ON((parent != 0) != (nr_snapids == 2));

	return parent
		? bch2_snapshot_node_create_children(trans, parent,
				new_snapids, snapshot_subvols, nr_snapids)
		: bch2_snapshot_node_create_tree(trans,
				new_snapids, snapshot_subvols, nr_snapids);

}

/*
 * If we have an unlinked inode in an internal snapshot node, and the inode
 * really has been deleted in all child snapshots, how does this get cleaned up?
 *
 * First there is the problem of how keys that have been overwritten in all
 * child snapshots get deleted (unimplemented?), but inodes may perhaps be
 * special?
 *
 * Also: an unlinked inode in an internal snapshot appears not to be getting
 * deleted correctly if the inode doesn't exist in leaf snapshots.
 *
 * Solution:
 *
 * For a key in an interior snapshot node that needs work to be done that
 * requires it to be mutated: iterate over all descendant leaf nodes and copy
 * that key to snapshot leaf nodes, where we can mutate it.
 */

struct snapshot_interior_delete {
	u32	id;
	u32	live_child;
};
typedef DARRAY(struct snapshot_interior_delete) interior_delete_list;

static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id)
{
	darray_for_each(*l, i)
		if (i->id == id)
			return i->live_child;
	return 0;
}

static unsigned __live_child(struct snapshot_table *t, u32 id,
			     snapshot_id_list *delete_leaves,
			     interior_delete_list *delete_interior)
{
	struct snapshot_t *s = __snapshot_t(t, id);
	if (!s)
		return 0;

	for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++)
		if (s->children[i] &&
		    !snapshot_list_has_id(delete_leaves, s->children[i]) &&
		    !interior_delete_has_id(delete_interior, s->children[i]))
			return s->children[i];

	for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) {
		u32 live_child = s->children[i]
			? __live_child(t, s->children[i], delete_leaves, delete_interior)
			: 0;
		if (live_child)
			return live_child;
	}

	return 0;
}

static unsigned live_child(struct bch_fs *c, u32 id,
			   snapshot_id_list *delete_leaves,
			   interior_delete_list *delete_interior)
{
	rcu_read_lock();
	u32 ret = __live_child(rcu_dereference(c->snapshots), id,
			       delete_leaves, delete_interior);
	rcu_read_unlock();
	return ret;
}

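/*
 * Process one key while deleting dead snapshots: keys in deleted leaf nodes
 * are simply deleted; keys in deleted interior nodes are moved to the
 * node's single live child - unless the child already has its own version
 * of the key, in which case the interior key was already overridden there
 * and can just be dropped:
 */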
static int delete_dead_snapshots_process_key(struct btree_trans *trans,
					     struct btree_iter *iter,
					     struct bkey_s_c k,
					     snapshot_id_list *delete_leaves,
					     interior_delete_list *delete_interior)
{
	if (snapshot_list_has_id(delete_leaves, k.k->p.snapshot))
		return bch2_btree_delete_at(trans, iter,
					    BTREE_UPDATE_internal_snapshot_node);

	u32 live_child = interior_delete_has_id(delete_interior, k.k->p.snapshot);
	if (live_child) {
		struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
		int ret = PTR_ERR_OR_ZERO(new);
		if (ret)
			return ret;

		new->k.p.snapshot = live_child;

		struct btree_iter dst_iter;
		struct bkey_s_c dst_k = bch2_bkey_get_iter(trans, &dst_iter,
							   iter->btree_id, new->k.p,
							   BTREE_ITER_all_snapshots|
							   BTREE_ITER_intent);
		ret = bkey_err(dst_k);
		if (ret)
			return ret;

		ret =   (bkey_deleted(dst_k.k)
			 ? bch2_trans_update(trans, &dst_iter, new,
					     BTREE_UPDATE_internal_snapshot_node)
			 : 0) ?:
			bch2_btree_delete_at(trans, iter,
					     BTREE_UPDATE_internal_snapshot_node);
		bch2_trans_iter_exit(trans, &dst_iter);
		return ret;
	}

	return 0;
}

/*
 * For a given snapshot, if it doesn't have a subvolume that points to it, and
 * it doesn't have child snapshot nodes - it's now redundant and we can mark it
 * as deleted.
 */
static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k,
					snapshot_id_list *delete_leaves,
					interior_delete_list *delete_interior)
{
	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	struct bch_fs *c = trans->c;
	struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
	unsigned live_children = 0;

	if (BCH_SNAPSHOT_SUBVOL(s.v))
		return 0;

	for (unsigned i = 0; i < 2; i++) {
		u32 child = le32_to_cpu(s.v->children[i]);

		live_children += child &&
			!snapshot_list_has_id(delete_leaves, child);
	}

	if (live_children == 0) {
		return snapshot_list_add(c, delete_leaves, s.k->p.offset);
	} else if (live_children == 1) {
		struct snapshot_interior_delete d = {
			.id		= s.k->p.offset,
			.live_child	= live_child(c, s.k->p.offset, delete_leaves, delete_interior),
		};

		if (!d.live_child) {
			bch_err(c, "error finding live child of snapshot %u", d.id);
			return -EINVAL;
		}

		return darray_push(delete_interior, d);
	} else {
		return 0;
	}
}

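/*
 * Like bch2_snapshot_nth_parent(), but counting only ancestors that aren't
 * themselves slated for deletion:
 */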
static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
						interior_delete_list *skip)
{
	rcu_read_lock();
	while (interior_delete_has_id(skip, id))
		id = __bch2_snapshot_parent(c, id);

	while (n--) {
		do {
			id = __bch2_snapshot_parent(c, id);
		} while (interior_delete_has_id(skip, id));
	}
	rcu_read_unlock();

	return id;
}

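/*
 * When interior nodes are deleted, each surviving descendant's depth shrinks
 * by the number of deleted ancestors, and any skip-list entries that pointed
 * at deleted nodes must be re-rolled to point at surviving ancestors:
 */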
static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
					      struct btree_iter *iter, struct bkey_s_c k,
					      interior_delete_list *deleted)
{
	struct bch_fs *c = trans->c;
	u32 nr_deleted_ancestors = 0;
	struct bkey_i_snapshot *s;
	int ret;

	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	if (interior_delete_has_id(deleted, k.k->p.offset))
		return 0;

	s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot);
	ret = PTR_ERR_OR_ZERO(s);
	if (ret)
		return ret;

	darray_for_each(*deleted, i)
		nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, i->id);

	if (!nr_deleted_ancestors)
		return 0;

	le32_add_cpu(&s->v.depth, -nr_deleted_ancestors);

	if (!s->v.depth) {
		s->v.skip[0] = 0;
		s->v.skip[1] = 0;
		s->v.skip[2] = 0;
	} else {
		u32 depth = le32_to_cpu(s->v.depth);
		u32 parent = bch2_snapshot_parent(c, s->k.p.offset);

		for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) {
			u32 id = le32_to_cpu(s->v.skip[j]);

			if (interior_delete_has_id(deleted, id)) {
				id = bch2_snapshot_nth_parent_skip(c,
							parent,
							depth > 1
							? get_random_u32_below(depth - 1)
							: 0,
							deleted);
				s->v.skip[j] = cpu_to_le32(id);
			}
		}

		bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32);
	}

	return bch2_trans_update(trans, iter, &s->k_i, 0);
}

int bch2_delete_dead_snapshots(struct bch_fs *c)
{
	if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
		return 0;

	struct btree_trans *trans = bch2_trans_get(c);
	snapshot_id_list delete_leaves = {};
	interior_delete_list delete_interior = {};
	int ret = 0;

	/*
	 * For every snapshot node: If we have no live children and it's not
	 * pointed to by a subvolume, delete it:
	 */
	ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k,
		check_should_delete_snapshot(trans, k, &delete_leaves, &delete_interior));
	if (!bch2_err_matches(ret, EROFS))
		bch_err_msg(c, ret, "walking snapshots");
	if (ret)
		goto err;

	if (!delete_leaves.nr && !delete_interior.nr)
		goto err;

	{
		struct printbuf buf = PRINTBUF;
		prt_printf(&buf, "deleting leaves");
		darray_for_each(delete_leaves, i)
			prt_printf(&buf, " %u", *i);

		prt_printf(&buf, " interior");
		darray_for_each(delete_interior, i)
			prt_printf(&buf, " %u->%u", i->id, i->live_child);

		ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf));
		printbuf_exit(&buf);
		if (ret)
			goto err;
	}

	for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
		struct disk_reservation res = { 0 };

		if (!btree_type_has_snapshots(btree))
			continue;

		ret = for_each_btree_key_commit(trans, iter,
				btree, POS_MIN,
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				&res, NULL, BCH_TRANS_COMMIT_no_enospc,
			delete_dead_snapshots_process_key(trans, &iter, k,
							  &delete_leaves,
							  &delete_interior));

		bch2_disk_reservation_put(c, &res);

		if (!bch2_err_matches(ret, EROFS))
			bch_err_msg(c, ret, "deleting keys from dying snapshots");
		if (ret)
			goto err;
	}

	darray_for_each(delete_leaves, i) {
		ret = commit_do(trans, NULL, NULL, 0,
			bch2_snapshot_node_delete(trans, *i));
		if (!bch2_err_matches(ret, EROFS))
			bch_err_msg(c, ret, "deleting snapshot %u", *i);
		if (ret)
			goto err;
	}

	/*
	 * Fixing children of deleted snapshots can't be done completely
	 * atomically: if we crash between here and when we delete the interior
	 * nodes, some depth fields will be off.
	 */
	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
				  BTREE_ITER_intent, k,
				  NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &delete_interior));
	if (ret)
		goto err;

	darray_for_each(delete_interior, i) {
		ret = commit_do(trans, NULL, NULL, 0,
			bch2_snapshot_node_delete(trans, i->id));
		if (!bch2_err_matches(ret, EROFS))
			bch_err_msg(c, ret, "deleting snapshot %u", i->id);
		if (ret)
			goto err;
	}
err:
	darray_exit(&delete_interior);
	darray_exit(&delete_leaves);
	bch2_trans_put(trans);
	if (!bch2_err_matches(ret, EROFS))
		bch_err_fn(c, ret);
	return ret;
}

void bch2_delete_dead_snapshots_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);

	set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name);

	bch2_delete_dead_snapshots(c);
	bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
}

void bch2_delete_dead_snapshots_async(struct bch_fs *c)
{
	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots))
		return;

	BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));

	if (!queue_work(c->write_ref_wq, &c->snapshot_delete_work))
		bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
}

int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
				       enum btree_id id,
				       struct bpos pos)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	for_each_btree_key_reverse_norestart(trans, iter, id, bpos_predecessor(pos),
					     BTREE_ITER_not_extents|
					     BTREE_ITER_all_snapshots,
					     k, ret) {
		if (!bkey_eq(pos, k.k->p))
			break;

		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
			ret = 1;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}

static bool interior_snapshot_needs_delete(struct bkey_s_c_snapshot snap)
{
	/* If there's one child, it's redundant and keys will be moved to the child */
	return !!snap.v->children[0] + !!snap.v->children[1] == 1;
}

static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
{
	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k);
	if (BCH_SNAPSHOT_DELETED(snap.v) ||
	    interior_snapshot_needs_delete(snap))
		set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags);

	return 0;
}

int bch2_snapshots_read(struct bch_fs *c)
{
	/*
	 * Initializing the is_ancestor bitmaps requires ancestors to already be
	 * initialized - so mark in reverse:
	 */
	int ret = bch2_trans_run(c,
		for_each_btree_key_reverse(trans, iter, BTREE_ID_snapshots,
				   POS_MAX, 0, k,
			__bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
			bch2_check_snapshot_needs_deletion(trans, k)));
	bch_err_fn(c, ret);

	/*
	 * It's important that we check if we need to reconstruct snapshots
	 * before going RW, so we mark that pass as required in the superblock -
	 * otherwise, we could end up deleting keys with missing snapshot nodes
	 * instead.
	 */
	BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) &&
	       test_bit(BCH_FS_may_go_rw, &c->flags));

	if (bch2_err_matches(ret, EIO) ||
	    (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)))
		ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots);

	return ret;
}

void bch2_fs_snapshots_exit(struct bch_fs *c)
{
	kvfree(rcu_dereference_protected(c->snapshots, true));
}