// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bbpos.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_key_cache.h"
#include "btree_update.h"
#include "buckets.h"
#include "enumerated_ref.h"
#include "errcode.h"
#include "error.h"
#include "fs.h"
#include "recovery_passes.h"
#include "snapshot.h"

#include <linux/random.h>

/*
 * Snapshot trees:
 *
 * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they
 * exist to provide a stable identifier for the whole lifetime of a snapshot
 * tree.
 */

void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
				struct bkey_s_c k)
{
	struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k);

	prt_printf(out, "subvol %u root snapshot %u",
		   le32_to_cpu(t.v->master_subvol),
		   le32_to_cpu(t.v->root_snapshot));
}

int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k,
				struct bkey_validate_context from)
{
	int ret = 0;

	bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
			 bkey_lt(k.k->p, POS(0, 1)),
			 c, snapshot_tree_pos_bad,
			 "bad pos");
fsck_err:
	return ret;
}

int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
			      struct bch_snapshot_tree *s)
{
	int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
					  BTREE_ITER_with_updates, snapshot_tree, s);

	if (bch2_err_matches(ret, ENOENT))
		ret = bch_err_throw(trans->c, ENOENT_snapshot_tree);
	return ret;
}

struct bkey_i_snapshot_tree *
__bch2_snapshot_tree_create(struct btree_trans *trans)
{
	struct btree_iter iter;
	int ret = bch2_bkey_get_empty_slot(trans, &iter,
			BTREE_ID_snapshot_trees, POS(0, U32_MAX));
	struct bkey_i_snapshot_tree *s_t;

	if (ret == -BCH_ERR_ENOSPC_btree_slot)
		ret = bch_err_throw(trans->c, ENOSPC_snapshot_tree);
	if (ret)
		return ERR_PTR(ret);

	s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree);
	ret = PTR_ERR_OR_ZERO(s_t);
	bch2_trans_iter_exit(trans, &iter);
	return ret ? ERR_PTR(ret) : s_t;
}

static int bch2_snapshot_tree_create(struct btree_trans *trans,
				u32 root_id, u32 subvol_id, u32 *tree_id)
{
	struct bkey_i_snapshot_tree *n_tree =
		__bch2_snapshot_tree_create(trans);

	if (IS_ERR(n_tree))
		return PTR_ERR(n_tree);

	n_tree->v.master_subvol	= cpu_to_le32(subvol_id);
	n_tree->v.root_snapshot	= cpu_to_le32(root_id);
	*tree_id = n_tree->k.p.offset;
	return 0;
}

/* Snapshot nodes: */

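/*
 * Linear-time ancestry check, safe to use before the check_snapshots recovery
 * pass has completed: walk parent pointers one node at a time until we reach
 * @ancestor or run past it.
 */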
static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor)
{
	while (id && id < ancestor) {
		const struct snapshot_t *s = __snapshot_t(t, id);
		id = s ? s->parent : 0;
	}
	return id == ancestor;
}

static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
{
	guard(rcu)();
	return __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
}

static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
{
	const struct snapshot_t *s = __snapshot_t(t, id);
	if (!s)
		return 0;

	if (s->skip[2] <= ancestor)
		return s->skip[2];
	if (s->skip[1] <= ancestor)
		return s->skip[1];
	if (s->skip[0] <= ancestor)
		return s->skip[0];
	return s->parent;
}

static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor)
{
	const struct snapshot_t *s = __snapshot_t(t, id);
	if (!s)
		return false;

	return test_bit(ancestor - id - 1, s->is_ancestor);
}

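/*
 * Fast ancestry check: use the skiplist to jump to within IS_ANCESTOR_BITMAP
 * IDs of @ancestor, then finish with a single test of the per-node
 * is_ancestor bitmap.  Falls back to the linear walk during early recovery,
 * and cross-checks against it on CONFIG_BCACHEFS_DEBUG builds.
 */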
bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	u32 orig_id = id;
#endif

	guard(rcu)();
	struct snapshot_table *t = rcu_dereference(c->snapshots);

	if (unlikely(c->recovery.pass_done < BCH_RECOVERY_PASS_check_snapshots))
		return __bch2_snapshot_is_ancestor_early(t, id, ancestor);

	if (likely(ancestor >= IS_ANCESTOR_BITMAP))
		while (id && id < ancestor - IS_ANCESTOR_BITMAP)
			id = get_ancestor_below(t, id, ancestor);

	bool ret = id && id < ancestor
		? test_ancestor_bitmap(t, id, ancestor)
		: id == ancestor;

	EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, orig_id, ancestor));
	return ret;
}

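/*
 * Slow path for snapshot_t_mut(): reallocate the in-memory snapshot table so
 * it covers @id.  The table is indexed by U32_MAX - id (snapshot IDs are
 * allocated from U32_MAX counting down); the old table is copied over and
 * freed via RCU.
 */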
static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
{
	size_t idx = U32_MAX - id;
	struct snapshot_table *new, *old;

	size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1));
	size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]);

	if (unlikely(new_bytes > INT_MAX))
		return NULL;

	new = kvzalloc(new_bytes, GFP_KERNEL);
	if (!new)
		return NULL;

	new->nr = new_size;

	old = rcu_dereference_protected(c->snapshots, true);
	if (old)
		memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr);

	rcu_assign_pointer(c->snapshots, new);
	kvfree_rcu(old, rcu);

	return &rcu_dereference_protected(c->snapshots,
				lockdep_is_held(&c->snapshot_table_lock))->s[idx];
}

static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
{
	size_t idx = U32_MAX - id;
	struct snapshot_table *table =
		rcu_dereference_protected(c->snapshots,
				lockdep_is_held(&c->snapshot_table_lock));

	lockdep_assert_held(&c->snapshot_table_lock);

	if (likely(table && idx < table->nr))
		return &table->s[idx];

	return __snapshot_t_mut(c, id);
}

void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
			   struct bkey_s_c k)
{
	struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);

	if (BCH_SNAPSHOT_SUBVOL(s.v))
		prt_str(out, "subvol ");
	if (BCH_SNAPSHOT_WILL_DELETE(s.v))
		prt_str(out, "will_delete ");
	if (BCH_SNAPSHOT_DELETED(s.v))
		prt_str(out, "deleted ");

	prt_printf(out, "parent %10u children %10u %10u subvol %u tree %u",
	       le32_to_cpu(s.v->parent),
	       le32_to_cpu(s.v->children[0]),
	       le32_to_cpu(s.v->children[1]),
	       le32_to_cpu(s.v->subvol),
	       le32_to_cpu(s.v->tree));

	if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth))
		prt_printf(out, " depth %u skiplist %u %u %u",
			   le32_to_cpu(s.v->depth),
			   le32_to_cpu(s.v->skip[0]),
			   le32_to_cpu(s.v->skip[1]),
			   le32_to_cpu(s.v->skip[2]));
}

int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k,
			   struct bkey_validate_context from)
{
	struct bkey_s_c_snapshot s;
	u32 i, id;
	int ret = 0;

	bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
			 bkey_lt(k.k->p, POS(0, 1)),
			 c, snapshot_pos_bad,
			 "bad pos");

	s = bkey_s_c_to_snapshot(k);

	id = le32_to_cpu(s.v->parent);
	bkey_fsck_err_on(id && id <= k.k->p.offset,
			 c, snapshot_parent_bad,
			 "bad parent node (%u <= %llu)",
			 id, k.k->p.offset);

	bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]),
			 c, snapshot_children_not_normalized,
			 "children not normalized");

	bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1],
			 c, snapshot_child_duplicate,
			 "duplicate child nodes");

	for (i = 0; i < 2; i++) {
		id = le32_to_cpu(s.v->children[i]);

		bkey_fsck_err_on(id >= k.k->p.offset,
				 c, snapshot_child_bad,
				 "bad child node (%u >= %llu)",
				 id, k.k->p.offset);
	}

	if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
		bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
				 le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]),
				 c, snapshot_skiplist_not_normalized,
				 "skiplist not normalized");

		for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
			id = le32_to_cpu(s.v->skip[i]);

			bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent),
					 c, snapshot_skiplist_bad,
					 "bad skiplist node %u", id);
		}
	}
fsck_err:
	return ret;
}

static int bch2_snapshot_table_make_room(struct bch_fs *c, u32 id)
{
	mutex_lock(&c->snapshot_table_lock);
	int ret = snapshot_t_mut(c, id)
		? 0
		: bch_err_throw(c, ENOMEM_mark_snapshot);
	mutex_unlock(&c->snapshot_table_lock);
	return ret;
}

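/*
 * Trigger for the snapshots btree: mirror the key being written into the
 * in-memory snapshot table (state, parent/child pointers, skiplist and
 * is_ancestor bitmap), and kick off asynchronous deletion of dead snapshots
 * when a node is flagged BCH_SNAPSHOT_WILL_DELETE.
 */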
static int __bch2_mark_snapshot(struct btree_trans *trans,
		       enum btree_id btree, unsigned level,
		       struct bkey_s_c old, struct bkey_s_c new,
		       enum btree_iter_update_trigger_flags flags)
{
	struct bch_fs *c = trans->c;
	struct snapshot_t *t;
	u32 id = new.k->p.offset;
	int ret = 0;

	mutex_lock(&c->snapshot_table_lock);

	t = snapshot_t_mut(c, id);
	if (!t) {
		ret = bch_err_throw(c, ENOMEM_mark_snapshot);
		goto err;
	}

	if (new.k->type == KEY_TYPE_snapshot) {
		struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);

		t->state	= !BCH_SNAPSHOT_DELETED(s.v)
			? SNAPSHOT_ID_live
			: SNAPSHOT_ID_deleted;
		t->parent	= le32_to_cpu(s.v->parent);
		t->children[0]	= le32_to_cpu(s.v->children[0]);
		t->children[1]	= le32_to_cpu(s.v->children[1]);
		t->subvol	= BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
		t->tree		= le32_to_cpu(s.v->tree);

		if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) {
			t->depth	= le32_to_cpu(s.v->depth);
			t->skip[0]	= le32_to_cpu(s.v->skip[0]);
			t->skip[1]	= le32_to_cpu(s.v->skip[1]);
			t->skip[2]	= le32_to_cpu(s.v->skip[2]);
		} else {
			t->depth	= 0;
			t->skip[0]	= 0;
			t->skip[1]	= 0;
			t->skip[2]	= 0;
		}

		u32 parent = id;

		while ((parent = bch2_snapshot_parent_early(c, parent)) &&
		       parent - id - 1 < IS_ANCESTOR_BITMAP)
			__set_bit(parent - id - 1, t->is_ancestor);

		if (BCH_SNAPSHOT_WILL_DELETE(s.v)) {
			set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
			if (c->recovery.pass_done > BCH_RECOVERY_PASS_delete_dead_snapshots)
				bch2_delete_dead_snapshots_async(c);
		}
	} else {
		memset(t, 0, sizeof(*t));
	}
err:
	mutex_unlock(&c->snapshot_table_lock);
	return ret;
}

int bch2_mark_snapshot(struct btree_trans *trans,
		       enum btree_id btree, unsigned level,
		       struct bkey_s_c old, struct bkey_s new,
		       enum btree_iter_update_trigger_flags flags)
{
	return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags);
}

int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
			 struct bch_snapshot *s)
{
	return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
				       BTREE_ITER_with_updates, snapshot, s);
}

/* fsck: */

static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
{
	return snapshot_t(c, id)->children[child];
}

static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id)
{
	return bch2_snapshot_child(c, id, 0);
}

static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id)
{
	return bch2_snapshot_child(c, id, 1);
}

static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
{
	u32 n, parent;

	n = bch2_snapshot_left_child(c, id);
	if (n)
		return n;

	while ((parent = bch2_snapshot_parent(c, id))) {
		n = bch2_snapshot_right_child(c, parent);
		if (n && n != id)
			return n;
		id = parent;
	}

	return 0;
}

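/*
 * Walk every node in the tree rooted at @snapshot_root via
 * bch2_snapshot_tree_next() and return the lowest-numbered (i.e. oldest)
 * subvolume found, ignoring IDs in @skip; if that finds nothing, retry
 * without the skip list.
 */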
u32 bch2_snapshot_oldest_subvol(struct bch_fs *c, u32 snapshot_root,
				snapshot_id_list *skip)
{
	guard(rcu)();
	u32 id, subvol = 0, s;
retry:
	id = snapshot_root;
	while (id && bch2_snapshot_exists(c, id)) {
		if (!(skip && snapshot_list_has_id(skip, id))) {
			s = snapshot_t(c, id)->subvol;

			if (s && (!subvol || s < subvol))
				subvol = s;
		}
		id = bch2_snapshot_tree_next(c, id);
		if (id == snapshot_root)
			break;
	}

	if (!subvol && skip) {
		skip = NULL;
		goto retry;
	}

	return subvol;
}

static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
					    u32 snapshot_root, u32 *subvol_id)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	bool found = false;
	int ret;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
				     0, k, ret) {
		if (k.k->type != KEY_TYPE_subvolume)
			continue;

		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
		if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
			continue;
		if (!BCH_SUBVOLUME_SNAP(s.v)) {
			*subvol_id = s.k->p.offset;
			found = true;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	if (!ret && !found) {
		struct bkey_i_subvolume *u;

		*subvol_id = bch2_snapshot_oldest_subvol(c, snapshot_root, NULL);

		u = bch2_bkey_get_mut_typed(trans, &iter,
					    BTREE_ID_subvolumes, POS(0, *subvol_id),
					    0, subvolume);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			return ret;

		SET_BCH_SUBVOLUME_SNAP(&u->v, false);
	}

	return ret;
}

static int check_snapshot_tree(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c_snapshot_tree st;
	struct bch_snapshot s;
	struct bch_subvolume subvol;
	struct printbuf buf = PRINTBUF;
	struct btree_iter snapshot_iter = {};
	u32 root_id;
	int ret;

	if (k.k->type != KEY_TYPE_snapshot_tree)
		return 0;

	st = bkey_s_c_to_snapshot_tree(k);
	root_id = le32_to_cpu(st.v->root_snapshot);

	struct bkey_s_c_snapshot snapshot_k =
		bch2_bkey_get_iter_typed(trans, &snapshot_iter, BTREE_ID_snapshots,
					 POS(0, root_id), 0, snapshot);
	ret = bkey_err(snapshot_k);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	if (!ret)
		bkey_val_copy(&s, snapshot_k);

	if (fsck_err_on(ret ||
			root_id != bch2_snapshot_root(c, root_id) ||
			st.k->p.offset != le32_to_cpu(s.tree),
			trans, snapshot_tree_to_missing_snapshot,
			"snapshot tree points to missing/incorrect snapshot:\n%s",
			(bch2_bkey_val_to_text(&buf, c, st.s_c),
			 prt_newline(&buf),
			 ret
			 ? prt_printf(&buf, "(%s)", bch2_err_str(ret))
			 : bch2_bkey_val_to_text(&buf, c, snapshot_k.s_c),
			 buf.buf))) {
		ret = bch2_btree_delete_at(trans, iter, 0);
		goto err;
	}

	if (!st.v->master_subvol)
		goto out;

	ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), false, &subvol);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	if (fsck_err_on(ret,
			trans, snapshot_tree_to_missing_subvol,
			"snapshot tree points to missing subvolume:\n%s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
	    fsck_err_on(!bch2_snapshot_is_ancestor(c,
						le32_to_cpu(subvol.snapshot),
						root_id),
			trans, snapshot_tree_to_wrong_subvol,
			"snapshot tree points to subvolume that does not point to snapshot in this tree:\n%s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
	    fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol),
			trans, snapshot_tree_to_snapshot_subvol,
			"snapshot tree points to snapshot subvolume:\n%s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
		struct bkey_i_snapshot_tree *u;
		u32 subvol_id;

		ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
		bch_err_fn(c, ret);

		if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */
			ret = 0;
			goto err;
		}

		if (ret)
			goto err;

		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.master_subvol = cpu_to_le32(subvol_id);
		st = snapshot_tree_i_to_s_c(u);
	}
out:
err:
fsck_err:
	bch2_trans_iter_exit(trans, &snapshot_iter);
	printbuf_exit(&buf);
	return ret;
}

/*
 * For each snapshot_tree, make sure it points to the root of a snapshot tree
 * and that the snapshot entry points back to it, or delete it.
 *
 * And, make sure it points to a subvolume within that snapshot tree, or correct
 * it to point to the oldest subvolume within that snapshot tree.
 */
int bch2_check_snapshot_trees(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
			BTREE_ID_snapshot_trees, POS_MIN,
			BTREE_ITER_prefetch, k,
			NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		check_snapshot_tree(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Look up snapshot tree for @tree_id and find root,
 * make sure @snap_id is a descendant:
 */
static int snapshot_tree_ptr_good(struct btree_trans *trans,
				  u32 snap_id, u32 tree_id)
{
	struct bch_snapshot_tree s_t;
	int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);

	if (bch2_err_matches(ret, ENOENT))
		return 0;
	if (ret)
		return ret;

	return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot));
}

u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id)
{
	if (!id)
		return 0;

	guard(rcu)();
	const struct snapshot_t *s = snapshot_t(c, id);
	return s->parent
		? bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth))
		: id;
}

static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s)
{
	unsigned i;

	for (i = 0; i < 3; i++)
		if (!s.parent) {
			if (s.skip[i])
				return false;
		} else {
			if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i])))
				return false;
		}

	return true;
}

/*
 * snapshot_tree pointer was incorrect: look up root snapshot node, make sure
 * its snapshot_tree pointer is correct (allocate new one if necessary), then
 * update this node's pointer to root node's pointer:
 */
static int snapshot_tree_ptr_repair(struct btree_trans *trans,
				    struct btree_iter *iter,
				    struct bkey_s_c k,
				    struct bch_snapshot *s)
{
	struct bch_fs *c = trans->c;
	struct btree_iter root_iter;
	struct bch_snapshot_tree s_t;
	struct bkey_s_c_snapshot root;
	struct bkey_i_snapshot *u;
	u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id;
	int ret;

	root = bch2_bkey_get_iter_typed(trans, &root_iter,
			       BTREE_ID_snapshots, POS(0, root_id),
			       BTREE_ITER_with_updates, snapshot);
	ret = bkey_err(root);
	if (ret)
		goto err;

	tree_id = le32_to_cpu(root.v->tree);

	ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) {
		u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
		ret =   PTR_ERR_OR_ZERO(u) ?:
			bch2_snapshot_tree_create(trans, root_id,
				bch2_snapshot_oldest_subvol(c, root_id, NULL),
				&tree_id);
		if (ret)
			goto err;

		u->v.tree = cpu_to_le32(tree_id);
		if (k.k->p.offset == root_id)
			*s = u->v;
	}

	if (k.k->p.offset != root_id) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.tree = cpu_to_le32(tree_id);
		*s = u->v;
	}
err:
	bch2_trans_iter_exit(trans, &root_iter);
	return ret;
}

static int check_snapshot(struct btree_trans *trans,
			  struct btree_iter *iter,
			  struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bch_snapshot s;
	struct bch_subvolume subvol;
	struct bch_snapshot v;
	struct bkey_i_snapshot *u;
	u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
	u32 real_depth;
	struct printbuf buf = PRINTBUF;
	u32 i, id;
	int ret = 0;

	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	memset(&s, 0, sizeof(s));
	memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k)));

	if (BCH_SNAPSHOT_DELETED(&s))
		return 0;

	id = le32_to_cpu(s.parent);
	if (id) {
		ret = bch2_snapshot_lookup(trans, id, &v);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot with nonexistent parent:\n  %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		if (ret)
			goto err;

		if (le32_to_cpu(v.children[0]) != k.k->p.offset &&
		    le32_to_cpu(v.children[1]) != k.k->p.offset) {
			bch_err(c, "snapshot parent %u missing pointer to child %llu",
				id, k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	}

	for (i = 0; i < 2 && s.children[i]; i++) {
		id = le32_to_cpu(s.children[i]);

		ret = bch2_snapshot_lookup(trans, id, &v);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot node %llu has nonexistent child %u",
				k.k->p.offset, id);
		if (ret)
			goto err;

		if (le32_to_cpu(v.parent) != k.k->p.offset) {
			bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
				id, le32_to_cpu(v.parent), k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	}

	bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
		!BCH_SNAPSHOT_WILL_DELETE(&s);

	if (should_have_subvol) {
		id = le32_to_cpu(s.subvol);
		ret = bch2_subvolume_get(trans, id, false, &subvol);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot points to nonexistent subvolume:\n  %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		if (ret)
			goto err;

		if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) {
			bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
				k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	} else {
		if (fsck_err_on(s.subvol,
				trans, snapshot_should_not_have_subvol,
				"snapshot should not point to subvol:\n%s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
			u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
			ret = PTR_ERR_OR_ZERO(u);
			if (ret)
				goto err;

			u->v.subvol = 0;
			s = u->v;
		}
	}

	ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree));
	if (ret < 0)
		goto err;

	if (fsck_err_on(!ret,
			trans, snapshot_to_bad_snapshot_tree,
			"snapshot points to missing/incorrect tree:\n%s",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
		if (ret)
			goto err;
	}
	ret = 0;

	real_depth = bch2_snapshot_depth(c, parent_id);

	if (fsck_err_on(le32_to_cpu(s.depth) != real_depth,
			trans, snapshot_bad_depth,
			"snapshot with incorrect depth field, should be %u:\n%s",
			real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.depth = cpu_to_le32(real_depth);
		s = u->v;
	}

	ret = snapshot_skiplist_good(trans, k.k->p.offset, s);
	if (ret < 0)
		goto err;

	if (fsck_err_on(!ret,
			trans, snapshot_bad_skiplist,
			"snapshot with bad skiplist field:\n%s",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		for (i = 0; i < ARRAY_SIZE(u->v.skip); i++)
			u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id));

		bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32);
		s = u->v;
	}
	ret = 0;
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

int bch2_check_snapshots(struct bch_fs *c)
{
	/*
	 * We iterate backwards as checking/fixing the depth field requires that
	 * the parent's depth already be correct:
	 */
	int ret = bch2_trans_run(c,
		for_each_btree_key_reverse_commit(trans, iter,
				BTREE_ID_snapshots, POS_MAX,
				BTREE_ITER_prefetch, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_snapshot(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}

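/*
 * Reconstruct a missing snapshot node: reuse the snapshot_tree entry whose
 * root points at @id (or create one), then recreate the snapshot key itself,
 * marking it as a subvolume snapshot if some subvolume still points at it.
 */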
static int check_snapshot_exists(struct btree_trans *trans, u32 id)
{
	struct bch_fs *c = trans->c;

	/* Do we need to reconstruct the snapshot_tree entry as well? */
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;
	u32 tree_id = 0;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN,
				     0, k, ret) {
		if (k.k->type == KEY_TYPE_snapshot_tree &&
		    le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) {
			tree_id = k.k->p.offset;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		return ret;

	if (!tree_id) {
		ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
		if (ret)
			return ret;
	}

	struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot));
	ret = PTR_ERR_OR_ZERO(snapshot);
	if (ret)
		return ret;

	bkey_snapshot_init(&snapshot->k_i);
	snapshot->k.p		= POS(0, id);
	snapshot->v.tree	= cpu_to_le32(tree_id);
	snapshot->v.btime.lo	= cpu_to_le64(bch2_current_time(c));

	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
				     0, k, ret) {
		if (k.k->type == KEY_TYPE_subvolume &&
		    le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) {
			snapshot->v.subvol = cpu_to_le32(k.k->p.offset);
			SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true);
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	return  bch2_snapshot_table_make_room(c, id) ?:
		bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0);
}

/* Figure out which snapshot nodes belong in the same tree: */
struct snapshot_tree_reconstruct {
	enum btree_id			btree;
	struct bpos			cur_pos;
	snapshot_id_list		cur_ids;
	DARRAY(snapshot_id_list)	trees;
};

static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r)
{
	darray_for_each(r->trees, i)
		darray_exit(i);
	darray_exit(&r->trees);
	darray_exit(&r->cur_ids);
}

static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos)
{
	return r->btree == BTREE_ID_inodes
		? r->cur_pos.offset == pos.offset
		: r->cur_pos.inode == pos.inode;
}

static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r)
{
	return darray_find_p(*l, i, snapshot_list_has_id(r, *i)) != NULL;
}

static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s)
{
	bool first = true;
	darray_for_each(*s, i) {
		if (!first)
			prt_char(out, ' ');
		first = false;
		prt_printf(out, "%u", *i);
	}
}

static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r)
{
	if (r->cur_ids.nr) {
		darray_for_each(r->trees, i)
			if (snapshot_id_lists_have_common(i, &r->cur_ids)) {
				int ret = snapshot_list_merge(c, i, &r->cur_ids);
				if (ret)
					return ret;
				goto out;
			}
		darray_push(&r->trees, r->cur_ids);
		darray_init(&r->cur_ids);
	}
out:
	r->cur_ids.nr = 0;
	return 0;
}

static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos)
{
	if (!same_snapshot(r, pos))
		snapshot_tree_reconstruct_next(c, r);
	r->cur_pos = pos;
	return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot);
}

int bch2_reconstruct_snapshots(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct printbuf buf = PRINTBUF;
	struct snapshot_tree_reconstruct r = {};
	int ret = 0;

	for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
		if (btree_type_has_snapshots(btree)) {
			r.btree = btree;

			ret = for_each_btree_key(trans, iter, btree, POS_MIN,
					BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({
				get_snapshot_trees(c, &r, k.k->p);
			}));
			if (ret)
				goto err;

			snapshot_tree_reconstruct_next(c, &r);
		}
	}

	darray_for_each(r.trees, t) {
		printbuf_reset(&buf);
		snapshot_id_list_to_text(&buf, t);

		darray_for_each(*t, id) {
			if (fsck_err_on(bch2_snapshot_id_state(c, *id) == SNAPSHOT_ID_empty,
					trans, snapshot_node_missing,
					"snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) {
				if (t->nr > 1) {
					bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
					ret = bch_err_throw(c, fsck_repair_unimplemented);
					goto err;
				}

				ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
						check_snapshot_exists(trans, *id));
				if (ret)
					goto err;
			}
		}
	}
fsck_err:
err:
	bch2_trans_put(trans);
	snapshot_tree_reconstruct_exit(&r);
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}

int __bch2_check_key_has_snapshot(struct btree_trans *trans,
				  struct btree_iter *iter,
				  struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = 0;
	enum snapshot_id_state state = bch2_snapshot_id_state(c, k.k->p.snapshot);

	/* Snapshot was definitively deleted; this error is marked autofix */
	if (fsck_err_on(state == SNAPSHOT_ID_deleted,
			trans, bkey_in_deleted_snapshot,
			"key in deleted snapshot %s, delete?",
			(bch2_btree_id_to_text(&buf, iter->btree_id),
			 prt_char(&buf, ' '),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
		ret = bch2_btree_delete_at(trans, iter,
					   BTREE_UPDATE_internal_snapshot_node) ?: 1;

	if (state == SNAPSHOT_ID_empty) {
		/*
		 * Snapshot missing: we should have caught this with btree_lost_data and
		 * kicked off reconstruct_snapshots, so if we end up here we have no
		 * idea what happened.
		 *
		 * Do not delete unless we know that subvolumes and snapshots
		 * are consistent:
		 *
		 * XXX:
		 *
		 * We could be smarter here, and instead of using the generic
		 * recovery pass ratelimiting, track if there have been any
		 * changes to the snapshots or inodes btrees since those passes
		 * last ran.
		 */
		ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_snapshots) ?: ret;
		ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_subvols) ?: ret;

		if (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots))
			ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret;

		unsigned repair_flags = FSCK_CAN_IGNORE | (!ret ? FSCK_CAN_FIX : 0);

		if (__fsck_err(trans, repair_flags, bkey_in_missing_snapshot,
			     "key in missing snapshot %s, delete?",
			     (bch2_btree_id_to_text(&buf, iter->btree_id),
			      prt_char(&buf, ' '),
			      bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
			ret = bch2_btree_delete_at(trans, iter,
						   BTREE_UPDATE_internal_snapshot_node) ?: 1;
		}
	}
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

int __bch2_get_snapshot_overwrites(struct btree_trans *trans,
				   enum btree_id btree, struct bpos pos,
				   snapshot_id_list *s)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	for_each_btree_key_reverse_norestart(trans, iter, btree, bpos_predecessor(pos),
					     BTREE_ITER_all_snapshots, k, ret) {
		if (!bkey_eq(k.k->p, pos))
			break;

		if (!bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) ||
		    snapshot_list_has_ancestor(c, s, k.k->p.snapshot))
			continue;

		ret = snapshot_list_add(c, s, k.k->p.snapshot);
		if (ret)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);
	if (ret)
		darray_exit(s);

	return ret;
}

/*
 * Mark a snapshot as deleted, for future cleanup:
 */
int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
{
	struct btree_iter iter;
	struct bkey_i_snapshot *s =
		bch2_bkey_get_mut_typed(trans, &iter,
				    BTREE_ID_snapshots, POS(0, id),
				    0, snapshot);
	int ret = PTR_ERR_OR_ZERO(s);
	if (unlikely(ret)) {
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
					trans->c, "missing snapshot %u", id);
		return ret;
	}

	/* already deleted? */
	if (BCH_SNAPSHOT_WILL_DELETE(&s->v))
		goto err;

	SET_BCH_SNAPSHOT_WILL_DELETE(&s->v, true);
	SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
	s->v.subvol = 0;
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
{
	if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1]))
		swap(s->children[0], s->children[1]);
}

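/*
 * Remove a snapshot node from the tree: splice @id out by repointing its
 * parent's child pointer and its child's parent pointer at each other, and
 * update or delete the snapshot_tree entry if we're deleting a root.  If the
 * snapshot_deletion_v2 incompat feature can be used the key is kept with
 * BCH_SNAPSHOT_DELETED set; otherwise it's deleted outright.
 */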
static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter, p_iter = {};
	struct btree_iter c_iter = {};
	struct btree_iter tree_iter = {};
	u32 parent_id, child_id;
	unsigned i;
	int ret = 0;

	struct bkey_i_snapshot *s =
		bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
					BTREE_ITER_intent, snapshot);
	ret = PTR_ERR_OR_ZERO(s);
	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
				"missing snapshot %u", id);

	if (ret)
		goto err;

	BUG_ON(BCH_SNAPSHOT_DELETED(&s->v));
	BUG_ON(s->v.children[1]);

	parent_id = le32_to_cpu(s->v.parent);
	child_id = le32_to_cpu(s->v.children[0]);

	if (parent_id) {
		struct bkey_i_snapshot *parent;

		parent = bch2_bkey_get_mut_typed(trans, &p_iter,
				     BTREE_ID_snapshots, POS(0, parent_id),
				     0, snapshot);
		ret = PTR_ERR_OR_ZERO(parent);
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
					"missing snapshot %u", parent_id);
		if (unlikely(ret))
			goto err;

		/* find entry in parent->children for node being deleted */
		for (i = 0; i < 2; i++)
			if (le32_to_cpu(parent->v.children[i]) == id)
				break;

		if (bch2_fs_inconsistent_on(i == 2, c,
					"snapshot %u missing child pointer to %u",
					parent_id, id))
			goto err;

		parent->v.children[i] = cpu_to_le32(child_id);

		normalize_snapshot_child_pointers(&parent->v);
	}

	if (child_id) {
		struct bkey_i_snapshot *child;

		child = bch2_bkey_get_mut_typed(trans, &c_iter,
				     BTREE_ID_snapshots, POS(0, child_id),
				     0, snapshot);
		ret = PTR_ERR_OR_ZERO(child);
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
					"missing snapshot %u", child_id);
		if (unlikely(ret))
			goto err;

		child->v.parent = cpu_to_le32(parent_id);

		if (!child->v.parent) {
			child->v.skip[0] = 0;
			child->v.skip[1] = 0;
			child->v.skip[2] = 0;
		}
	}

	if (!parent_id) {
		/*
		 * We're deleting the root of a snapshot tree: update the
		 * snapshot_tree entry to point to the new root, or delete it if
		 * this is the last snapshot ID in this tree:
		 */
		struct bkey_i_snapshot_tree *s_t;

		BUG_ON(s->v.children[1]);

		s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
				BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s->v.tree)),
				0, snapshot_tree);
		ret = PTR_ERR_OR_ZERO(s_t);
		if (ret)
			goto err;

		if (s->v.children[0]) {
			s_t->v.root_snapshot = s->v.children[0];
		} else {
			s_t->k.type = KEY_TYPE_deleted;
			set_bkey_val_u64s(&s_t->k, 0);
		}
	}

	if (!bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)) {
		SET_BCH_SNAPSHOT_DELETED(&s->v, true);
		s->v.parent		= 0;
		s->v.children[0]	= 0;
		s->v.children[1]	= 0;
		s->v.subvol		= 0;
		s->v.tree		= 0;
		s->v.depth		= 0;
		s->v.skip[0]		= 0;
		s->v.skip[1]		= 0;
		s->v.skip[2]		= 0;
	} else {
		s->k.type = KEY_TYPE_deleted;
		set_bkey_val_u64s(&s->k, 0);
	}
err:
	bch2_trans_iter_exit(trans, &tree_iter);
	bch2_trans_iter_exit(trans, &p_iter);
	bch2_trans_iter_exit(trans, &c_iter);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

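/*
 * Allocate @nr_snapids new snapshot nodes, as children of @parent (or as
 * roots of a new tree if @parent is 0).  New IDs are taken from the empty
 * slots just below the lowest existing key, since snapshot IDs are allocated
 * counting down from U32_MAX.
 */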
static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
			  u32 *new_snapids,
			  u32 *snapshot_subvols,
			  unsigned nr_snapids)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_i_snapshot *n;
	struct bkey_s_c k;
	unsigned i, j;
	u32 depth = bch2_snapshot_depth(c, parent);
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
			     POS_MIN, BTREE_ITER_intent);
	k = bch2_btree_iter_peek(trans, &iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	for (i = 0; i < nr_snapids; i++) {
		k = bch2_btree_iter_prev_slot(trans, &iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		if (!k.k || !k.k->p.offset) {
			ret = bch_err_throw(c, ENOSPC_snapshot_create);
			goto err;
		}

		n = bch2_bkey_alloc(trans, &iter, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(n);
		if (ret)
			goto err;

		n->v.flags	= 0;
		n->v.parent	= cpu_to_le32(parent);
		n->v.subvol	= cpu_to_le32(snapshot_subvols[i]);
		n->v.tree	= cpu_to_le32(tree);
		n->v.depth	= cpu_to_le32(depth);
		n->v.btime.lo	= cpu_to_le64(bch2_current_time(c));
		n->v.btime.hi	= 0;

		for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
			n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));

		bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
		SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);

		ret = __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
					 bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
		if (ret)
			goto err;

		new_snapids[i]	= iter.pos.offset;
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Create new snapshot IDs as children of an existing snapshot ID:
 */
static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent,
			      u32 *new_snapids,
			      u32 *snapshot_subvols,
			      unsigned nr_snapids)
{
	struct btree_iter iter;
	struct bkey_i_snapshot *n_parent;
	int ret = 0;

	n_parent = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_snapshots, POS(0, parent),
			0, snapshot);
	ret = PTR_ERR_OR_ZERO(n_parent);
	if (unlikely(ret)) {
		if (bch2_err_matches(ret, ENOENT))
			bch_err(trans->c, "snapshot %u not found", parent);
		return ret;
	}

	if (n_parent->v.children[0] || n_parent->v.children[1]) {
		bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
		ret = -EINVAL;
		goto err;
	}

	ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree),
			     new_snapids, snapshot_subvols, nr_snapids);
	if (ret)
		goto err;

	n_parent->v.children[0] = cpu_to_le32(new_snapids[0]);
	n_parent->v.children[1] = cpu_to_le32(new_snapids[1]);
	n_parent->v.subvol = 0;
	SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Create a snapshot node that is the root of a new tree:
 */
static int bch2_snapshot_node_create_tree(struct btree_trans *trans,
			      u32 *new_snapids,
			      u32 *snapshot_subvols,
			      unsigned nr_snapids)
{
	struct bkey_i_snapshot_tree *n_tree;
	int ret;

	n_tree = __bch2_snapshot_tree_create(trans);
	ret =   PTR_ERR_OR_ZERO(n_tree) ?:
		create_snapids(trans, 0, n_tree->k.p.offset,
			     new_snapids, snapshot_subvols, nr_snapids);
	if (ret)
		return ret;

	n_tree->v.master_subvol	= cpu_to_le32(snapshot_subvols[0]);
	n_tree->v.root_snapshot	= cpu_to_le32(new_snapids[0]);
	return 0;
}

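/*
 * Create new snapshot node(s): with no @parent, a single node that roots a
 * new tree; with a @parent, exactly two children, after which the parent
 * becomes an interior node and its subvolume pointer moves into a child.
 *
 * Illustrative (hypothetical) caller, snapshotting a subvolume:
 *
 *	u32 new_snapids[2];
 *	u32 snapshot_subvols[2] = { orig_subvol, snapshot_subvol };
 *
 *	ret = bch2_snapshot_node_create(trans, parent_snapshot_id,
 *					new_snapids, snapshot_subvols, 2);
 */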
int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
			      u32 *new_snapids,
			      u32 *snapshot_subvols,
			      unsigned nr_snapids)
{
	BUG_ON((parent == 0) != (nr_snapids == 1));
	BUG_ON((parent != 0) != (nr_snapids == 2));

	return parent
		? bch2_snapshot_node_create_children(trans, parent,
				new_snapids, snapshot_subvols, nr_snapids)
		: bch2_snapshot_node_create_tree(trans,
				new_snapids, snapshot_subvols, nr_snapids);
}

/*
 * If we have an unlinked inode in an internal snapshot node, and the inode
 * really has been deleted in all child snapshots, how does this get cleaned up?
 *
 * first there is the problem of how keys that have been overwritten in all
 * child snapshots get deleted (unimplemented?), but inodes may perhaps be
 * special?
 *
 * also: unlinked inode in internal snapshot appears to not be getting deleted
 * correctly if inode doesn't exist in leaf snapshots
 *
 * solution:
 *
 * for a key in an interior snapshot node that needs work to be done that
 * requires it to be mutated: iterate over all descendant leaf nodes and copy
 * that key to snapshot leaf nodes, where we can mutate it
 */

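/*
 * An interior_delete_list maps each interior node being deleted to the live
 * child that its keys will be moved to; return that child, or 0 if @id isn't
 * in the list.
 */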
static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id)
{
	struct snapshot_interior_delete *i = darray_find_p(*l, i, i->id == id);
	return i ? i->live_child : 0;
}

static unsigned __live_child(struct snapshot_table *t, u32 id,
			     snapshot_id_list *delete_leaves,
			     interior_delete_list *delete_interior)
{
	struct snapshot_t *s = __snapshot_t(t, id);
	if (!s)
		return 0;

	for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++)
		if (s->children[i] &&
		    !snapshot_list_has_id(delete_leaves, s->children[i]) &&
		    !interior_delete_has_id(delete_interior, s->children[i]))
			return s->children[i];

	for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) {
		u32 live_child = s->children[i]
			? __live_child(t, s->children[i], delete_leaves, delete_interior)
			: 0;
		if (live_child)
			return live_child;
	}

	return 0;
}

static unsigned live_child(struct bch_fs *c, u32 id)
{
	struct snapshot_delete *d = &c->snapshot_delete;

	guard(rcu)();
	return __live_child(rcu_dereference(c->snapshots), id,
			    &d->delete_leaves, &d->delete_interior);
}

static bool snapshot_id_dying(struct snapshot_delete *d, unsigned id)
{
	return snapshot_list_has_id(&d->delete_leaves, id) ||
		interior_delete_has_id(&d->delete_interior, id) != 0;
}

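/*
 * Process one key for snapshot deletion: keys in dying leaf snapshots are
 * deleted outright; keys in dying interior nodes are moved to the node's
 * live child, unless the child already has its own version of the key.
 */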
static int delete_dead_snapshots_process_key(struct btree_trans *trans,
					     struct btree_iter *iter,
					     struct bkey_s_c k)
{
	struct snapshot_delete *d = &trans->c->snapshot_delete;

	if (snapshot_list_has_id(&d->delete_leaves, k.k->p.snapshot))
		return bch2_btree_delete_at(trans, iter,
					    BTREE_UPDATE_internal_snapshot_node);

	u32 live_child = interior_delete_has_id(&d->delete_interior, k.k->p.snapshot);
	if (live_child) {
		struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
		int ret = PTR_ERR_OR_ZERO(new);
		if (ret)
			return ret;

		new->k.p.snapshot = live_child;

		struct btree_iter dst_iter;
		struct bkey_s_c dst_k = bch2_bkey_get_iter(trans, &dst_iter,
							   iter->btree_id, new->k.p,
							   BTREE_ITER_all_snapshots|
							   BTREE_ITER_intent);
		ret = bkey_err(dst_k);
		if (ret)
			return ret;

		ret =   (bkey_deleted(dst_k.k)
			 ? bch2_trans_update(trans, &dst_iter, new,
					     BTREE_UPDATE_internal_snapshot_node)
			 : 0) ?:
			bch2_btree_delete_at(trans, iter,
					     BTREE_UPDATE_internal_snapshot_node);
		bch2_trans_iter_exit(trans, &dst_iter);
		return ret;
	}

	return 0;
}

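/*
 * Deletion only needs to visit snapshot trees that have nodes being deleted:
 * when we hit a new inode belonging to an unrelated tree, skip the iterator
 * past all of that inode's keys and return true.
 */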
static bool skip_unrelated_snapshot_tree(struct btree_trans *trans, struct btree_iter *iter, u64 *prev_inum)
{
	struct bch_fs *c = trans->c;
	struct snapshot_delete *d = &c->snapshot_delete;

	u64 inum = iter->btree_id != BTREE_ID_inodes
		? iter->pos.inode
		: iter->pos.offset;

	if (*prev_inum == inum)
		return false;

	*prev_inum = inum;

	bool ret = !snapshot_list_has_id(&d->deleting_from_trees,
					 bch2_snapshot_tree(c, iter->pos.snapshot));
	if (unlikely(ret)) {
		struct bpos pos = iter->pos;
		pos.snapshot = 0;
		if (iter->btree_id != BTREE_ID_inodes)
			pos.offset = U64_MAX;
		bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(pos));
	}

	return ret;
}

static int delete_dead_snapshot_keys_v1(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct snapshot_delete *d = &c->snapshot_delete;

	for (d->pos.btree = 0; d->pos.btree < BTREE_ID_NR; d->pos.btree++) {
		struct disk_reservation res = { 0 };
		u64 prev_inum = 0;

		d->pos.pos = POS_MIN;

		if (!btree_type_has_snapshots(d->pos.btree))
			continue;

		int ret = for_each_btree_key_commit(trans, iter,
				d->pos.btree, POS_MIN,
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				&res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
			d->pos.pos = iter.pos;

			if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
				continue;

			delete_dead_snapshots_process_key(trans, &iter, k);
		}));

		bch2_disk_reservation_put(c, &res);

		if (ret)
			return ret;
	}

	return 0;
}

static int delete_dead_snapshot_keys_range(struct btree_trans *trans, enum btree_id btree,
					   struct bpos start, struct bpos end)
{
	struct bch_fs *c = trans->c;
	struct snapshot_delete *d = &c->snapshot_delete;
	struct disk_reservation res = { 0 };

	d->pos.btree	= btree;
	d->pos.pos	= POS_MIN;

	int ret = for_each_btree_key_max_commit(trans, iter,
			btree, start, end,
			BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
			&res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
		d->pos.pos = iter.pos;
		delete_dead_snapshots_process_key(trans, &iter, k);
	}));

	bch2_disk_reservation_put(c, &res);
	return ret;
}

static int delete_dead_snapshot_keys_v2(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct snapshot_delete *d = &c->snapshot_delete;
	struct disk_reservation res = { 0 };
	u64 prev_inum = 0;
	int ret = 0;

	struct btree_iter iter;
	bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, POS_MIN,
			     BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);

	while (1) {
		struct bkey_s_c k;
		ret = lockrestart_do(trans,
				bkey_err(k = bch2_btree_iter_peek(trans, &iter)));
		if (ret)
			break;

		if (!k.k)
			break;

		d->pos.btree	= iter.btree_id;
		d->pos.pos	= iter.pos;

		if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
			continue;

		if (snapshot_id_dying(d, k.k->p.snapshot)) {
			struct bpos start	= POS(k.k->p.offset, 0);
			struct bpos end		= POS(k.k->p.offset, U64_MAX);

			ret   = delete_dead_snapshot_keys_range(trans, BTREE_ID_extents, start, end) ?:
				delete_dead_snapshot_keys_range(trans, BTREE_ID_dirents, start, end) ?:
				delete_dead_snapshot_keys_range(trans, BTREE_ID_xattrs, start, end);
			if (ret)
				break;

			bch2_btree_iter_set_pos(trans, &iter, POS(0, k.k->p.offset + 1));
		} else {
			bch2_btree_iter_advance(trans, &iter);
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		goto err;

	prev_inum = 0;
	ret = for_each_btree_key_commit(trans, iter,
			BTREE_ID_inodes, POS_MIN,
			BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
			&res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
		d->pos.btree	= iter.btree_id;
		d->pos.pos	= iter.pos;

		if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
			continue;

		delete_dead_snapshots_process_key(trans, &iter, k);
	}));
err:
	bch2_disk_reservation_put(c, &res);
	return ret;
}

1674 /*
1675  * For a given snapshot, if it doesn't have a subvolume that points to it, and
1676  * it doesn't have child snapshot nodes - it's now redundant and we can mark it
1677  * as deleted.
1678  */
check_should_delete_snapshot(struct btree_trans * trans,struct bkey_s_c k)1679 static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k)
1680 {
1681 	if (k.k->type != KEY_TYPE_snapshot)
1682 		return 0;
1683 
1684 	struct bch_fs *c = trans->c;
1685 	struct snapshot_delete *d = &c->snapshot_delete;
1686 	struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
1687 	unsigned live_children = 0;
1688 	int ret = 0;
1689 
1690 	if (BCH_SNAPSHOT_SUBVOL(s.v))
1691 		return 0;
1692 
1693 	if (BCH_SNAPSHOT_DELETED(s.v))
1694 		return 0;
1695 
1696 	mutex_lock(&d->progress_lock);
1697 	for (unsigned i = 0; i < 2; i++) {
1698 		u32 child = le32_to_cpu(s.v->children[i]);
1699 
1700 		live_children += child &&
1701 			!snapshot_list_has_id(&d->delete_leaves, child);
1702 	}
1703 
1704 	u32 tree = bch2_snapshot_tree(c, s.k->p.offset);
1705 
1706 	if (live_children == 0) {
1707 		ret =   snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?:
1708 			snapshot_list_add(c, &d->delete_leaves, s.k->p.offset);
1709 	} else if (live_children == 1) {
1710 		struct snapshot_interior_delete n = {
1711 			.id		= s.k->p.offset,
1712 			.live_child	= live_child(c, s.k->p.offset),
1713 		};
1714 
1715 		if (!n.live_child) {
1716 			bch_err(c, "error finding live child of snapshot %u", n.id);
1717 			ret = -EINVAL;
1718 		} else {
1719 			ret =   snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?:
1720 				darray_push(&d->delete_interior, n);
1721 		}
1722 	}
1723 	mutex_unlock(&d->progress_lock);
1724 
1725 	return ret;
1726 }
1727 
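/*
 * Like bch2_snapshot_nth_parent(), but interior nodes on the to-delete list
 * are skipped, both at the starting id and at every step up the tree.
 *
 * Hypothetical example: with parent chain 5 -> 4 -> 3 -> 2 and skip = {4},
 * the 1st parent of 5 is 3.
 */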
1728 static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
1729 						interior_delete_list *skip)
1730 {
1731 	guard(rcu)();
1732 	while (interior_delete_has_id(skip, id))
1733 		id = __bch2_snapshot_parent(c, id);
1734 
1735 	while (n--) {
1736 		do {
1737 			id = __bch2_snapshot_parent(c, id);
1738 		} while (interior_delete_has_id(skip, id));
1739 	}
1740 
1741 	return id;
1742 }
1743 
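/*
 * When interior nodes are removed, each surviving descendant needs its depth
 * decremented by the number of ancestors being deleted, and any skip-list
 * entry that points at a deleted node re-pointed at a surviving ancestor -
 * picked at random, presumably to keep the skip lists balanced:
 */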
1744 static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
1745 					      struct btree_iter *iter, struct bkey_s_c k,
1746 					      interior_delete_list *deleted)
1747 {
1748 	struct bch_fs *c = trans->c;
1749 	u32 nr_deleted_ancestors = 0;
1750 	struct bkey_i_snapshot *s;
1751 	int ret;
1752 
1753 	if (!bch2_snapshot_exists(c, k.k->p.offset))
1754 		return 0;
1755 
1756 	if (k.k->type != KEY_TYPE_snapshot)
1757 		return 0;
1758 
1759 	if (interior_delete_has_id(deleted, k.k->p.offset))
1760 		return 0;
1761 
1762 	s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot);
1763 	ret = PTR_ERR_OR_ZERO(s);
1764 	if (ret)
1765 		return ret;
1766 
1767 	darray_for_each(*deleted, i)
1768 		nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, i->id);
1769 
1770 	if (!nr_deleted_ancestors)
1771 		return 0;
1772 
1773 	le32_add_cpu(&s->v.depth, -nr_deleted_ancestors);
1774 
1775 	if (!s->v.depth) {
1776 		s->v.skip[0] = 0;
1777 		s->v.skip[1] = 0;
1778 		s->v.skip[2] = 0;
1779 	} else {
1780 		u32 depth = le32_to_cpu(s->v.depth);
1781 		u32 parent = bch2_snapshot_parent(c, s->k.p.offset);
1782 
1783 		for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) {
1784 			u32 id = le32_to_cpu(s->v.skip[j]);
1785 
1786 			if (interior_delete_has_id(deleted, id)) {
1787 				id = bch2_snapshot_nth_parent_skip(c,
1788 							parent,
1789 							depth > 1
1790 							? get_random_u32_below(depth - 1)
1791 							: 0,
1792 							deleted);
1793 				s->v.skip[j] = cpu_to_le32(id);
1794 			}
1795 		}
1796 
1797 		bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32);
1798 	}
1799 
1800 	return bch2_trans_update(trans, iter, &s->k_i, 0);
1801 }
1802 
1803 static void bch2_snapshot_delete_nodes_to_text(struct printbuf *out, struct snapshot_delete *d)
1804 {
1805 	prt_printf(out, "deleting from trees");
1806 	darray_for_each(d->deleting_from_trees, i)
1807 		prt_printf(out, " %u", *i);
	prt_newline(out);
1808 
1809 	prt_printf(out, "deleting leaves");
1810 	darray_for_each(d->delete_leaves, i)
1811 		prt_printf(out, " %u", *i);
1812 	prt_newline(out);
1813 
1814 	prt_printf(out, "interior");
1815 	darray_for_each(d->delete_interior, i)
1816 		prt_printf(out, " %u->%u", i->id, i->live_child);
1817 	prt_newline(out);
1818 }
1819 
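/*
 * Top-level snapshot deletion, in broad strokes: (1) walk the snapshots btree
 * to find dead leaves and redundant interior nodes, (2) log what we're about
 * to do, (3) delete or move keys in dying snapshots, (4) delete the leaf
 * nodes, (5) fix depth/skip fields in surviving children, (6) delete the
 * interior nodes.
 */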
1820 int __bch2_delete_dead_snapshots(struct bch_fs *c)
1821 {
1822 	struct snapshot_delete *d = &c->snapshot_delete;
1823 	int ret = 0;
1824 
1825 	if (!mutex_trylock(&d->lock))
1826 		return 0;
1827 
1828 	if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
1829 		goto out_unlock;
1830 
1831 	struct btree_trans *trans = bch2_trans_get(c);
1832 
1833 	/*
1834 	 * For every snapshot node: if it has no live children and it's not
1835 	 * pointed to by a subvolume, delete it:
1836 	 */
1837 	d->running = true;
1838 	d->pos = BBPOS_MIN;
1839 
1840 	ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k,
1841 		check_should_delete_snapshot(trans, k));
1842 	if (!bch2_err_matches(ret, EROFS))
1843 		bch_err_msg(c, ret, "walking snapshots");
1844 	if (ret)
1845 		goto err;
1846 
1847 	if (!d->delete_leaves.nr && !d->delete_interior.nr)
1848 		goto err;
1849 
1850 	{
1851 		struct printbuf buf = PRINTBUF;
1852 		bch2_snapshot_delete_nodes_to_text(&buf, d);
1853 
1854 		ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf));
1855 		printbuf_exit(&buf);
1856 		if (ret)
1857 			goto err;
1858 	}
1859 
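	/*
	 * Use the v2 key deletion path if the on-disk incompat feature can be
	 * enabled, otherwise fall back to v1:
	 */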
1860 	ret = !bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)
1861 		? delete_dead_snapshot_keys_v2(trans)
1862 		: delete_dead_snapshot_keys_v1(trans);
1863 	if (!bch2_err_matches(ret, EROFS))
1864 		bch_err_msg(c, ret, "deleting keys from dying snapshots");
1865 	if (ret)
1866 		goto err;
1867 
1868 	darray_for_each(d->delete_leaves, i) {
1869 		ret = commit_do(trans, NULL, NULL, 0,
1870 			bch2_snapshot_node_delete(trans, *i));
1871 		if (!bch2_err_matches(ret, EROFS))
1872 			bch_err_msg(c, ret, "deleting snapshot %u", *i);
1873 		if (ret)
1874 			goto err;
1875 	}
1876 
1877 	/*
1878 	 * Fixing children of deleted snapshots can't be done completely
1879 	 * atomically: if we crash between here and when we delete the interior
1880 	 * nodes, some depth fields will be off:
1881 	 */
1882 	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
1883 				  BTREE_ITER_intent, k,
1884 				  NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1885 		bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &d->delete_interior));
1886 	if (ret)
1887 		goto err;
1888 
1889 	darray_for_each(d->delete_interior, i) {
1890 		ret = commit_do(trans, NULL, NULL, 0,
1891 			bch2_snapshot_node_delete(trans, i->id));
1892 		if (!bch2_err_matches(ret, EROFS))
1893 			bch_err_msg(c, ret, "deleting snapshot %u", i->id);
1894 		if (ret)
1895 			goto err;
1896 	}
1897 err:
1898 	mutex_lock(&d->progress_lock);
1899 	darray_exit(&d->deleting_from_trees);
1900 	darray_exit(&d->delete_interior);
1901 	darray_exit(&d->delete_leaves);
1902 	d->running = false;
1903 	mutex_unlock(&d->progress_lock);
1904 	bch2_trans_put(trans);
1905 
1906 	bch2_recovery_pass_set_no_ratelimit(c, BCH_RECOVERY_PASS_check_snapshots);
1907 out_unlock:
1908 	mutex_unlock(&d->lock);
1909 	if (!bch2_err_matches(ret, EROFS))
1910 		bch_err_fn(c, ret);
1911 	return ret;
1912 }
1913 
1914 int bch2_delete_dead_snapshots(struct bch_fs *c)
1915 {
1916 	if (!c->opts.auto_snapshot_deletion)
1917 		return 0;
1918 
1919 	return __bch2_delete_dead_snapshots(c);
1920 }
1921 
1922 void bch2_delete_dead_snapshots_work(struct work_struct *work)
1923 {
1924 	struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete.work);
1925 
1926 	set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name);
1927 
1928 	bch2_delete_dead_snapshots(c);
1929 	enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots);
1930 }
1931 
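/*
 * Queue snapshot deletion in the background, holding a write ref so the
 * filesystem can't go RO underneath the work item; the ref is dropped here if
 * the work was already queued, otherwise by the work function.
 *
 * Hypothetical usage sketch - a caller that just made a snapshot node
 * deletable would do something like:
 *
 *	set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
 *	bch2_delete_dead_snapshots_async(c);
 */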
1932 void bch2_delete_dead_snapshots_async(struct bch_fs *c)
1933 {
1934 	if (!c->opts.auto_snapshot_deletion)
1935 		return;
1936 
1937 	if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_delete_dead_snapshots))
1938 		return;
1939 
1940 	BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));
1941 
1942 	if (!queue_work(system_long_wq, &c->snapshot_delete.work))
1943 		enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots);
1944 }
1945 
1946 void bch2_snapshot_delete_status_to_text(struct printbuf *out, struct bch_fs *c)
1947 {
1948 	struct snapshot_delete *d = &c->snapshot_delete;
1949 
1950 	if (!d->running) {
1951 		prt_str(out, "(not running)");
1952 		return;
1953 	}
1954 
1955 	mutex_lock(&d->progress_lock);
1956 	bch2_snapshot_delete_nodes_to_text(out, d);
1957 
1958 	bch2_bbpos_to_text(out, d->pos);
1959 	mutex_unlock(&d->progress_lock);
1960 }
1961 
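/*
 * Is the key at @pos overwritten in a descendant snapshot? We scan backwards
 * from bpos_predecessor(@pos) - descendants have smaller snapshot IDs - and
 * return 1 if a key at the same inode:offset belongs to a snapshot that
 * @pos.snapshot is an ancestor of.
 */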
1962 int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
1963 				       enum btree_id id,
1964 				       struct bpos pos)
1965 {
1966 	struct bch_fs *c = trans->c;
1967 	struct btree_iter iter;
1968 	struct bkey_s_c k;
1969 	int ret;
1970 
1971 	for_each_btree_key_reverse_norestart(trans, iter, id, bpos_predecessor(pos),
1972 					     BTREE_ITER_not_extents|
1973 					     BTREE_ITER_all_snapshots,
1974 					     k, ret) {
1975 		if (!bkey_eq(pos, k.k->p))
1976 			break;
1977 
1978 		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
1979 			ret = 1;
1980 			break;
1981 		}
1982 	}
1983 	bch2_trans_iter_exit(trans, &iter);
1984 
1985 	return ret;
1986 }
1987 
1988 static bool interior_snapshot_needs_delete(struct bkey_s_c_snapshot snap)
1989 {
1990 	/* If there's one child, it's redundant and keys will be moved to the child */
1991 	return !!snap.v->children[0] + !!snap.v->children[1] == 1;
1992 }
1993 
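/*
 * Called when reading in the snapshots btree: flag the filesystem for dead
 * snapshot deletion if a node was already marked for deletion or is a
 * redundant interior node:
 */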
1994 static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
1995 {
1996 	if (k.k->type != KEY_TYPE_snapshot)
1997 		return 0;
1998 
1999 	struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k);
2000 	if (BCH_SNAPSHOT_WILL_DELETE(snap.v) ||
2001 	    interior_snapshot_needs_delete(snap))
2002 		set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags);
2003 
2004 	return 0;
2005 }
2006 
2007 int bch2_snapshots_read(struct bch_fs *c)
2008 {
2009 	/*
2010 	 * Initializing the is_ancestor bitmaps requires ancestors to already be
2011 	 * initialized - so mark in reverse:
2012 	 */
2013 	int ret = bch2_trans_run(c,
2014 		for_each_btree_key_reverse(trans, iter, BTREE_ID_snapshots,
2015 				   POS_MAX, 0, k,
2016 			__bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
2017 			bch2_check_snapshot_needs_deletion(trans, k)));
2018 	bch_err_fn(c, ret);
2019 
2020 	/*
2021 	 * It's important that we check if we need to reconstruct snapshots
2022 	 * before going RW, so we mark that pass as required in the superblock -
2023 	 * otherwise, we could end up deleting keys with missing snapshot nodes
2024 	 * instead of reconstructing them.
2025 	 */
2026 	BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) &&
2027 	       test_bit(BCH_FS_may_go_rw, &c->flags));
2028 
2029 	return ret;
2030 }
2031 
2032 void bch2_fs_snapshots_exit(struct bch_fs *c)
2033 {
2034 	kvfree(rcu_dereference_protected(c->snapshots, true));
2035 }
2036 
2037 void bch2_fs_snapshots_init_early(struct bch_fs *c)
2038 {
2039 	INIT_WORK(&c->snapshot_delete.work, bch2_delete_dead_snapshots_work);
2040 	mutex_init(&c->snapshot_delete.lock);
2041 	mutex_init(&c->snapshot_delete.progress_lock);
2042 	mutex_init(&c->snapshots_unlinked_lock);
2043 }
2044