// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bbpos.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_key_cache.h"
#include "btree_update.h"
#include "buckets.h"
#include "enumerated_ref.h"
#include "errcode.h"
#include "error.h"
#include "fs.h"
#include "recovery_passes.h"
#include "snapshot.h"

#include <linux/random.h>

/*
 * Snapshot trees:
 *
 * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they
 * exist to provide a stable identifier for the whole lifetime of a snapshot
 * tree.
 */

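/*
 * A minimal usage sketch (illustrative only; @trans, @tree_id and @root are
 * caller context, not part of this file): a tree ID taken from a snapshot
 * key can be resolved to the tree's stable root with
 * bch2_snapshot_tree_lookup(), defined below:
 *
 *	struct bch_snapshot_tree s_t;
 *	int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
 *	if (!ret)
 *		root = le32_to_cpu(s_t.root_snapshot);
 */
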
void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
				struct bkey_s_c k)
{
	struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k);

	prt_printf(out, "subvol %u root snapshot %u",
		   le32_to_cpu(t.v->master_subvol),
		   le32_to_cpu(t.v->root_snapshot));
}

int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k,
				struct bkey_validate_context from)
{
	int ret = 0;

	bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
			 bkey_lt(k.k->p, POS(0, 1)),
			 c, snapshot_tree_pos_bad,
			 "bad pos");
fsck_err:
	return ret;
}

int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
			      struct bch_snapshot_tree *s)
{
	int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
					  BTREE_ITER_with_updates, snapshot_tree, s);

	if (bch2_err_matches(ret, ENOENT))
		ret = bch_err_throw(trans->c, ENOENT_snapshot_tree);
	return ret;
}

struct bkey_i_snapshot_tree *
__bch2_snapshot_tree_create(struct btree_trans *trans)
{
	struct btree_iter iter;
	int ret = bch2_bkey_get_empty_slot(trans, &iter,
					   BTREE_ID_snapshot_trees, POS(0, U32_MAX));
	struct bkey_i_snapshot_tree *s_t;

	if (ret == -BCH_ERR_ENOSPC_btree_slot)
		ret = bch_err_throw(trans->c, ENOSPC_snapshot_tree);
	if (ret)
		return ERR_PTR(ret);

	s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree);
	ret = PTR_ERR_OR_ZERO(s_t);
	bch2_trans_iter_exit(trans, &iter);
	return ret ? ERR_PTR(ret) : s_t;
}

static int bch2_snapshot_tree_create(struct btree_trans *trans,
				     u32 root_id, u32 subvol_id, u32 *tree_id)
{
	struct bkey_i_snapshot_tree *n_tree =
		__bch2_snapshot_tree_create(trans);

	if (IS_ERR(n_tree))
		return PTR_ERR(n_tree);

	n_tree->v.master_subvol = cpu_to_le32(subvol_id);
	n_tree->v.root_snapshot = cpu_to_le32(root_id);
	*tree_id = n_tree->k.p.offset;
	return 0;
}

/* Snapshot nodes: */

static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor)
{
	while (id && id < ancestor) {
		const struct snapshot_t *s = __snapshot_t(t, id);
		id = s ? s->parent : 0;
	}
	return id == ancestor;
}

static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
{
	guard(rcu)();
	return __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
}

static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
{
	const struct snapshot_t *s = __snapshot_t(t, id);
	if (!s)
		return 0;

	if (s->skip[2] <= ancestor)
		return s->skip[2];
	if (s->skip[1] <= ancestor)
		return s->skip[1];
	if (s->skip[0] <= ancestor)
		return s->skip[0];
	return s->parent;
}

static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor)
{
	const struct snapshot_t *s = __snapshot_t(t, id);
	if (!s)
		return false;

	return test_bit(ancestor - id - 1, s->is_ancestor);
}

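/*
 * Ancestor queries are accelerated two ways (summary added for clarity,
 * restating the helpers above):
 *
 * - each node's in-memory snapshot_t carries an is_ancestor bitmap covering
 *   the next IS_ANCESTOR_BITMAP ids above it, so nearby ancestors are a
 *   single test_bit();
 * - for more distant ancestors, skip[0..2] point at progressively older
 *   ancestors, so get_ancestor_below() can approach @ancestor in large
 *   steps before the bitmap takes over.
 */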
bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	u32 orig_id = id;
#endif

	guard(rcu)();
	struct snapshot_table *t = rcu_dereference(c->snapshots);

	if (unlikely(c->recovery.pass_done < BCH_RECOVERY_PASS_check_snapshots))
		return __bch2_snapshot_is_ancestor_early(t, id, ancestor);

	if (likely(ancestor >= IS_ANCESTOR_BITMAP))
		while (id && id < ancestor - IS_ANCESTOR_BITMAP)
			id = get_ancestor_below(t, id, ancestor);

	bool ret = id && id < ancestor
		? test_ancestor_bitmap(t, id, ancestor)
		: id == ancestor;

	EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, orig_id, ancestor));
	return ret;
}

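/*
 * Note (added for clarity): snapshot IDs are allocated counting down from
 * U32_MAX, so the table is indexed by U32_MAX - id. Growing it means
 * allocating a larger array, copying the old contents, and publishing the
 * new table via RCU so that lockless readers never see a stale pointer.
 */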
static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
{
	size_t idx = U32_MAX - id;
	struct snapshot_table *new, *old;

	size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1));
	size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]);

	if (unlikely(new_bytes > INT_MAX))
		return NULL;

	new = kvzalloc(new_bytes, GFP_KERNEL);
	if (!new)
		return NULL;

	new->nr = new_size;

	old = rcu_dereference_protected(c->snapshots, true);
	if (old)
		memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr);

	rcu_assign_pointer(c->snapshots, new);
	kvfree_rcu(old, rcu);

	return &rcu_dereference_protected(c->snapshots,
				lockdep_is_held(&c->snapshot_table_lock))->s[idx];
}

static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
{
	size_t idx = U32_MAX - id;
	struct snapshot_table *table =
		rcu_dereference_protected(c->snapshots,
				lockdep_is_held(&c->snapshot_table_lock));

	lockdep_assert_held(&c->snapshot_table_lock);

	if (likely(table && idx < table->nr))
		return &table->s[idx];

	return __snapshot_t_mut(c, id);
}

void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
			   struct bkey_s_c k)
{
	struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);

	if (BCH_SNAPSHOT_SUBVOL(s.v))
		prt_str(out, "subvol ");
	if (BCH_SNAPSHOT_WILL_DELETE(s.v))
		prt_str(out, "will_delete ");
	if (BCH_SNAPSHOT_DELETED(s.v))
		prt_str(out, "deleted ");

	prt_printf(out, "parent %10u children %10u %10u subvol %u tree %u",
		   le32_to_cpu(s.v->parent),
		   le32_to_cpu(s.v->children[0]),
		   le32_to_cpu(s.v->children[1]),
		   le32_to_cpu(s.v->subvol),
		   le32_to_cpu(s.v->tree));

	if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth))
		prt_printf(out, " depth %u skiplist %u %u %u",
			   le32_to_cpu(s.v->depth),
			   le32_to_cpu(s.v->skip[0]),
			   le32_to_cpu(s.v->skip[1]),
			   le32_to_cpu(s.v->skip[2]));
}

int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k,
			   struct bkey_validate_context from)
{
	struct bkey_s_c_snapshot s;
	u32 i, id;
	int ret = 0;

	bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
			 bkey_lt(k.k->p, POS(0, 1)),
			 c, snapshot_pos_bad,
			 "bad pos");

	s = bkey_s_c_to_snapshot(k);

	id = le32_to_cpu(s.v->parent);
	bkey_fsck_err_on(id && id <= k.k->p.offset,
			 c, snapshot_parent_bad,
			 "bad parent node (%u <= %llu)",
			 id, k.k->p.offset);

	bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]),
			 c, snapshot_children_not_normalized,
			 "children not normalized");

	bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1],
			 c, snapshot_child_duplicate,
			 "duplicate child nodes");

	for (i = 0; i < 2; i++) {
		id = le32_to_cpu(s.v->children[i]);

		bkey_fsck_err_on(id >= k.k->p.offset,
				 c, snapshot_child_bad,
				 "bad child node (%u >= %llu)",
				 id, k.k->p.offset);
	}

	if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
		bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
				 le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]),
				 c, snapshot_skiplist_not_normalized,
				 "skiplist not normalized");

		for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
			id = le32_to_cpu(s.v->skip[i]);

			bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent),
					 c, snapshot_skiplist_bad,
					 "bad skiplist node %u", id);
		}
	}
fsck_err:
	return ret;
}

static int bch2_snapshot_table_make_room(struct bch_fs *c, u32 id)
{
	mutex_lock(&c->snapshot_table_lock);
	int ret = snapshot_t_mut(c, id)
		? 0
		: bch_err_throw(c, ENOMEM_mark_snapshot);
	mutex_unlock(&c->snapshot_table_lock);
	return ret;
}

static int __bch2_mark_snapshot(struct btree_trans *trans,
				enum btree_id btree, unsigned level,
				struct bkey_s_c old, struct bkey_s_c new,
				enum btree_iter_update_trigger_flags flags)
{
	struct bch_fs *c = trans->c;
	struct snapshot_t *t;
	u32 id = new.k->p.offset;
	int ret = 0;

	mutex_lock(&c->snapshot_table_lock);

	t = snapshot_t_mut(c, id);
	if (!t) {
		ret = bch_err_throw(c, ENOMEM_mark_snapshot);
		goto err;
	}

	if (new.k->type == KEY_TYPE_snapshot) {
		struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);

		t->state = !BCH_SNAPSHOT_DELETED(s.v)
			? SNAPSHOT_ID_live
			: SNAPSHOT_ID_deleted;
		t->parent = le32_to_cpu(s.v->parent);
		t->children[0] = le32_to_cpu(s.v->children[0]);
		t->children[1] = le32_to_cpu(s.v->children[1]);
		t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
		t->tree = le32_to_cpu(s.v->tree);

		if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) {
			t->depth = le32_to_cpu(s.v->depth);
			t->skip[0] = le32_to_cpu(s.v->skip[0]);
			t->skip[1] = le32_to_cpu(s.v->skip[1]);
			t->skip[2] = le32_to_cpu(s.v->skip[2]);
		} else {
			t->depth = 0;
			t->skip[0] = 0;
			t->skip[1] = 0;
			t->skip[2] = 0;
		}

		u32 parent = id;

		while ((parent = bch2_snapshot_parent_early(c, parent)) &&
		       parent - id - 1 < IS_ANCESTOR_BITMAP)
			__set_bit(parent - id - 1, t->is_ancestor);

		if (BCH_SNAPSHOT_WILL_DELETE(s.v)) {
			set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
			if (c->recovery.pass_done > BCH_RECOVERY_PASS_delete_dead_snapshots)
				bch2_delete_dead_snapshots_async(c);
		}
	} else {
		memset(t, 0, sizeof(*t));
	}
err:
	mutex_unlock(&c->snapshot_table_lock);
	return ret;
}

int bch2_mark_snapshot(struct btree_trans *trans,
		       enum btree_id btree, unsigned level,
		       struct bkey_s_c old, struct bkey_s new,
		       enum btree_iter_update_trigger_flags flags)
{
	return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags);
}

int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
			 struct bch_snapshot *s)
{
	return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
				       BTREE_ITER_with_updates, snapshot, s);
}

/* fsck: */

static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
{
	return snapshot_t(c, id)->children[child];
}

static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id)
{
	return bch2_snapshot_child(c, id, 0);
}

static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id)
{
	return bch2_snapshot_child(c, id, 1);
}

static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
{
	u32 n, parent;

	n = bch2_snapshot_left_child(c, id);
	if (n)
		return n;

	while ((parent = bch2_snapshot_parent(c, id))) {
		n = bch2_snapshot_right_child(c, parent);
		if (n && n != id)
			return n;
		id = parent;
	}

	return 0;
}

u32 bch2_snapshot_oldest_subvol(struct bch_fs *c, u32 snapshot_root,
				snapshot_id_list *skip)
{
	guard(rcu)();
	u32 id, subvol = 0, s;
retry:
	id = snapshot_root;
	while (id && bch2_snapshot_exists(c, id)) {
		if (!(skip && snapshot_list_has_id(skip, id))) {
			s = snapshot_t(c, id)->subvol;

			if (s && (!subvol || s < subvol))
				subvol = s;
		}
		id = bch2_snapshot_tree_next(c, id);
		if (id == snapshot_root)
			break;
	}

	if (!subvol && skip) {
		skip = NULL;
		goto retry;
	}

	return subvol;
}

static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
					    u32 snapshot_root, u32 *subvol_id)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	bool found = false;
	int ret;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
				     0, k, ret) {
		if (k.k->type != KEY_TYPE_subvolume)
			continue;

		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
		if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
			continue;
		if (!BCH_SUBVOLUME_SNAP(s.v)) {
			*subvol_id = s.k->p.offset;
			found = true;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	if (!ret && !found) {
		struct bkey_i_subvolume *u;

		*subvol_id = bch2_snapshot_oldest_subvol(c, snapshot_root, NULL);

		u = bch2_bkey_get_mut_typed(trans, &iter,
					    BTREE_ID_subvolumes, POS(0, *subvol_id),
					    0, subvolume);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			return ret;

		SET_BCH_SUBVOLUME_SNAP(&u->v, false);
	}

	return ret;
}

static int check_snapshot_tree(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c_snapshot_tree st;
	struct bch_snapshot s;
	struct bch_subvolume subvol;
	struct printbuf buf = PRINTBUF;
	struct btree_iter snapshot_iter = {};
	u32 root_id;
	int ret;

	if (k.k->type != KEY_TYPE_snapshot_tree)
		return 0;

	st = bkey_s_c_to_snapshot_tree(k);
	root_id = le32_to_cpu(st.v->root_snapshot);

	struct bkey_s_c_snapshot snapshot_k =
		bch2_bkey_get_iter_typed(trans, &snapshot_iter, BTREE_ID_snapshots,
					 POS(0, root_id), 0, snapshot);
	ret = bkey_err(snapshot_k);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	if (!ret)
		bkey_val_copy(&s, snapshot_k);

	if (fsck_err_on(ret ||
			root_id != bch2_snapshot_root(c, root_id) ||
			st.k->p.offset != le32_to_cpu(s.tree),
			trans, snapshot_tree_to_missing_snapshot,
			"snapshot tree points to missing/incorrect snapshot:\n%s",
			(bch2_bkey_val_to_text(&buf, c, st.s_c),
			 prt_newline(&buf),
			 ret
			 ? prt_printf(&buf, "(%s)", bch2_err_str(ret))
			 : bch2_bkey_val_to_text(&buf, c, snapshot_k.s_c),
			 buf.buf))) {
		ret = bch2_btree_delete_at(trans, iter, 0);
		goto err;
	}

	if (!st.v->master_subvol)
		goto out;

	ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), false, &subvol);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	if (fsck_err_on(ret,
			trans, snapshot_tree_to_missing_subvol,
			"snapshot tree points to missing subvolume:\n%s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
	    fsck_err_on(!bch2_snapshot_is_ancestor(c,
						   le32_to_cpu(subvol.snapshot),
						   root_id),
			trans, snapshot_tree_to_wrong_subvol,
			"snapshot tree points to subvolume that does not point to snapshot in this tree:\n%s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
	    fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol),
			trans, snapshot_tree_to_snapshot_subvol,
			"snapshot tree points to snapshot subvolume:\n%s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
		struct bkey_i_snapshot_tree *u;
		u32 subvol_id;

		ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
		bch_err_fn(c, ret);

		if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */
			ret = 0;
			goto err;
		}

		if (ret)
			goto err;

		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.master_subvol = cpu_to_le32(subvol_id);
		st = snapshot_tree_i_to_s_c(u);
	}
out:
err:
fsck_err:
	bch2_trans_iter_exit(trans, &snapshot_iter);
	printbuf_exit(&buf);
	return ret;
}

/*
 * For each snapshot_tree, make sure it points to the root of a snapshot tree
 * and that the snapshot entry points back to it, or delete it.
 *
 * Also make sure it points to a subvolume within that snapshot tree, or
 * correct it to point to the oldest subvolume within that snapshot tree.
 */
int bch2_check_snapshot_trees(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
			BTREE_ID_snapshot_trees, POS_MIN,
			BTREE_ITER_prefetch, k,
			NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		check_snapshot_tree(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Look up the snapshot tree for @tree_id, find its root, and make sure
 * @snap_id is a descendant:
 */
static int snapshot_tree_ptr_good(struct btree_trans *trans,
				  u32 snap_id, u32 tree_id)
{
	struct bch_snapshot_tree s_t;
	int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);

	if (bch2_err_matches(ret, ENOENT))
		return 0;
	if (ret)
		return ret;

	return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot));
}

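/*
 * Pick a skiplist target for a new node under @id (note added for clarity):
 * each node stores three skip pointers to randomly chosen ancestors, which
 * is what lets __bch2_snapshot_is_ancestor() walk toward a distant ancestor
 * in large steps - O(log(depth)) expected, as in a conventional skiplist -
 * instead of one parent at a time.
 */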
u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id)
{
	if (!id)
		return 0;

	guard(rcu)();
	const struct snapshot_t *s = snapshot_t(c, id);
	return s->parent
		? bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth))
		: id;
}

static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s)
{
	unsigned i;

	for (i = 0; i < 3; i++)
		if (!s.parent) {
			if (s.skip[i])
				return false;
		} else {
			if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i])))
				return false;
		}

	return true;
}

/*
 * snapshot_tree pointer was incorrect: look up root snapshot node, make sure
 * its snapshot_tree pointer is correct (allocate new one if necessary), then
 * update this node's pointer to root node's pointer:
 */
static int snapshot_tree_ptr_repair(struct btree_trans *trans,
				    struct btree_iter *iter,
				    struct bkey_s_c k,
				    struct bch_snapshot *s)
{
	struct bch_fs *c = trans->c;
	struct btree_iter root_iter;
	struct bch_snapshot_tree s_t;
	struct bkey_s_c_snapshot root;
	struct bkey_i_snapshot *u;
	u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id;
	int ret;

	root = bch2_bkey_get_iter_typed(trans, &root_iter,
			       BTREE_ID_snapshots, POS(0, root_id),
			       BTREE_ITER_with_updates, snapshot);
	ret = bkey_err(root);
	if (ret)
		goto err;

	tree_id = le32_to_cpu(root.v->tree);

	ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) {
		u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u) ?:
			bch2_snapshot_tree_create(trans, root_id,
				bch2_snapshot_oldest_subvol(c, root_id, NULL),
				&tree_id);
		if (ret)
			goto err;

		u->v.tree = cpu_to_le32(tree_id);
		if (k.k->p.offset == root_id)
			*s = u->v;
	}

	if (k.k->p.offset != root_id) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.tree = cpu_to_le32(tree_id);
		*s = u->v;
	}
err:
	bch2_trans_iter_exit(trans, &root_iter);
	return ret;
}

static int check_snapshot(struct btree_trans *trans,
			  struct btree_iter *iter,
			  struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bch_snapshot s;
	struct bch_subvolume subvol;
	struct bch_snapshot v;
	struct bkey_i_snapshot *u;
	u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
	u32 real_depth;
	struct printbuf buf = PRINTBUF;
	u32 i, id;
	int ret = 0;

	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	memset(&s, 0, sizeof(s));
	memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k)));

	if (BCH_SNAPSHOT_DELETED(&s))
		return 0;

	id = le32_to_cpu(s.parent);
	if (id) {
		ret = bch2_snapshot_lookup(trans, id, &v);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot with nonexistent parent:\n %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		if (ret)
			goto err;

		if (le32_to_cpu(v.children[0]) != k.k->p.offset &&
		    le32_to_cpu(v.children[1]) != k.k->p.offset) {
			bch_err(c, "snapshot parent %u missing pointer to child %llu",
				id, k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	}

	for (i = 0; i < 2 && s.children[i]; i++) {
		id = le32_to_cpu(s.children[i]);

		ret = bch2_snapshot_lookup(trans, id, &v);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot node %llu has nonexistent child %u",
				k.k->p.offset, id);
		if (ret)
			goto err;

		if (le32_to_cpu(v.parent) != k.k->p.offset) {
			bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
				id, le32_to_cpu(v.parent), k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	}

	bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
		!BCH_SNAPSHOT_WILL_DELETE(&s);

	if (should_have_subvol) {
		id = le32_to_cpu(s.subvol);
		ret = bch2_subvolume_get(trans, id, false, &subvol);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot points to nonexistent subvolume:\n %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		if (ret)
			goto err;

		if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) {
			bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
				k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	} else {
		if (fsck_err_on(s.subvol,
				trans, snapshot_should_not_have_subvol,
				"snapshot should not point to subvol:\n%s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
			u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
			ret = PTR_ERR_OR_ZERO(u);
			if (ret)
				goto err;

			u->v.subvol = 0;
			s = u->v;
		}
	}

	ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree));
	if (ret < 0)
		goto err;

	if (fsck_err_on(!ret,
			trans, snapshot_to_bad_snapshot_tree,
			"snapshot points to missing/incorrect tree:\n%s",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
		if (ret)
			goto err;
	}
	ret = 0;

	real_depth = bch2_snapshot_depth(c, parent_id);

	if (fsck_err_on(le32_to_cpu(s.depth) != real_depth,
			trans, snapshot_bad_depth,
			"snapshot with incorrect depth field, should be %u:\n%s",
			real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.depth = cpu_to_le32(real_depth);
		s = u->v;
	}

	ret = snapshot_skiplist_good(trans, k.k->p.offset, s);
	if (ret < 0)
		goto err;

	if (fsck_err_on(!ret,
			trans, snapshot_bad_skiplist,
			"snapshot with bad skiplist field:\n%s",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		for (i = 0; i < ARRAY_SIZE(u->v.skip); i++)
			u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id));

		bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32);
		s = u->v;
	}
	ret = 0;
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

int bch2_check_snapshots(struct bch_fs *c)
{
	/*
	 * We iterate backwards as checking/fixing the depth field requires that
	 * the parent's depth already be correct:
	 */
	int ret = bch2_trans_run(c,
		for_each_btree_key_reverse_commit(trans, iter,
			BTREE_ID_snapshots, POS_MAX,
			BTREE_ITER_prefetch, k,
			NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		check_snapshot(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}

static int check_snapshot_exists(struct btree_trans *trans, u32 id)
{
	struct bch_fs *c = trans->c;

	/* Do we need to reconstruct the snapshot_tree entry as well? */
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;
	u32 tree_id = 0;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN,
				     0, k, ret) {
		if (k.k->type == KEY_TYPE_snapshot_tree &&
		    le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) {
			tree_id = k.k->p.offset;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		return ret;

	if (!tree_id) {
		ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
		if (ret)
			return ret;
	}

	struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot));
	ret = PTR_ERR_OR_ZERO(snapshot);
	if (ret)
		return ret;

	bkey_snapshot_init(&snapshot->k_i);
	snapshot->k.p = POS(0, id);
	snapshot->v.tree = cpu_to_le32(tree_id);
	snapshot->v.btime.lo = cpu_to_le64(bch2_current_time(c));

	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
				     0, k, ret) {
		if (k.k->type == KEY_TYPE_subvolume &&
		    le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) {
			snapshot->v.subvol = cpu_to_le32(k.k->p.offset);
			SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true);
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	return bch2_snapshot_table_make_room(c, id) ?:
		bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0);
}

/* Figure out which snapshot nodes belong in the same tree: */
struct snapshot_tree_reconstruct {
	enum btree_id btree;
	struct bpos cur_pos;
	snapshot_id_list cur_ids;
	DARRAY(snapshot_id_list) trees;
};

static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r)
{
	darray_for_each(r->trees, i)
		darray_exit(i);
	darray_exit(&r->trees);
	darray_exit(&r->cur_ids);
}

static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos)
{
	return r->btree == BTREE_ID_inodes
		? r->cur_pos.offset == pos.offset
		: r->cur_pos.inode == pos.inode;
}

static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r)
{
	return darray_find_p(*l, i, snapshot_list_has_id(r, *i)) != NULL;
}

static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s)
{
	bool first = true;
	darray_for_each(*s, i) {
		if (!first)
			prt_char(out, ' ');
		first = false;
		prt_printf(out, "%u", *i);
	}
}

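/*
 * Merge the ids seen at the current position into the set of candidate
 * trees (note added for clarity): if cur_ids shares any id with an existing
 * candidate, the two must belong to the same snapshot tree and are merged;
 * otherwise cur_ids becomes a new candidate tree.
 */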
static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r)
{
	if (r->cur_ids.nr) {
		darray_for_each(r->trees, i)
			if (snapshot_id_lists_have_common(i, &r->cur_ids)) {
				int ret = snapshot_list_merge(c, i, &r->cur_ids);
				if (ret)
					return ret;
				goto out;
			}
		darray_push(&r->trees, r->cur_ids);
		darray_init(&r->cur_ids);
	}
out:
	r->cur_ids.nr = 0;
	return 0;
}

static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos)
{
	if (!same_snapshot(r, pos))
		snapshot_tree_reconstruct_next(c, r);
	r->cur_pos = pos;
	return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot);
}

int bch2_reconstruct_snapshots(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct printbuf buf = PRINTBUF;
	struct snapshot_tree_reconstruct r = {};
	int ret = 0;

	for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
		if (btree_type_has_snapshots(btree)) {
			r.btree = btree;

			ret = for_each_btree_key(trans, iter, btree, POS_MIN,
					BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({
				get_snapshot_trees(c, &r, k.k->p);
			}));
			if (ret)
				goto err;

			snapshot_tree_reconstruct_next(c, &r);
		}
	}

	darray_for_each(r.trees, t) {
		printbuf_reset(&buf);
		snapshot_id_list_to_text(&buf, t);

		darray_for_each(*t, id) {
			if (fsck_err_on(bch2_snapshot_id_state(c, *id) == SNAPSHOT_ID_empty,
					trans, snapshot_node_missing,
					"snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) {
				if (t->nr > 1) {
					bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
					ret = bch_err_throw(c, fsck_repair_unimplemented);
					goto err;
				}

				ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
						check_snapshot_exists(trans, *id));
				if (ret)
					goto err;
			}
		}
	}
fsck_err:
err:
	bch2_trans_put(trans);
	snapshot_tree_reconstruct_exit(&r);
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}

int __bch2_check_key_has_snapshot(struct btree_trans *trans,
				  struct btree_iter *iter,
				  struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = 0;
	enum snapshot_id_state state = bch2_snapshot_id_state(c, k.k->p.snapshot);

	/* Snapshot was definitively deleted; this error is marked autofix */
	if (fsck_err_on(state == SNAPSHOT_ID_deleted,
			trans, bkey_in_deleted_snapshot,
			"key in deleted snapshot %s, delete?",
			(bch2_btree_id_to_text(&buf, iter->btree_id),
			 prt_char(&buf, ' '),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
		ret = bch2_btree_delete_at(trans, iter,
					   BTREE_UPDATE_internal_snapshot_node) ?: 1;

	if (state == SNAPSHOT_ID_empty) {
		/*
		 * Snapshot missing: we should have caught this with btree_lost_data and
		 * kicked off reconstruct_snapshots, so if we end up here we have no
		 * idea what happened.
		 *
		 * Do not delete unless we know that subvolumes and snapshots
		 * are consistent:
		 *
		 * XXX:
		 *
		 * We could be smarter here, and instead of using the generic
		 * recovery pass ratelimiting, track if there have been any
		 * changes to the snapshots or inodes btrees since those passes
		 * last ran.
		 */
		ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_snapshots) ?: ret;
		ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_subvols) ?: ret;

		if (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots))
			ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret;

		unsigned repair_flags = FSCK_CAN_IGNORE | (!ret ? FSCK_CAN_FIX : 0);

		if (__fsck_err(trans, repair_flags, bkey_in_missing_snapshot,
			       "key in missing snapshot %s, delete?",
			       (bch2_btree_id_to_text(&buf, iter->btree_id),
				prt_char(&buf, ' '),
				bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
			ret = bch2_btree_delete_at(trans, iter,
						   BTREE_UPDATE_internal_snapshot_node) ?: 1;
		}
	}
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

int __bch2_get_snapshot_overwrites(struct btree_trans *trans,
				   enum btree_id btree, struct bpos pos,
				   snapshot_id_list *s)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	for_each_btree_key_reverse_norestart(trans, iter, btree, bpos_predecessor(pos),
					     BTREE_ITER_all_snapshots, k, ret) {
		if (!bkey_eq(k.k->p, pos))
			break;

		if (!bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) ||
		    snapshot_list_has_ancestor(c, s, k.k->p.snapshot))
			continue;

		ret = snapshot_list_add(c, s, k.k->p.snapshot);
		if (ret)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);
	if (ret)
		darray_exit(s);

	return ret;
}

/*
 * Mark a snapshot as deleted, for future cleanup:
 */
int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
{
	struct btree_iter iter;
	struct bkey_i_snapshot *s =
		bch2_bkey_get_mut_typed(trans, &iter,
				    BTREE_ID_snapshots, POS(0, id),
				    0, snapshot);
	int ret = PTR_ERR_OR_ZERO(s);
	if (unlikely(ret)) {
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
					trans->c, "missing snapshot %u", id);
		return ret;
	}

	/* already deleted? */
	if (BCH_SNAPSHOT_WILL_DELETE(&s->v))
		goto err;

	SET_BCH_SNAPSHOT_WILL_DELETE(&s->v, true);
	SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
	s->v.subvol = 0;
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
{
	if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1]))
		swap(s->children[0], s->children[1]);
}

static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter, p_iter = {};
	struct btree_iter c_iter = {};
	struct btree_iter tree_iter = {};
	u32 parent_id, child_id;
	unsigned i;
	int ret = 0;

	struct bkey_i_snapshot *s =
		bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
					BTREE_ITER_intent, snapshot);
	ret = PTR_ERR_OR_ZERO(s);
	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
				"missing snapshot %u", id);

	if (ret)
		goto err;

	BUG_ON(BCH_SNAPSHOT_DELETED(&s->v));
	BUG_ON(s->v.children[1]);

	parent_id = le32_to_cpu(s->v.parent);
	child_id = le32_to_cpu(s->v.children[0]);

	if (parent_id) {
		struct bkey_i_snapshot *parent;

		parent = bch2_bkey_get_mut_typed(trans, &p_iter,
				     BTREE_ID_snapshots, POS(0, parent_id),
				     0, snapshot);
		ret = PTR_ERR_OR_ZERO(parent);
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
					"missing snapshot %u", parent_id);
		if (unlikely(ret))
			goto err;

		/* find entry in parent->children for node being deleted */
		for (i = 0; i < 2; i++)
			if (le32_to_cpu(parent->v.children[i]) == id)
				break;

		if (bch2_fs_inconsistent_on(i == 2, c,
					    "snapshot %u missing child pointer to %u",
					    parent_id, id))
			goto err;

		parent->v.children[i] = cpu_to_le32(child_id);

		normalize_snapshot_child_pointers(&parent->v);
	}

	if (child_id) {
		struct bkey_i_snapshot *child;

		child = bch2_bkey_get_mut_typed(trans, &c_iter,
				     BTREE_ID_snapshots, POS(0, child_id),
				     0, snapshot);
		ret = PTR_ERR_OR_ZERO(child);
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
					"missing snapshot %u", child_id);
		if (unlikely(ret))
			goto err;

		child->v.parent = cpu_to_le32(parent_id);

		if (!child->v.parent) {
			child->v.skip[0] = 0;
			child->v.skip[1] = 0;
			child->v.skip[2] = 0;
		}
	}

	if (!parent_id) {
		/*
		 * We're deleting the root of a snapshot tree: update the
		 * snapshot_tree entry to point to the new root, or delete it if
		 * this is the last snapshot ID in this tree:
		 */
		struct bkey_i_snapshot_tree *s_t;

		BUG_ON(s->v.children[1]);

		s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
				BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s->v.tree)),
				0, snapshot_tree);
		ret = PTR_ERR_OR_ZERO(s_t);
		if (ret)
			goto err;

		if (s->v.children[0]) {
			s_t->v.root_snapshot = s->v.children[0];
		} else {
			s_t->k.type = KEY_TYPE_deleted;
			set_bkey_val_u64s(&s_t->k, 0);
		}
	}

	if (!bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)) {
		SET_BCH_SNAPSHOT_DELETED(&s->v, true);
		s->v.parent = 0;
		s->v.children[0] = 0;
		s->v.children[1] = 0;
		s->v.subvol = 0;
		s->v.tree = 0;
		s->v.depth = 0;
		s->v.skip[0] = 0;
		s->v.skip[1] = 0;
		s->v.skip[2] = 0;
	} else {
		s->k.type = KEY_TYPE_deleted;
		set_bkey_val_u64s(&s->k, 0);
	}
err:
	bch2_trans_iter_exit(trans, &tree_iter);
	bch2_trans_iter_exit(trans, &p_iter);
	bch2_trans_iter_exit(trans, &c_iter);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
			  u32 *new_snapids,
			  u32 *snapshot_subvols,
			  unsigned nr_snapids)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_i_snapshot *n;
	struct bkey_s_c k;
	unsigned i, j;
	u32 depth = bch2_snapshot_depth(c, parent);
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
			     POS_MIN, BTREE_ITER_intent);
	k = bch2_btree_iter_peek(trans, &iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	for (i = 0; i < nr_snapids; i++) {
		k = bch2_btree_iter_prev_slot(trans, &iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		if (!k.k || !k.k->p.offset) {
			ret = bch_err_throw(c, ENOSPC_snapshot_create);
			goto err;
		}

		n = bch2_bkey_alloc(trans, &iter, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(n);
		if (ret)
			goto err;

		n->v.flags = 0;
		n->v.parent = cpu_to_le32(parent);
		n->v.subvol = cpu_to_le32(snapshot_subvols[i]);
		n->v.tree = cpu_to_le32(tree);
		n->v.depth = cpu_to_le32(depth);
		n->v.btime.lo = cpu_to_le64(bch2_current_time(c));
		n->v.btime.hi = 0;

		for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
			n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));

		bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
		SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);

		ret = __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
					   bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
		if (ret)
			goto err;

		new_snapids[i] = iter.pos.offset;
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Create new snapshot IDs as children of an existing snapshot ID:
 */
static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent,
					      u32 *new_snapids,
					      u32 *snapshot_subvols,
					      unsigned nr_snapids)
{
	struct btree_iter iter;
	struct bkey_i_snapshot *n_parent;
	int ret = 0;

	n_parent = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_snapshots, POS(0, parent),
			0, snapshot);
	ret = PTR_ERR_OR_ZERO(n_parent);
	if (unlikely(ret)) {
		if (bch2_err_matches(ret, ENOENT))
			bch_err(trans->c, "snapshot %u not found", parent);
		return ret;
	}

	if (n_parent->v.children[0] || n_parent->v.children[1]) {
		bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
		ret = -EINVAL;
		goto err;
	}

	ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree),
			     new_snapids, snapshot_subvols, nr_snapids);
	if (ret)
		goto err;

	n_parent->v.children[0] = cpu_to_le32(new_snapids[0]);
	n_parent->v.children[1] = cpu_to_le32(new_snapids[1]);
	n_parent->v.subvol = 0;
	SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Create a snapshot node that is the root of a new tree:
 */
static int bch2_snapshot_node_create_tree(struct btree_trans *trans,
					  u32 *new_snapids,
					  u32 *snapshot_subvols,
					  unsigned nr_snapids)
{
	struct bkey_i_snapshot_tree *n_tree;
	int ret;

	n_tree = __bch2_snapshot_tree_create(trans);
	ret = PTR_ERR_OR_ZERO(n_tree) ?:
		create_snapids(trans, 0, n_tree->k.p.offset,
			       new_snapids, snapshot_subvols, nr_snapids);
	if (ret)
		return ret;

	n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]);
	n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]);
	return 0;
}

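/*
 * Contract (noted for clarity, matching the BUG_ON()s below): parent == 0
 * creates a single root node in a brand new tree (nr_snapids must be 1);
 * a nonzero parent creates exactly two children under it (nr_snapids must
 * be 2). E.g. (illustrative; subvol1/subvol2 are placeholder subvolume
 * IDs):
 *
 *	u32 snapids[2], subvols[2] = { subvol1, subvol2 };
 *	ret = bch2_snapshot_node_create(trans, parent, snapids, subvols, 2);
 */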
int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
			      u32 *new_snapids,
			      u32 *snapshot_subvols,
			      unsigned nr_snapids)
{
	BUG_ON((parent == 0) != (nr_snapids == 1));
	BUG_ON((parent != 0) != (nr_snapids == 2));

	return parent
		? bch2_snapshot_node_create_children(trans, parent,
			new_snapids, snapshot_subvols, nr_snapids)
		: bch2_snapshot_node_create_tree(trans,
			new_snapids, snapshot_subvols, nr_snapids);
}

/*
 * If we have an unlinked inode in an internal snapshot node, and the inode
 * really has been deleted in all child snapshots, how does this get cleaned up?
 *
 * First there is the problem of how keys that have been overwritten in all
 * child snapshots get deleted (unimplemented?), but inodes may perhaps be
 * special?
 *
 * Also: an unlinked inode in an internal snapshot appears not to be getting
 * deleted correctly if the inode doesn't exist in leaf snapshots.
 *
 * Solution:
 *
 * For a key in an interior snapshot node that needs work to be done that
 * requires it to be mutated: iterate over all descendant leaf nodes and copy
 * that key to snapshot leaf nodes, where we can mutate it
 */

static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id)
{
	struct snapshot_interior_delete *i = darray_find_p(*l, i, i->id == id);
	return i ? i->live_child : 0;
}

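/*
 * Find a live descendant to inherit the keys of a dying interior node
 * (note added for clarity): prefer a direct child that isn't itself being
 * deleted, otherwise recurse until one is found.
 */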
static unsigned __live_child(struct snapshot_table *t, u32 id,
			     snapshot_id_list *delete_leaves,
			     interior_delete_list *delete_interior)
{
	struct snapshot_t *s = __snapshot_t(t, id);
	if (!s)
		return 0;

	for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++)
		if (s->children[i] &&
		    !snapshot_list_has_id(delete_leaves, s->children[i]) &&
		    !interior_delete_has_id(delete_interior, s->children[i]))
			return s->children[i];

	for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) {
		u32 live_child = s->children[i]
			? __live_child(t, s->children[i], delete_leaves, delete_interior)
			: 0;
		if (live_child)
			return live_child;
	}

	return 0;
}

static unsigned live_child(struct bch_fs *c, u32 id)
{
	struct snapshot_delete *d = &c->snapshot_delete;

	guard(rcu)();
	return __live_child(rcu_dereference(c->snapshots), id,
			    &d->delete_leaves, &d->delete_interior);
}

static bool snapshot_id_dying(struct snapshot_delete *d, unsigned id)
{
	return snapshot_list_has_id(&d->delete_leaves, id) ||
		interior_delete_has_id(&d->delete_interior, id) != 0;
}

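/*
 * Process one key during snapshot deletion (note added for clarity): keys
 * in a dying leaf are simply deleted; keys in a dying interior node are
 * moved to its live child, unless the child already has its own version of
 * the key, in which case the interior node's copy is just dropped.
 */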
static int delete_dead_snapshots_process_key(struct btree_trans *trans,
					     struct btree_iter *iter,
					     struct bkey_s_c k)
{
	struct snapshot_delete *d = &trans->c->snapshot_delete;

	if (snapshot_list_has_id(&d->delete_leaves, k.k->p.snapshot))
		return bch2_btree_delete_at(trans, iter,
					    BTREE_UPDATE_internal_snapshot_node);

	u32 live_child = interior_delete_has_id(&d->delete_interior, k.k->p.snapshot);
	if (live_child) {
		struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
		int ret = PTR_ERR_OR_ZERO(new);
		if (ret)
			return ret;

		new->k.p.snapshot = live_child;

		struct btree_iter dst_iter;
		struct bkey_s_c dst_k = bch2_bkey_get_iter(trans, &dst_iter,
							   iter->btree_id, new->k.p,
							   BTREE_ITER_all_snapshots|
							   BTREE_ITER_intent);
		ret = bkey_err(dst_k);
		if (ret)
			return ret;

		ret = (bkey_deleted(dst_k.k)
		       ? bch2_trans_update(trans, &dst_iter, new,
					   BTREE_UPDATE_internal_snapshot_node)
		       : 0) ?:
			bch2_btree_delete_at(trans, iter,
					     BTREE_UPDATE_internal_snapshot_node);
		bch2_trans_iter_exit(trans, &dst_iter);
		return ret;
	}

	return 0;
}

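/*
 * Skip over ranges whose snapshot tree has no nodes being deleted (note
 * added for clarity): when the current inode number belongs to an unrelated
 * tree, advance the iterator past the whole range for that inode instead of
 * visiting every key in it.
 */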
static bool skip_unrelated_snapshot_tree(struct btree_trans *trans, struct btree_iter *iter, u64 *prev_inum)
{
	struct bch_fs *c = trans->c;
	struct snapshot_delete *d = &c->snapshot_delete;

	u64 inum = iter->btree_id != BTREE_ID_inodes
		? iter->pos.inode
		: iter->pos.offset;

	if (*prev_inum == inum)
		return false;

	*prev_inum = inum;

	bool ret = !snapshot_list_has_id(&d->deleting_from_trees,
					 bch2_snapshot_tree(c, iter->pos.snapshot));
	if (unlikely(ret)) {
		struct bpos pos = iter->pos;
		pos.snapshot = 0;
		if (iter->btree_id != BTREE_ID_inodes)
			pos.offset = U64_MAX;
		bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(pos));
	}

	return ret;
}

static int delete_dead_snapshot_keys_v1(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct snapshot_delete *d = &c->snapshot_delete;

	for (d->pos.btree = 0; d->pos.btree < BTREE_ID_NR; d->pos.btree++) {
		struct disk_reservation res = { 0 };
		u64 prev_inum = 0;

		d->pos.pos = POS_MIN;

		if (!btree_type_has_snapshots(d->pos.btree))
			continue;

		int ret = for_each_btree_key_commit(trans, iter,
				d->pos.btree, POS_MIN,
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				&res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
			d->pos.pos = iter.pos;

			if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
				continue;

			delete_dead_snapshots_process_key(trans, &iter, k);
		}));

		bch2_disk_reservation_put(c, &res);

		if (ret)
			return ret;
	}

	return 0;
}

static int delete_dead_snapshot_keys_range(struct btree_trans *trans, enum btree_id btree,
					   struct bpos start, struct bpos end)
{
	struct bch_fs *c = trans->c;
	struct snapshot_delete *d = &c->snapshot_delete;
	struct disk_reservation res = { 0 };

	d->pos.btree = btree;
	d->pos.pos = POS_MIN;

	int ret = for_each_btree_key_max_commit(trans, iter,
			btree, start, end,
			BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
			&res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
		d->pos.pos = iter.pos;
		delete_dead_snapshots_process_key(trans, &iter, k);
	}));

	bch2_disk_reservation_put(c, &res);
	return ret;
}

static int delete_dead_snapshot_keys_v2(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct snapshot_delete *d = &c->snapshot_delete;
	struct disk_reservation res = { 0 };
	u64 prev_inum = 0;
	int ret = 0;

	struct btree_iter iter;
	bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, POS_MIN,
			     BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);

	while (1) {
		struct bkey_s_c k;
		ret = lockrestart_do(trans,
				bkey_err(k = bch2_btree_iter_peek(trans, &iter)));
		if (ret)
			break;

		if (!k.k)
			break;

		d->pos.btree = iter.btree_id;
		d->pos.pos = iter.pos;

		if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
			continue;

		if (snapshot_id_dying(d, k.k->p.snapshot)) {
			struct bpos start = POS(k.k->p.offset, 0);
			struct bpos end = POS(k.k->p.offset, U64_MAX);

			ret = delete_dead_snapshot_keys_range(trans, BTREE_ID_extents, start, end) ?:
				delete_dead_snapshot_keys_range(trans, BTREE_ID_dirents, start, end) ?:
				delete_dead_snapshot_keys_range(trans, BTREE_ID_xattrs, start, end);
			if (ret)
				break;

			bch2_btree_iter_set_pos(trans, &iter, POS(0, k.k->p.offset + 1));
		} else {
			bch2_btree_iter_advance(trans, &iter);
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		goto err;

	prev_inum = 0;
	ret = for_each_btree_key_commit(trans, iter,
			BTREE_ID_inodes, POS_MIN,
			BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
			&res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
		d->pos.btree = iter.btree_id;
		d->pos.pos = iter.pos;

		if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
			continue;

		delete_dead_snapshots_process_key(trans, &iter, k);
	}));
err:
	bch2_disk_reservation_put(c, &res);
	return ret;
}

/*
 * For a given snapshot, if it doesn't have a subvolume that points to it, and
 * it doesn't have child snapshot nodes - it's now redundant and we can mark it
 * as deleted.
 */
static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k)
{
	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	struct bch_fs *c = trans->c;
	struct snapshot_delete *d = &c->snapshot_delete;
	struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
	unsigned live_children = 0;
	int ret = 0;

	if (BCH_SNAPSHOT_SUBVOL(s.v))
		return 0;

	if (BCH_SNAPSHOT_DELETED(s.v))
		return 0;

	mutex_lock(&d->progress_lock);
	for (unsigned i = 0; i < 2; i++) {
		u32 child = le32_to_cpu(s.v->children[i]);

		live_children += child &&
			!snapshot_list_has_id(&d->delete_leaves, child);
	}

	u32 tree = bch2_snapshot_tree(c, s.k->p.offset);

	if (live_children == 0) {
		ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?:
			snapshot_list_add(c, &d->delete_leaves, s.k->p.offset);
	} else if (live_children == 1) {
		struct snapshot_interior_delete n = {
			.id = s.k->p.offset,
			.live_child = live_child(c, s.k->p.offset),
		};

		if (!n.live_child) {
			bch_err(c, "error finding live child of snapshot %u", n.id);
			ret = -EINVAL;
		} else {
			ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?:
				darray_push(&d->delete_interior, n);
		}
	}
	mutex_unlock(&d->progress_lock);

	return ret;
}

static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
						interior_delete_list *skip)
{
	guard(rcu)();
	while (interior_delete_has_id(skip, id))
		id = __bch2_snapshot_parent(c, id);

	while (n--) {
		do {
			id = __bch2_snapshot_parent(c, id);
		} while (interior_delete_has_id(skip, id));
	}

	return id;
}

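/*
 * Repair a surviving node after interior nodes above it have been deleted
 * (note added for clarity): its depth shrinks by the number of deleted
 * ancestors, and any skiplist entry that pointed at a deleted node is
 * re-rolled to point at a random remaining ancestor.
 */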
static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
					      struct btree_iter *iter, struct bkey_s_c k,
					      interior_delete_list *deleted)
{
	struct bch_fs *c = trans->c;
	u32 nr_deleted_ancestors = 0;
	struct bkey_i_snapshot *s;
	int ret;

	if (!bch2_snapshot_exists(c, k.k->p.offset))
		return 0;

	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	if (interior_delete_has_id(deleted, k.k->p.offset))
		return 0;

	s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot);
	ret = PTR_ERR_OR_ZERO(s);
	if (ret)
		return ret;

	darray_for_each(*deleted, i)
		nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, i->id);

	if (!nr_deleted_ancestors)
		return 0;

	le32_add_cpu(&s->v.depth, -nr_deleted_ancestors);

	if (!s->v.depth) {
		s->v.skip[0] = 0;
		s->v.skip[1] = 0;
		s->v.skip[2] = 0;
	} else {
		u32 depth = le32_to_cpu(s->v.depth);
		u32 parent = bch2_snapshot_parent(c, s->k.p.offset);

		for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) {
			u32 id = le32_to_cpu(s->v.skip[j]);

			if (interior_delete_has_id(deleted, id)) {
				id = bch2_snapshot_nth_parent_skip(c,
							parent,
							depth > 1
							? get_random_u32_below(depth - 1)
							: 0,
							deleted);
				s->v.skip[j] = cpu_to_le32(id);
			}
		}

		bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32);
	}

	return bch2_trans_update(trans, iter, &s->k_i, 0);
}

static void bch2_snapshot_delete_nodes_to_text(struct printbuf *out, struct snapshot_delete *d)
{
	prt_printf(out, "deleting from trees");
	darray_for_each(d->deleting_from_trees, i)
		prt_printf(out, " %u", *i);
	prt_newline(out);

	prt_printf(out, "deleting leaves");
	darray_for_each(d->delete_leaves, i)
		prt_printf(out, " %u", *i);
	prt_newline(out);

	prt_printf(out, "interior");
	darray_for_each(d->delete_interior, i)
		prt_printf(out, " %u->%u", i->id, i->live_child);
	prt_newline(out);
}

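/*
 * The main deletion path: walk the snapshots btree classifying dead nodes as
 * leaves (no live children) or redundant interior nodes (one live child), log
 * what we're about to do, delete (or move, for interior nodes) keys in dying
 * snapshots, then delete the snapshot nodes themselves and repair the
 * depth/skiplist fields of the survivors:
 */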
int __bch2_delete_dead_snapshots(struct bch_fs *c)
{
	struct snapshot_delete *d = &c->snapshot_delete;
	int ret = 0;

	if (!mutex_trylock(&d->lock))
		return 0;

	if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
		goto out_unlock;

	struct btree_trans *trans = bch2_trans_get(c);

	/*
	 * For every snapshot node: If we have no live children and it's not
	 * pointed to by a subvolume, delete it:
	 */
	d->running = true;
	d->pos = BBPOS_MIN;

	ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k,
			check_should_delete_snapshot(trans, k));
	if (!bch2_err_matches(ret, EROFS))
		bch_err_msg(c, ret, "walking snapshots");
	if (ret)
		goto err;

	if (!d->delete_leaves.nr && !d->delete_interior.nr)
		goto err;

	{
		struct printbuf buf = PRINTBUF;
		bch2_snapshot_delete_nodes_to_text(&buf, d);

		ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf));
		printbuf_exit(&buf);
		if (ret)
			goto err;
	}

	ret = !bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)
		? delete_dead_snapshot_keys_v2(trans)
		: delete_dead_snapshot_keys_v1(trans);
	if (!bch2_err_matches(ret, EROFS))
		bch_err_msg(c, ret, "deleting keys from dying snapshots");
	if (ret)
		goto err;

	darray_for_each(d->delete_leaves, i) {
		ret = commit_do(trans, NULL, NULL, 0,
			bch2_snapshot_node_delete(trans, *i));
		if (!bch2_err_matches(ret, EROFS))
			bch_err_msg(c, ret, "deleting snapshot %u", *i);
		if (ret)
			goto err;
	}

	/*
	 * Fixing children of deleted snapshots can't be done completely
	 * atomically, if we crash between here and when we delete the interior
	 * nodes some depth fields will be off:
	 */
	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
				  BTREE_ITER_intent, k,
				  NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &d->delete_interior));
	if (ret)
		goto err;

	darray_for_each(d->delete_interior, i) {
		ret = commit_do(trans, NULL, NULL, 0,
			bch2_snapshot_node_delete(trans, i->id));
		if (!bch2_err_matches(ret, EROFS))
			bch_err_msg(c, ret, "deleting snapshot %u", i->id);
		if (ret)
			goto err;
	}
err:
	mutex_lock(&d->progress_lock);
	darray_exit(&d->deleting_from_trees);
	darray_exit(&d->delete_interior);
	darray_exit(&d->delete_leaves);
	d->running = false;
	mutex_unlock(&d->progress_lock);
	bch2_trans_put(trans);

	bch2_recovery_pass_set_no_ratelimit(c, BCH_RECOVERY_PASS_check_snapshots);
out_unlock:
	mutex_unlock(&d->lock);
	if (!bch2_err_matches(ret, EROFS))
		bch_err_fn(c, ret);
	return ret;
}

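/* As __bch2_delete_dead_snapshots(), gated on the auto_snapshot_deletion option: */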
int bch2_delete_dead_snapshots(struct bch_fs *c)
{
	if (!c->opts.auto_snapshot_deletion)
		return 0;

	return __bch2_delete_dead_snapshots(c);
}

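/*
 * Workqueue entry point for background deletion; the write ref taken in
 * bch2_delete_dead_snapshots_async() is dropped here:
 */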
void bch2_delete_dead_snapshots_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete.work);

	set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name);

	bch2_delete_dead_snapshots(c);
	enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots);
}

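/*
 * Kick off deletion in the background: take a ref on c->writes so the
 * filesystem stays read-write, then punt to a workqueue:
 */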
void bch2_delete_dead_snapshots_async(struct bch_fs *c)
{
	if (!c->opts.auto_snapshot_deletion)
		return;

	if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_delete_dead_snapshots))
		return;

	BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));

	if (!queue_work(system_long_wq, &c->snapshot_delete.work))
		enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots);
}

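/* Report the progress of a running deletion: the node lists and current position: */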
void bch2_snapshot_delete_status_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct snapshot_delete *d = &c->snapshot_delete;

	if (!d->running) {
		prt_str(out, "(not running)");
		return;
	}

	mutex_lock(&d->progress_lock);
	bch2_snapshot_delete_nodes_to_text(out, d);

	bch2_bbpos_to_text(out, d->pos);
	mutex_unlock(&d->progress_lock);
}

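/*
 * Returns 1 if the key at @pos is overwritten in a descendant snapshot - i.e.
 * there's a key at the same position in a snapshot descended from
 * @pos.snapshot; descendants have lower snapshot ids, hence the backwards scan:
 */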
int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
				       enum btree_id id,
				       struct bpos pos)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	for_each_btree_key_reverse_norestart(trans, iter, id, bpos_predecessor(pos),
					     BTREE_ITER_not_extents|
					     BTREE_ITER_all_snapshots,
					     k, ret) {
		if (!bkey_eq(pos, k.k->p))
			break;

		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
			ret = 1;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}

static bool interior_snapshot_needs_delete(struct bkey_s_c_snapshot snap)
{
	/* If there's one child, it's redundant and keys will be moved to the child */
	return !!snap.v->children[0] + !!snap.v->children[1] == 1;
}

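/*
 * Called while reading the snapshots btree at startup: schedule a deletion
 * pass if a node is flagged for deletion or is now a redundant interior node:
 */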
static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
{
	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k);
	if (BCH_SNAPSHOT_WILL_DELETE(snap.v) ||
	    interior_snapshot_needs_delete(snap))
		set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags);

	return 0;
}

int bch2_snapshots_read(struct bch_fs *c)
{
	/*
	 * Initializing the is_ancestor bitmaps requires ancestors to already be
	 * initialized - so mark in reverse:
	 */
	int ret = bch2_trans_run(c,
		for_each_btree_key_reverse(trans, iter, BTREE_ID_snapshots,
				   POS_MAX, 0, k,
			__bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
			bch2_check_snapshot_needs_deletion(trans, k)));
	bch_err_fn(c, ret);

	/*
	 * It's important that we check if we need to reconstruct snapshots
	 * before going RW, so we mark that pass as required in the superblock -
	 * otherwise, we could end up deleting keys with missing snapshot nodes
	 * instead
	 */
	BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) &&
	       test_bit(BCH_FS_may_go_rw, &c->flags));

	return ret;
}

void bch2_fs_snapshots_exit(struct bch_fs *c)
{
	kvfree(rcu_dereference_protected(c->snapshots, true));
}

void bch2_fs_snapshots_init_early(struct bch_fs *c)
{
	INIT_WORK(&c->snapshot_delete.work, bch2_delete_dead_snapshots_work);
	mutex_init(&c->snapshot_delete.lock);
	mutex_init(&c->snapshot_delete.progress_lock);
	mutex_init(&c->snapshots_unlinked_lock);
}