// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_key_cache.h"
#include "btree_update.h"
#include "buckets.h"
#include "errcode.h"
#include "error.h"
#include "fs.h"
#include "recovery_passes.h"
#include "snapshot.h"

#include <linux/random.h>

/*
 * Snapshot trees:
 *
 * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they
 * exist to provide a stable identifier for the whole lifetime of a snapshot
 * tree.
 */
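
/*
 * A hedged usage sketch (added for illustration, not part of the original
 * file): a snapshot node's ->tree field names its snapshot_tree key, which
 * records the root snapshot and the "master" (non-snapshot) subvolume:
 *
 *	struct bch_snapshot_tree s_t;
 *	int ret = bch2_snapshot_tree_lookup(trans, le32_to_cpu(snap.tree), &s_t);
 *	if (!ret)
 *		pr_info("root %u master subvol %u",
 *			le32_to_cpu(s_t.root_snapshot),
 *			le32_to_cpu(s_t.master_subvol));
 */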

void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
				struct bkey_s_c k)
{
	struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k);

	prt_printf(out, "subvol %u root snapshot %u",
		   le32_to_cpu(t.v->master_subvol),
		   le32_to_cpu(t.v->root_snapshot));
}

int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k,
				struct bkey_validate_context from)
{
	int ret = 0;

	bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
			 bkey_lt(k.k->p, POS(0, 1)),
			 c, snapshot_tree_pos_bad,
			 "bad pos");
fsck_err:
	return ret;
}

int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
			      struct bch_snapshot_tree *s)
{
	int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
					  BTREE_ITER_with_updates, snapshot_tree, s);

	if (bch2_err_matches(ret, ENOENT))
		ret = -BCH_ERR_ENOENT_snapshot_tree;
	return ret;
}

struct bkey_i_snapshot_tree *
__bch2_snapshot_tree_create(struct btree_trans *trans)
{
	struct btree_iter iter;
	int ret = bch2_bkey_get_empty_slot(trans, &iter,
					   BTREE_ID_snapshot_trees, POS(0, U32_MAX));
	struct bkey_i_snapshot_tree *s_t;

	if (ret == -BCH_ERR_ENOSPC_btree_slot)
		ret = -BCH_ERR_ENOSPC_snapshot_tree;
	if (ret)
		return ERR_PTR(ret);

	s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree);
	ret = PTR_ERR_OR_ZERO(s_t);
	bch2_trans_iter_exit(trans, &iter);
	return ret ? ERR_PTR(ret) : s_t;
}

static int bch2_snapshot_tree_create(struct btree_trans *trans,
				     u32 root_id, u32 subvol_id, u32 *tree_id)
{
	struct bkey_i_snapshot_tree *n_tree =
		__bch2_snapshot_tree_create(trans);

	if (IS_ERR(n_tree))
		return PTR_ERR(n_tree);

	n_tree->v.master_subvol = cpu_to_le32(subvol_id);
	n_tree->v.root_snapshot = cpu_to_le32(root_id);
	*tree_id = n_tree->k.p.offset;
	return 0;
}

/* Snapshot nodes: */

static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor)
{
	while (id && id < ancestor) {
		const struct snapshot_t *s = __snapshot_t(t, id);
		id = s ? s->parent : 0;
	}
	return id == ancestor;
}

static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
{
	rcu_read_lock();
	bool ret = __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
	rcu_read_unlock();

	return ret;
}

static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
{
	const struct snapshot_t *s = __snapshot_t(t, id);
	if (!s)
		return 0;

	if (s->skip[2] <= ancestor)
		return s->skip[2];
	if (s->skip[1] <= ancestor)
		return s->skip[1];
	if (s->skip[0] <= ancestor)
		return s->skip[0];
	return s->parent;
}

static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor)
{
	const struct snapshot_t *s = __snapshot_t(t, id);
	if (!s)
		return false;

	return test_bit(ancestor - id - 1, s->is_ancestor);
}

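/*
 * Ancestor queries in two steps (explanatory note added for clarity): snapshot
 * IDs are allocated so that a parent always has a larger ID than its children,
 * so an ancestor is always numerically above its descendants.  First walk the
 * skip list with get_ancestor_below() until @id is within IS_ANCESTOR_BITMAP
 * of @ancestor, then a single test of bit (ancestor - id - 1) in that node's
 * is_ancestor bitmap answers the query.
 */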
bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
{
	bool ret;

	rcu_read_lock();
	struct snapshot_table *t = rcu_dereference(c->snapshots);

	if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) {
		ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor);
		goto out;
	}

	if (likely(ancestor >= IS_ANCESTOR_BITMAP))
		while (id && id < ancestor - IS_ANCESTOR_BITMAP)
			id = get_ancestor_below(t, id, ancestor);

	ret = id && id < ancestor
		? test_ancestor_bitmap(t, id, ancestor)
		: id == ancestor;

	EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
out:
	rcu_read_unlock();

	return ret;
}

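/*
 * Layout note (added for clarity): snapshot IDs are allocated descending from
 * U32_MAX, so the in-memory table is indexed by idx = U32_MAX - id to keep it
 * densely packed - e.g. id U32_MAX lives at s[0], id U32_MAX - 4 at s[4].
 */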
static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
{
	size_t idx = U32_MAX - id;
	struct snapshot_table *new, *old;

	size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1));
	size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]);

	if (unlikely(new_bytes > INT_MAX))
		return NULL;

	new = kvzalloc(new_bytes, GFP_KERNEL);
	if (!new)
		return NULL;

	new->nr = new_size;

	old = rcu_dereference_protected(c->snapshots, true);
	if (old)
		memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr);

	rcu_assign_pointer(c->snapshots, new);
	kvfree_rcu(old, rcu);

	return &rcu_dereference_protected(c->snapshots,
				lockdep_is_held(&c->snapshot_table_lock))->s[idx];
}

static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
{
	size_t idx = U32_MAX - id;
	struct snapshot_table *table =
		rcu_dereference_protected(c->snapshots,
				lockdep_is_held(&c->snapshot_table_lock));

	lockdep_assert_held(&c->snapshot_table_lock);

	if (likely(table && idx < table->nr))
		return &table->s[idx];

	return __snapshot_t_mut(c, id);
}

void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
			   struct bkey_s_c k)
{
	struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);

	prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u",
		   BCH_SNAPSHOT_SUBVOL(s.v),
		   BCH_SNAPSHOT_DELETED(s.v),
		   le32_to_cpu(s.v->parent),
		   le32_to_cpu(s.v->children[0]),
		   le32_to_cpu(s.v->children[1]),
		   le32_to_cpu(s.v->subvol),
		   le32_to_cpu(s.v->tree));

	if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth))
		prt_printf(out, " depth %u skiplist %u %u %u",
			   le32_to_cpu(s.v->depth),
			   le32_to_cpu(s.v->skip[0]),
			   le32_to_cpu(s.v->skip[1]),
			   le32_to_cpu(s.v->skip[2]));
}

int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k,
			   struct bkey_validate_context from)
{
	struct bkey_s_c_snapshot s;
	u32 i, id;
	int ret = 0;

	bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
			 bkey_lt(k.k->p, POS(0, 1)),
			 c, snapshot_pos_bad,
			 "bad pos");

	s = bkey_s_c_to_snapshot(k);

	id = le32_to_cpu(s.v->parent);
	bkey_fsck_err_on(id && id <= k.k->p.offset,
			 c, snapshot_parent_bad,
			 "bad parent node (%u <= %llu)",
			 id, k.k->p.offset);

	bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]),
			 c, snapshot_children_not_normalized,
			 "children not normalized");

	bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1],
			 c, snapshot_child_duplicate,
			 "duplicate child nodes");

	for (i = 0; i < 2; i++) {
		id = le32_to_cpu(s.v->children[i]);

		bkey_fsck_err_on(id >= k.k->p.offset,
				 c, snapshot_child_bad,
				 "bad child node (%u >= %llu)",
				 id, k.k->p.offset);
	}

	if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
		bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
				 le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]),
				 c, snapshot_skiplist_not_normalized,
				 "skiplist not normalized");

		for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
			id = le32_to_cpu(s.v->skip[i]);

			bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent),
					 c, snapshot_skiplist_bad,
					 "bad skiplist node %u", id);
		}
	}
fsck_err:
	return ret;
}

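/*
 * Trigger for snapshot keys (summary added for clarity): mirrors the on-disk
 * bch_snapshot into the in-memory snapshot_table, and fills in the
 * is_ancestor bitmap, which caches ancestor status for the IS_ANCESTOR_BITMAP
 * IDs immediately above this node.
 */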
static int __bch2_mark_snapshot(struct btree_trans *trans,
				enum btree_id btree, unsigned level,
				struct bkey_s_c old, struct bkey_s_c new,
				enum btree_iter_update_trigger_flags flags)
{
	struct bch_fs *c = trans->c;
	struct snapshot_t *t;
	u32 id = new.k->p.offset;
	int ret = 0;

	mutex_lock(&c->snapshot_table_lock);

	t = snapshot_t_mut(c, id);
	if (!t) {
		ret = -BCH_ERR_ENOMEM_mark_snapshot;
		goto err;
	}

	if (new.k->type == KEY_TYPE_snapshot) {
		struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);

		t->live = true;
		t->parent = le32_to_cpu(s.v->parent);
		t->children[0] = le32_to_cpu(s.v->children[0]);
		t->children[1] = le32_to_cpu(s.v->children[1]);
		t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
		t->tree = le32_to_cpu(s.v->tree);

		if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) {
			t->depth = le32_to_cpu(s.v->depth);
			t->skip[0] = le32_to_cpu(s.v->skip[0]);
			t->skip[1] = le32_to_cpu(s.v->skip[1]);
			t->skip[2] = le32_to_cpu(s.v->skip[2]);
		} else {
			t->depth = 0;
			t->skip[0] = 0;
			t->skip[1] = 0;
			t->skip[2] = 0;
		}

		u32 parent = id;

		while ((parent = bch2_snapshot_parent_early(c, parent)) &&
		       parent - id - 1 < IS_ANCESTOR_BITMAP)
			__set_bit(parent - id - 1, t->is_ancestor);

		if (BCH_SNAPSHOT_DELETED(s.v)) {
			set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
			if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots)
				bch2_delete_dead_snapshots_async(c);
		}
	} else {
		memset(t, 0, sizeof(*t));
	}
err:
	mutex_unlock(&c->snapshot_table_lock);
	return ret;
}

int bch2_mark_snapshot(struct btree_trans *trans,
		       enum btree_id btree, unsigned level,
		       struct bkey_s_c old, struct bkey_s new,
		       enum btree_iter_update_trigger_flags flags)
{
	return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags);
}

int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
			 struct bch_snapshot *s)
{
	return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
				       BTREE_ITER_with_updates, snapshot, s);
}

/* fsck: */

static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
{
	return snapshot_t(c, id)->children[child];
}

static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id)
{
	return bch2_snapshot_child(c, id, 0);
}

static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id)
{
	return bch2_snapshot_child(c, id, 1);
}

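/*
 * Iterate to the next node in a depth-first, left-child-first traversal of a
 * snapshot tree (worked example added for clarity): for a root 9 with children
 * { 8, 7 }, where 8 has children { 6, 5 }, repeated calls starting from 9
 * visit 9, 8, 6, 5, 7, then return 0.
 */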
static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
{
	u32 n, parent;

	n = bch2_snapshot_left_child(c, id);
	if (n)
		return n;

	while ((parent = bch2_snapshot_parent(c, id))) {
		n = bch2_snapshot_right_child(c, parent);
		if (n && n != id)
			return n;
		id = parent;
	}

	return 0;
}

u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
{
	u32 id = snapshot_root;
	u32 subvol = 0, s;

	rcu_read_lock();
	while (id) {
		s = snapshot_t(c, id)->subvol;

		if (s && (!subvol || s < subvol))
			subvol = s;

		id = bch2_snapshot_tree_next(c, id);
	}
	rcu_read_unlock();

	return subvol;
}

static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
					    u32 snapshot_root, u32 *subvol_id)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	bool found = false;
	int ret;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
				     0, k, ret) {
		if (k.k->type != KEY_TYPE_subvolume)
			continue;

		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
		if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
			continue;
		if (!BCH_SUBVOLUME_SNAP(s.v)) {
			*subvol_id = s.k->p.offset;
			found = true;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	if (!ret && !found) {
		struct bkey_i_subvolume *u;

		*subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root);

		u = bch2_bkey_get_mut_typed(trans, &iter,
					    BTREE_ID_subvolumes, POS(0, *subvol_id),
					    0, subvolume);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			return ret;

		SET_BCH_SUBVOLUME_SNAP(&u->v, false);
	}

	return ret;
}

static int check_snapshot_tree(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c_snapshot_tree st;
	struct bch_snapshot s;
	struct bch_subvolume subvol;
	struct printbuf buf = PRINTBUF;
	struct btree_iter snapshot_iter = {};
	u32 root_id;
	int ret;

	if (k.k->type != KEY_TYPE_snapshot_tree)
		return 0;

	st = bkey_s_c_to_snapshot_tree(k);
	root_id = le32_to_cpu(st.v->root_snapshot);

	struct bkey_s_c_snapshot snapshot_k =
		bch2_bkey_get_iter_typed(trans, &snapshot_iter, BTREE_ID_snapshots,
					 POS(0, root_id), 0, snapshot);
	ret = bkey_err(snapshot_k);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	if (!ret)
		bkey_val_copy(&s, snapshot_k);

	if (fsck_err_on(ret ||
			root_id != bch2_snapshot_root(c, root_id) ||
			st.k->p.offset != le32_to_cpu(s.tree),
			trans, snapshot_tree_to_missing_snapshot,
			"snapshot tree points to missing/incorrect snapshot:\n %s",
			(bch2_bkey_val_to_text(&buf, c, st.s_c),
			 prt_newline(&buf),
			 ret
			 ? prt_printf(&buf, "(%s)", bch2_err_str(ret))
			 : bch2_bkey_val_to_text(&buf, c, snapshot_k.s_c),
			 buf.buf))) {
		ret = bch2_btree_delete_at(trans, iter, 0);
		goto err;
	}

	if (!st.v->master_subvol)
		goto out;

	ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), false, &subvol);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	if (fsck_err_on(ret,
			trans, snapshot_tree_to_missing_subvol,
			"snapshot tree points to missing subvolume:\n %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
	    fsck_err_on(!bch2_snapshot_is_ancestor(c,
						   le32_to_cpu(subvol.snapshot),
						   root_id),
			trans, snapshot_tree_to_wrong_subvol,
			"snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
	    fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol),
			trans, snapshot_tree_to_snapshot_subvol,
			"snapshot tree points to snapshot subvolume:\n %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
		struct bkey_i_snapshot_tree *u;
		u32 subvol_id;

		ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
		bch_err_fn(c, ret);

		if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */
			ret = 0;
			goto err;
		}

		if (ret)
			goto err;

		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.master_subvol = cpu_to_le32(subvol_id);
		st = snapshot_tree_i_to_s_c(u);
	}
out:
err:
fsck_err:
	bch2_trans_iter_exit(trans, &snapshot_iter);
	printbuf_exit(&buf);
	return ret;
}

/*
 * For each snapshot_tree, make sure it points to the root of a snapshot tree
 * and that snapshot entry points back to it, or delete it.
 *
 * And, make sure it points to a subvolume within that snapshot tree, or correct
 * it to point to the oldest subvolume within that snapshot tree.
 */
int bch2_check_snapshot_trees(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
			BTREE_ID_snapshot_trees, POS_MIN,
			BTREE_ITER_prefetch, k,
			NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_snapshot_tree(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Look up snapshot tree for @tree_id and find root,
 * make sure @snap_id is a descendent:
 */
static int snapshot_tree_ptr_good(struct btree_trans *trans,
				  u32 snap_id, u32 tree_id)
{
	struct bch_snapshot_tree s_t;
	int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);

	if (bch2_err_matches(ret, ENOENT))
		return 0;
	if (ret)
		return ret;

	return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot));
}

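/*
 * Pick a skip list target (note added for clarity): each node stores three
 * pointers to randomly chosen ancestors - the nth parent with n uniform in
 * [0, depth) - which is what gives get_ancestor_below() its expected
 * logarithmic walk.
 */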
u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id)
{
	const struct snapshot_t *s;

	if (!id)
		return 0;

	rcu_read_lock();
	s = snapshot_t(c, id);
	if (s->parent)
		id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth));
	rcu_read_unlock();

	return id;
}

static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s)
{
	unsigned i;

	for (i = 0; i < 3; i++)
		if (!s.parent) {
			if (s.skip[i])
				return false;
		} else {
			if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i])))
				return false;
		}

	return true;
}

/*
 * snapshot_tree pointer was incorrect: look up root snapshot node, make sure
 * its snapshot_tree pointer is correct (allocate new one if necessary), then
 * update this node's pointer to root node's pointer:
 */
static int snapshot_tree_ptr_repair(struct btree_trans *trans,
				    struct btree_iter *iter,
				    struct bkey_s_c k,
				    struct bch_snapshot *s)
{
	struct bch_fs *c = trans->c;
	struct btree_iter root_iter;
	struct bch_snapshot_tree s_t;
	struct bkey_s_c_snapshot root;
	struct bkey_i_snapshot *u;
	u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id;
	int ret;

	root = bch2_bkey_get_iter_typed(trans, &root_iter,
					BTREE_ID_snapshots, POS(0, root_id),
					BTREE_ITER_with_updates, snapshot);
	ret = bkey_err(root);
	if (ret)
		goto err;

	tree_id = le32_to_cpu(root.v->tree);

	ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) {
		u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u) ?:
			bch2_snapshot_tree_create(trans, root_id,
				bch2_snapshot_tree_oldest_subvol(c, root_id),
				&tree_id);
		if (ret)
			goto err;

		u->v.tree = cpu_to_le32(tree_id);
		if (k.k->p.offset == root_id)
			*s = u->v;
	}

	if (k.k->p.offset != root_id) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.tree = cpu_to_le32(tree_id);
		*s = u->v;
	}
err:
	bch2_trans_iter_exit(trans, &root_iter);
	return ret;
}

static int check_snapshot(struct btree_trans *trans,
			  struct btree_iter *iter,
			  struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bch_snapshot s;
	struct bch_subvolume subvol;
	struct bch_snapshot v;
	struct bkey_i_snapshot *u;
	u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
	u32 real_depth;
	struct printbuf buf = PRINTBUF;
	u32 i, id;
	int ret = 0;

	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	memset(&s, 0, sizeof(s));
	memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k)));

	id = le32_to_cpu(s.parent);
	if (id) {
		ret = bch2_snapshot_lookup(trans, id, &v);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot with nonexistent parent:\n %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		if (ret)
			goto err;

		if (le32_to_cpu(v.children[0]) != k.k->p.offset &&
		    le32_to_cpu(v.children[1]) != k.k->p.offset) {
			bch_err(c, "snapshot parent %u missing pointer to child %llu",
				id, k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	}

	for (i = 0; i < 2 && s.children[i]; i++) {
		id = le32_to_cpu(s.children[i]);

		ret = bch2_snapshot_lookup(trans, id, &v);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot node %llu has nonexistent child %u",
				k.k->p.offset, id);
		if (ret)
			goto err;

		if (le32_to_cpu(v.parent) != k.k->p.offset) {
			bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
				id, le32_to_cpu(v.parent), k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	}

	bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
		!BCH_SNAPSHOT_DELETED(&s);

	if (should_have_subvol) {
		id = le32_to_cpu(s.subvol);
		ret = bch2_subvolume_get(trans, id, false, &subvol);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot points to nonexistent subvolume:\n %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		if (ret)
			goto err;

		if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) {
			bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
				k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	} else {
		if (fsck_err_on(s.subvol,
				trans, snapshot_should_not_have_subvol,
				"snapshot should not point to subvol:\n %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
			u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
			ret = PTR_ERR_OR_ZERO(u);
			if (ret)
				goto err;

			u->v.subvol = 0;
			s = u->v;
		}
	}

	ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree));
	if (ret < 0)
		goto err;

	if (fsck_err_on(!ret,
			trans, snapshot_to_bad_snapshot_tree,
			"snapshot points to missing/incorrect tree:\n %s",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
		if (ret)
			goto err;
	}
	ret = 0;

	real_depth = bch2_snapshot_depth(c, parent_id);

	if (fsck_err_on(le32_to_cpu(s.depth) != real_depth,
			trans, snapshot_bad_depth,
			"snapshot with incorrect depth field, should be %u:\n %s",
			real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.depth = cpu_to_le32(real_depth);
		s = u->v;
	}

	ret = snapshot_skiplist_good(trans, k.k->p.offset, s);
	if (ret < 0)
		goto err;

	if (fsck_err_on(!ret,
			trans, snapshot_bad_skiplist,
			"snapshot with bad skiplist field:\n %s",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		for (i = 0; i < ARRAY_SIZE(u->v.skip); i++)
			u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id));

		bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32);
		s = u->v;
	}
	ret = 0;
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

int bch2_check_snapshots(struct bch_fs *c)
{
	/*
	 * We iterate backwards as checking/fixing the depth field requires that
	 * the parent's depth already be correct:
	 */
	int ret = bch2_trans_run(c,
		for_each_btree_key_reverse_commit(trans, iter,
			BTREE_ID_snapshots, POS_MAX,
			BTREE_ITER_prefetch, k,
			NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_snapshot(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}

static int check_snapshot_exists(struct btree_trans *trans, u32 id)
{
	struct bch_fs *c = trans->c;

	if (bch2_snapshot_exists(c, id))
		return 0;

	/* Do we need to reconstruct the snapshot_tree entry as well? */
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;
	u32 tree_id = 0;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN,
				     0, k, ret) {
		if (le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) {
			tree_id = k.k->p.offset;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		return ret;

	if (!tree_id) {
		ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
		if (ret)
			return ret;
	}

	struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot));
	ret = PTR_ERR_OR_ZERO(snapshot);
	if (ret)
		return ret;

	bkey_snapshot_init(&snapshot->k_i);
	snapshot->k.p = POS(0, id);
	snapshot->v.tree = cpu_to_le32(tree_id);
	snapshot->v.btime.lo = cpu_to_le64(bch2_current_time(c));

	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
				     0, k, ret) {
		if (le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) {
			snapshot->v.subvol = cpu_to_le32(k.k->p.offset);
			SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true);
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?:
		bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
				   bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0);
}

/* Figure out which snapshot nodes belong in the same tree: */
struct snapshot_tree_reconstruct {
	enum btree_id btree;
	struct bpos cur_pos;
	snapshot_id_list cur_ids;
	DARRAY(snapshot_id_list) trees;
};

static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r)
{
	darray_for_each(r->trees, i)
		darray_exit(i);
	darray_exit(&r->trees);
	darray_exit(&r->cur_ids);
}

static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos)
{
	return r->btree == BTREE_ID_inodes
		? r->cur_pos.offset == pos.offset
		: r->cur_pos.inode == pos.inode;
}

static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r)
{
	darray_for_each(*l, i)
		if (snapshot_list_has_id(r, *i))
			return true;
	return false;
}

static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s)
{
	bool first = true;
	darray_for_each(*s, i) {
		if (!first)
			prt_char(out, ' ');
		first = false;
		prt_printf(out, "%u", *i);
	}
}

static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r)
{
	if (r->cur_ids.nr) {
		darray_for_each(r->trees, i)
			if (snapshot_id_lists_have_common(i, &r->cur_ids)) {
				int ret = snapshot_list_merge(c, i, &r->cur_ids);
				if (ret)
					return ret;
				goto out;
			}
		darray_push(&r->trees, r->cur_ids);
		darray_init(&r->cur_ids);
	}
out:
	r->cur_ids.nr = 0;
	return 0;
}

static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos)
{
	if (!same_snapshot(r, pos))
		snapshot_tree_reconstruct_next(c, r);
	r->cur_pos = pos;
	return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot);
}

int bch2_reconstruct_snapshots(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct printbuf buf = PRINTBUF;
	struct snapshot_tree_reconstruct r = {};
	int ret = 0;

	for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
		if (btree_type_has_snapshots(btree)) {
			r.btree = btree;

			ret = for_each_btree_key(trans, iter, btree, POS_MIN,
					BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({
				get_snapshot_trees(c, &r, k.k->p);
			}));
			if (ret)
				goto err;

			snapshot_tree_reconstruct_next(c, &r);
		}
	}

	darray_for_each(r.trees, t) {
		printbuf_reset(&buf);
		snapshot_id_list_to_text(&buf, t);

		darray_for_each(*t, id) {
			if (fsck_err_on(!bch2_snapshot_exists(c, *id),
					trans, snapshot_node_missing,
					"snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) {
				if (t->nr > 1) {
					bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
					ret = -BCH_ERR_fsck_repair_unimplemented;
					goto err;
				}

				ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
						check_snapshot_exists(trans, *id));
				if (ret)
					goto err;
			}
		}
	}
fsck_err:
err:
	bch2_trans_put(trans);
	snapshot_tree_reconstruct_exit(&r);
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}

int bch2_check_key_has_snapshot(struct btree_trans *trans,
				struct btree_iter *iter,
				struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (fsck_err_on(!bch2_snapshot_exists(c, k.k->p.snapshot),
			trans, bkey_in_missing_snapshot,
			"key in missing snapshot %s, delete?",
			(bch2_btree_id_to_text(&buf, iter->btree_id),
			 prt_char(&buf, ' '),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
		ret = bch2_btree_delete_at(trans, iter,
					   BTREE_UPDATE_internal_snapshot_node) ?: 1;
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

/*
 * Mark a snapshot as deleted, for future cleanup:
 */
int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
{
	struct btree_iter iter;
	struct bkey_i_snapshot *s =
		bch2_bkey_get_mut_typed(trans, &iter,
					BTREE_ID_snapshots, POS(0, id),
					0, snapshot);
	int ret = PTR_ERR_OR_ZERO(s);
	if (unlikely(ret)) {
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
					trans->c, "missing snapshot %u", id);
		return ret;
	}

	/* already deleted? */
	if (BCH_SNAPSHOT_DELETED(&s->v))
		goto err;

	SET_BCH_SNAPSHOT_DELETED(&s->v, true);
	SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
	s->v.subvol = 0;
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
{
	if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1]))
		swap(s->children[0], s->children[1]);
}

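/*
 * Unlink one node from a snapshot tree (summary added for clarity): splice the
 * node's single child, if any, into the parent's child slot, so that deleting
 * B from A -> B -> C leaves A -> C; if the root is being deleted, repoint or
 * delete the snapshot_tree entry as well.
 */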
static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
	struct btree_iter c_iter = (struct btree_iter) { NULL };
	struct btree_iter tree_iter = (struct btree_iter) { NULL };
	struct bkey_s_c_snapshot s;
	u32 parent_id, child_id;
	unsigned i;
	int ret = 0;

	s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
				     BTREE_ITER_intent, snapshot);
	ret = bkey_err(s);
	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
				"missing snapshot %u", id);

	if (ret)
		goto err;

	BUG_ON(s.v->children[1]);

	parent_id = le32_to_cpu(s.v->parent);
	child_id = le32_to_cpu(s.v->children[0]);

	if (parent_id) {
		struct bkey_i_snapshot *parent;

		parent = bch2_bkey_get_mut_typed(trans, &p_iter,
						 BTREE_ID_snapshots, POS(0, parent_id),
						 0, snapshot);
		ret = PTR_ERR_OR_ZERO(parent);
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
					"missing snapshot %u", parent_id);
		if (unlikely(ret))
			goto err;

		/* find entry in parent->children for node being deleted */
		for (i = 0; i < 2; i++)
			if (le32_to_cpu(parent->v.children[i]) == id)
				break;

		if (bch2_fs_inconsistent_on(i == 2, c,
					"snapshot %u missing child pointer to %u",
					parent_id, id))
			goto err;

		parent->v.children[i] = cpu_to_le32(child_id);

		normalize_snapshot_child_pointers(&parent->v);
	}

	if (child_id) {
		struct bkey_i_snapshot *child;

		child = bch2_bkey_get_mut_typed(trans, &c_iter,
						BTREE_ID_snapshots, POS(0, child_id),
						0, snapshot);
		ret = PTR_ERR_OR_ZERO(child);
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
					"missing snapshot %u", child_id);
		if (unlikely(ret))
			goto err;

		child->v.parent = cpu_to_le32(parent_id);

		if (!child->v.parent) {
			child->v.skip[0] = 0;
			child->v.skip[1] = 0;
			child->v.skip[2] = 0;
		}
	}

	if (!parent_id) {
		/*
		 * We're deleting the root of a snapshot tree: update the
		 * snapshot_tree entry to point to the new root, or delete it if
		 * this is the last snapshot ID in this tree:
		 */
		struct bkey_i_snapshot_tree *s_t;

		BUG_ON(s.v->children[1]);

		s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
				BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)),
				0, snapshot_tree);
		ret = PTR_ERR_OR_ZERO(s_t);
		if (ret)
			goto err;

		if (s.v->children[0]) {
			s_t->v.root_snapshot = s.v->children[0];
		} else {
			s_t->k.type = KEY_TYPE_deleted;
			set_bkey_val_u64s(&s_t->k, 0);
		}
	}

	ret = bch2_btree_delete_at(trans, &iter, 0);
err:
	bch2_trans_iter_exit(trans, &tree_iter);
	bch2_trans_iter_exit(trans, &p_iter);
	bch2_trans_iter_exit(trans, &c_iter);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
			  u32 *new_snapids,
			  u32 *snapshot_subvols,
			  unsigned nr_snapids)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_i_snapshot *n;
	struct bkey_s_c k;
	unsigned i, j;
	u32 depth = bch2_snapshot_depth(c, parent);
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
			     POS_MIN, BTREE_ITER_intent);
	k = bch2_btree_iter_peek(&iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	for (i = 0; i < nr_snapids; i++) {
		k = bch2_btree_iter_prev_slot(&iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		if (!k.k || !k.k->p.offset) {
			ret = -BCH_ERR_ENOSPC_snapshot_create;
			goto err;
		}

		n = bch2_bkey_alloc(trans, &iter, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(n);
		if (ret)
			goto err;

		n->v.flags = 0;
		n->v.parent = cpu_to_le32(parent);
		n->v.subvol = cpu_to_le32(snapshot_subvols[i]);
		n->v.tree = cpu_to_le32(tree);
		n->v.depth = cpu_to_le32(depth);
		n->v.btime.lo = cpu_to_le64(bch2_current_time(c));
		n->v.btime.hi = 0;

		for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
			n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));

		bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
		SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);

		ret = __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
					   bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
		if (ret)
			goto err;

		new_snapids[i] = iter.pos.offset;
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Create new snapshot IDs as children of an existing snapshot ID:
 */
static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent,
					      u32 *new_snapids,
					      u32 *snapshot_subvols,
					      unsigned nr_snapids)
{
	struct btree_iter iter;
	struct bkey_i_snapshot *n_parent;
	int ret = 0;

	n_parent = bch2_bkey_get_mut_typed(trans, &iter,
					   BTREE_ID_snapshots, POS(0, parent),
					   0, snapshot);
	ret = PTR_ERR_OR_ZERO(n_parent);
	if (unlikely(ret)) {
		if (bch2_err_matches(ret, ENOENT))
			bch_err(trans->c, "snapshot %u not found", parent);
		return ret;
	}

	if (n_parent->v.children[0] || n_parent->v.children[1]) {
		bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
		ret = -EINVAL;
		goto err;
	}

	ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree),
			     new_snapids, snapshot_subvols, nr_snapids);
	if (ret)
		goto err;

	n_parent->v.children[0] = cpu_to_le32(new_snapids[0]);
	n_parent->v.children[1] = cpu_to_le32(new_snapids[1]);
	n_parent->v.subvol = 0;
	SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Create a snapshot node that is the root of a new tree:
 */
static int bch2_snapshot_node_create_tree(struct btree_trans *trans,
					  u32 *new_snapids,
					  u32 *snapshot_subvols,
					  unsigned nr_snapids)
{
	struct bkey_i_snapshot_tree *n_tree;
	int ret;

	n_tree = __bch2_snapshot_tree_create(trans);
	ret = PTR_ERR_OR_ZERO(n_tree) ?:
		create_snapids(trans, 0, n_tree->k.p.offset,
			       new_snapids, snapshot_subvols, nr_snapids);
	if (ret)
		return ret;

	n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]);
	n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]);
	return 0;
}

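/*
 * A hedged usage sketch (illustrative, not from this file): snapshotting a
 * subvolume replaces its leaf snapshot ID with two new children, one for the
 * source subvolume and one for the new snapshot:
 *
 *	u32 new_snapids[2], snapshot_subvols[2] = { new_subvolid, src_subvolid };
 *	ret = bch2_snapshot_node_create(trans, old_snapshot_id,
 *					new_snapids, snapshot_subvols, 2);
 */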
int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
			      u32 *new_snapids,
			      u32 *snapshot_subvols,
			      unsigned nr_snapids)
{
	BUG_ON((parent == 0) != (nr_snapids == 1));
	BUG_ON((parent != 0) != (nr_snapids == 2));

	return parent
		? bch2_snapshot_node_create_children(trans, parent,
			new_snapids, snapshot_subvols, nr_snapids)
		: bch2_snapshot_node_create_tree(trans,
			new_snapids, snapshot_subvols, nr_snapids);

}

/*
 * If we have an unlinked inode in an internal snapshot node, and the inode
 * really has been deleted in all child snapshots, how does this get cleaned up?
 *
 * first there is the problem of how keys that have been overwritten in all
 * child snapshots get deleted (unimplemented?), but inodes may perhaps be
 * special?
 *
 * also: unlinked inode in internal snapshot appears to not be getting deleted
 * correctly if inode doesn't exist in leaf snapshots
 *
 * solution:
 *
 * for a key in an interior snapshot node that needs work to be done that
 * requires it to be mutated: iterate over all descendent leaf nodes and copy
 * that key to snapshot leaf nodes, where we can mutate it
 */

struct snapshot_interior_delete {
	u32 id;
	u32 live_child;
};
typedef DARRAY(struct snapshot_interior_delete) interior_delete_list;

static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id)
{
	darray_for_each(*l, i)
		if (i->id == id)
			return i->live_child;
	return 0;
}

static unsigned __live_child(struct snapshot_table *t, u32 id,
			     snapshot_id_list *delete_leaves,
			     interior_delete_list *delete_interior)
{
	struct snapshot_t *s = __snapshot_t(t, id);
	if (!s)
		return 0;

	for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++)
		if (s->children[i] &&
		    !snapshot_list_has_id(delete_leaves, s->children[i]) &&
		    !interior_delete_has_id(delete_interior, s->children[i]))
			return s->children[i];

	for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) {
		u32 live_child = s->children[i]
			? __live_child(t, s->children[i], delete_leaves, delete_interior)
			: 0;
		if (live_child)
			return live_child;
	}

	return 0;
}

static unsigned live_child(struct bch_fs *c, u32 id,
			   snapshot_id_list *delete_leaves,
			   interior_delete_list *delete_interior)
{
	rcu_read_lock();
	u32 ret = __live_child(rcu_dereference(c->snapshots), id,
			       delete_leaves, delete_interior);
	rcu_read_unlock();
	return ret;
}

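/*
 * Per-key step of dead snapshot deletion (summary added for clarity): keys in
 * a deleted leaf are simply dropped; keys in a deleted interior node are moved
 * to its chosen live child, unless that child already has its own version of
 * the key, in which case the child's version wins.
 */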
static int delete_dead_snapshots_process_key(struct btree_trans *trans,
					     struct btree_iter *iter,
					     struct bkey_s_c k,
					     snapshot_id_list *delete_leaves,
					     interior_delete_list *delete_interior)
{
	if (snapshot_list_has_id(delete_leaves, k.k->p.snapshot))
		return bch2_btree_delete_at(trans, iter,
					    BTREE_UPDATE_internal_snapshot_node);

	u32 live_child = interior_delete_has_id(delete_interior, k.k->p.snapshot);
	if (live_child) {
		struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
		int ret = PTR_ERR_OR_ZERO(new);
		if (ret)
			return ret;

		new->k.p.snapshot = live_child;

		struct btree_iter dst_iter;
		struct bkey_s_c dst_k = bch2_bkey_get_iter(trans, &dst_iter,
							   iter->btree_id, new->k.p,
							   BTREE_ITER_all_snapshots|
							   BTREE_ITER_intent);
		ret = bkey_err(dst_k);
		if (ret)
			return ret;

		ret = (bkey_deleted(dst_k.k)
		       ? bch2_trans_update(trans, &dst_iter, new,
					   BTREE_UPDATE_internal_snapshot_node)
		       : 0) ?:
			bch2_btree_delete_at(trans, iter,
					     BTREE_UPDATE_internal_snapshot_node);
		bch2_trans_iter_exit(trans, &dst_iter);
		return ret;
	}

	return 0;
}

/*
 * For a given snapshot, if it doesn't have a subvolume that points to it, and
 * it doesn't have child snapshot nodes - it's now redundant and we can mark it
 * as deleted.
 */
static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k,
					snapshot_id_list *delete_leaves,
					interior_delete_list *delete_interior)
{
	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	struct bch_fs *c = trans->c;
	struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
	unsigned live_children = 0;

	if (BCH_SNAPSHOT_SUBVOL(s.v))
		return 0;

	for (unsigned i = 0; i < 2; i++) {
		u32 child = le32_to_cpu(s.v->children[i]);

		live_children += child &&
			!snapshot_list_has_id(delete_leaves, child);
	}

	if (live_children == 0) {
		return snapshot_list_add(c, delete_leaves, s.k->p.offset);
	} else if (live_children == 1) {
		struct snapshot_interior_delete d = {
			.id = s.k->p.offset,
			.live_child = live_child(c, s.k->p.offset, delete_leaves, delete_interior),
		};

		if (!d.live_child) {
			bch_err(c, "error finding live child of snapshot %u", d.id);
			return -EINVAL;
		}

		return darray_push(delete_interior, d);
	} else {
		return 0;
	}
}

static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
						interior_delete_list *skip)
{
	rcu_read_lock();
	while (interior_delete_has_id(skip, id))
		id = __bch2_snapshot_parent(c, id);

	while (n--) {
		do {
			id = __bch2_snapshot_parent(c, id);
		} while (interior_delete_has_id(skip, id));
	}
	rcu_read_unlock();

	return id;
}

static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
					      struct btree_iter *iter, struct bkey_s_c k,
					      interior_delete_list *deleted)
{
	struct bch_fs *c = trans->c;
	u32 nr_deleted_ancestors = 0;
	struct bkey_i_snapshot *s;
	int ret;

	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	if (interior_delete_has_id(deleted, k.k->p.offset))
		return 0;

	s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot);
	ret = PTR_ERR_OR_ZERO(s);
	if (ret)
		return ret;

	darray_for_each(*deleted, i)
		nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, i->id);

	if (!nr_deleted_ancestors)
		return 0;

	le32_add_cpu(&s->v.depth, -nr_deleted_ancestors);

	if (!s->v.depth) {
		s->v.skip[0] = 0;
		s->v.skip[1] = 0;
		s->v.skip[2] = 0;
	} else {
		u32 depth = le32_to_cpu(s->v.depth);
		u32 parent = bch2_snapshot_parent(c, s->k.p.offset);

		for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) {
			u32 id = le32_to_cpu(s->v.skip[j]);

			if (interior_delete_has_id(deleted, id)) {
				id = bch2_snapshot_nth_parent_skip(c,
							parent,
							depth > 1
							? get_random_u32_below(depth - 1)
							: 0,
							deleted);
				s->v.skip[j] = cpu_to_le32(id);
			}
		}

		bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32);
	}

	return bch2_trans_update(trans, iter, &s->k_i, 0);
}

int bch2_delete_dead_snapshots(struct bch_fs *c)
{
	if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
		return 0;

	struct btree_trans *trans = bch2_trans_get(c);
	snapshot_id_list delete_leaves = {};
	interior_delete_list delete_interior = {};
	int ret = 0;

	/*
	 * For every snapshot node: If we have no live children and it's not
	 * pointed to by a subvolume, delete it:
	 */
	ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k,
		check_should_delete_snapshot(trans, k, &delete_leaves, &delete_interior));
	if (!bch2_err_matches(ret, EROFS))
		bch_err_msg(c, ret, "walking snapshots");
	if (ret)
		goto err;

	if (!delete_leaves.nr && !delete_interior.nr)
		goto err;

	{
		struct printbuf buf = PRINTBUF;
		prt_printf(&buf, "deleting leaves");
		darray_for_each(delete_leaves, i)
			prt_printf(&buf, " %u", *i);

		prt_printf(&buf, " interior");
		darray_for_each(delete_interior, i)
			prt_printf(&buf, " %u->%u", i->id, i->live_child);

		ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf));
		printbuf_exit(&buf);
		if (ret)
			goto err;
	}

	for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
		struct disk_reservation res = { 0 };

		if (!btree_type_has_snapshots(btree))
			continue;

		ret = for_each_btree_key_commit(trans, iter,
				btree, POS_MIN,
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				&res, NULL, BCH_TRANS_COMMIT_no_enospc,
			delete_dead_snapshots_process_key(trans, &iter, k,
							  &delete_leaves,
							  &delete_interior));

		bch2_disk_reservation_put(c, &res);

		if (!bch2_err_matches(ret, EROFS))
			bch_err_msg(c, ret, "deleting keys from dying snapshots");
		if (ret)
			goto err;
	}

	darray_for_each(delete_leaves, i) {
		ret = commit_do(trans, NULL, NULL, 0,
			bch2_snapshot_node_delete(trans, *i));
		if (!bch2_err_matches(ret, EROFS))
			bch_err_msg(c, ret, "deleting snapshot %u", *i);
		if (ret)
			goto err;
	}

	/*
	 * Fixing children of deleted snapshots can't be done completely
	 * atomically, if we crash between here and when we delete the interior
	 * nodes some depth fields will be off:
	 */
	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
				BTREE_ITER_intent, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &delete_interior));
	if (ret)
		goto err;

	darray_for_each(delete_interior, i) {
		ret = commit_do(trans, NULL, NULL, 0,
			bch2_snapshot_node_delete(trans, i->id));
		if (!bch2_err_matches(ret, EROFS))
			bch_err_msg(c, ret, "deleting snapshot %u", i->id);
		if (ret)
			goto err;
	}
err:
	darray_exit(&delete_interior);
	darray_exit(&delete_leaves);
	bch2_trans_put(trans);
	if (!bch2_err_matches(ret, EROFS))
		bch_err_fn(c, ret);
	return ret;
}

void bch2_delete_dead_snapshots_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);

	set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name);

	bch2_delete_dead_snapshots(c);
	bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
}

void bch2_delete_dead_snapshots_async(struct bch_fs *c)
{
	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots))
		return;

	BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));

	if (!queue_work(c->write_ref_wq, &c->snapshot_delete_work))
		bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
}

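/*
 * Summary (added for clarity): returns nonzero if the key at @pos has been
 * overwritten in a descendant snapshot, i.e. if a key exists at the same
 * position with a snapshot ID that is a descendant of pos.snapshot.
 */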
int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
				       enum btree_id id,
				       struct bpos pos)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	for_each_btree_key_reverse_norestart(trans, iter, id, bpos_predecessor(pos),
					     BTREE_ITER_not_extents|
					     BTREE_ITER_all_snapshots,
					     k, ret) {
		if (!bkey_eq(pos, k.k->p))
			break;

		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
			ret = 1;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}

static bool interior_snapshot_needs_delete(struct bkey_s_c_snapshot snap)
{
	/* If there's one child, it's redundant and keys will be moved to the child */
	return !!snap.v->children[0] + !!snap.v->children[1] == 1;
}

static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
{
	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k);
	if (BCH_SNAPSHOT_DELETED(snap.v) ||
	    interior_snapshot_needs_delete(snap))
		set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags);

	return 0;
}

int bch2_snapshots_read(struct bch_fs *c)
{
	/*
	 * Initializing the is_ancestor bitmaps requires ancestors to already be
	 * initialized - so mark in reverse:
	 */
	int ret = bch2_trans_run(c,
		for_each_btree_key_reverse(trans, iter, BTREE_ID_snapshots,
					   POS_MAX, 0, k,
			__bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
			bch2_check_snapshot_needs_deletion(trans, k)));
	bch_err_fn(c, ret);

	/*
	 * It's important that we check if we need to reconstruct snapshots
	 * before going RW, so we mark that pass as required in the superblock -
	 * otherwise, we could end up deleting keys with missing snapshot nodes
	 * instead
	 */
	BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) &&
	       test_bit(BCH_FS_may_go_rw, &c->flags));

	if (bch2_err_matches(ret, EIO) ||
	    (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)))
		ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots);

	return ret;
}

void bch2_fs_snapshots_exit(struct bch_fs *c)
{
	kvfree(rcu_dereference_protected(c->snapshots, true));
}