// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_key_cache.h"
#include "btree_update.h"
#include "buckets.h"
#include "errcode.h"
#include "error.h"
#include "fs.h"
#include "recovery_passes.h"
#include "snapshot.h"

#include <linux/random.h>

/*
 * Snapshot trees:
 *
 * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they
 * exist to provide a stable identifier for the whole lifetime of a snapshot
 * tree.
 */
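
/*
 * A snapshot_tree key lives at POS(0, tree_id); the value names the tree's
 * root snapshot node and its master (i.e. non-snapshot) subvolume. For
 * example, bch2_snapshot_tree_to_text() might render a value as
 * "subvol 1 root snapshot 4294967295" (illustrative values only).
 */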

void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
				struct bkey_s_c k)
{
	struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k);

	prt_printf(out, "subvol %u root snapshot %u",
		   le32_to_cpu(t.v->master_subvol),
		   le32_to_cpu(t.v->root_snapshot));
}

int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k,
				struct bkey_validate_context from)
{
	int ret = 0;

	bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
			 bkey_lt(k.k->p, POS(0, 1)),
			 c, snapshot_tree_pos_bad,
			 "bad pos");
fsck_err:
	return ret;
}

int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
			      struct bch_snapshot_tree *s)
{
	int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
					  BTREE_ITER_with_updates, snapshot_tree, s);

	if (bch2_err_matches(ret, ENOENT))
		ret = -BCH_ERR_ENOENT_snapshot_tree;
	return ret;
}

struct bkey_i_snapshot_tree *
__bch2_snapshot_tree_create(struct btree_trans *trans)
{
	struct btree_iter iter;
	int ret = bch2_bkey_get_empty_slot(trans, &iter,
			BTREE_ID_snapshot_trees, POS(0, U32_MAX));
	struct bkey_i_snapshot_tree *s_t;

	if (ret == -BCH_ERR_ENOSPC_btree_slot)
		ret = -BCH_ERR_ENOSPC_snapshot_tree;
	if (ret)
		return ERR_PTR(ret);

	s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree);
	ret = PTR_ERR_OR_ZERO(s_t);
	bch2_trans_iter_exit(trans, &iter);
	return ret ? ERR_PTR(ret) : s_t;
}

static int bch2_snapshot_tree_create(struct btree_trans *trans,
				u32 root_id, u32 subvol_id, u32 *tree_id)
{
	struct bkey_i_snapshot_tree *n_tree =
		__bch2_snapshot_tree_create(trans);

	if (IS_ERR(n_tree))
		return PTR_ERR(n_tree);

	n_tree->v.master_subvol = cpu_to_le32(subvol_id);
	n_tree->v.root_snapshot = cpu_to_le32(root_id);
	*tree_id = n_tree->k.p.offset;
	return 0;
}

/* Snapshot nodes: */
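
/*
 * The in-memory snapshot_t table (c->snapshots, RCU protected) mirrors the
 * snapshots btree. Ancestor queries have two paths: before
 * BCH_RECOVERY_PASS_check_snapshots has run we can only walk parent pointers
 * one node at a time (the _early variants); afterwards the skiplist fields
 * and per-node is_ancestor bitmaps make the check cheap - a few skiplist hops
 * plus a single bitmap test.
 */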

static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor)
{
	while (id && id < ancestor) {
		const struct snapshot_t *s = __snapshot_t(t, id);
		id = s ? s->parent : 0;
	}
	return id == ancestor;
}

static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
{
	rcu_read_lock();
	bool ret = __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
	rcu_read_unlock();

	return ret;
}

static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
{
	const struct snapshot_t *s = __snapshot_t(t, id);
	if (!s)
		return 0;

	if (s->skip[2] <= ancestor)
		return s->skip[2];
	if (s->skip[1] <= ancestor)
		return s->skip[1];
	if (s->skip[0] <= ancestor)
		return s->skip[0];
	return s->parent;
}

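/*
 * Each snapshot_t carries an is_ancestor bitmap covering the
 * IS_ANCESTOR_BITMAP IDs immediately above it: bit (a - id - 1) is set iff
 * snapshot a is an ancestor of id (see __bch2_mark_snapshot(), which fills it
 * in). E.g. for id == 100, an ancestor with ID 103 sets bit 2.
 */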
static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor)
{
	const struct snapshot_t *s = __snapshot_t(t, id);
	if (!s)
		return false;

	return test_bit(ancestor - id - 1, s->is_ancestor);
}

bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
{
	bool ret;

	rcu_read_lock();
	struct snapshot_table *t = rcu_dereference(c->snapshots);

	if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) {
		ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor);
		goto out;
	}

	while (id && id < ancestor - IS_ANCESTOR_BITMAP)
		id = get_ancestor_below(t, id, ancestor);

	ret = id && id < ancestor
		? test_ancestor_bitmap(t, id, ancestor)
		: id == ancestor;

	EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
out:
	rcu_read_unlock();

	return ret;
}

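/*
 * New snapshot IDs are allocated below existing ones, counting down from
 * U32_MAX, so the table is indexed by U32_MAX - id; a new (lower) ID may
 * therefore require growing the table, done here by reallocating and copying
 * under snapshot_table_lock while readers are protected by RCU.
 */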
static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
{
	size_t idx = U32_MAX - id;
	struct snapshot_table *new, *old;

	size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1));
	size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]);

	if (unlikely(new_bytes > INT_MAX))
		return NULL;

	new = kvzalloc(new_bytes, GFP_KERNEL);
	if (!new)
		return NULL;

	new->nr = new_size;

	old = rcu_dereference_protected(c->snapshots, true);
	if (old)
		memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr);

	rcu_assign_pointer(c->snapshots, new);
	kvfree_rcu(old, rcu);

	return &rcu_dereference_protected(c->snapshots,
				lockdep_is_held(&c->snapshot_table_lock))->s[idx];
}

static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
{
	size_t idx = U32_MAX - id;
	struct snapshot_table *table =
		rcu_dereference_protected(c->snapshots,
				lockdep_is_held(&c->snapshot_table_lock));

	lockdep_assert_held(&c->snapshot_table_lock);

	if (likely(table && idx < table->nr))
		return &table->s[idx];

	return __snapshot_t_mut(c, id);
}

void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
			   struct bkey_s_c k)
{
	struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);

	prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u",
		   BCH_SNAPSHOT_SUBVOL(s.v),
		   BCH_SNAPSHOT_DELETED(s.v),
		   le32_to_cpu(s.v->parent),
		   le32_to_cpu(s.v->children[0]),
		   le32_to_cpu(s.v->children[1]),
		   le32_to_cpu(s.v->subvol),
		   le32_to_cpu(s.v->tree));

	if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth))
		prt_printf(out, " depth %u skiplist %u %u %u",
			   le32_to_cpu(s.v->depth),
			   le32_to_cpu(s.v->skip[0]),
			   le32_to_cpu(s.v->skip[1]),
			   le32_to_cpu(s.v->skip[2]));
}

int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k,
			   struct bkey_validate_context from)
{
	struct bkey_s_c_snapshot s;
	u32 i, id;
	int ret = 0;

	bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
			 bkey_lt(k.k->p, POS(0, 1)),
			 c, snapshot_pos_bad,
			 "bad pos");

	s = bkey_s_c_to_snapshot(k);

	id = le32_to_cpu(s.v->parent);
	bkey_fsck_err_on(id && id <= k.k->p.offset,
			 c, snapshot_parent_bad,
			 "bad parent node (%u <= %llu)",
			 id, k.k->p.offset);

	bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]),
			 c, snapshot_children_not_normalized,
			 "children not normalized");

	bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1],
			 c, snapshot_child_duplicate,
			 "duplicate child nodes");

	for (i = 0; i < 2; i++) {
		id = le32_to_cpu(s.v->children[i]);

		bkey_fsck_err_on(id >= k.k->p.offset,
				 c, snapshot_child_bad,
				 "bad child node (%u >= %llu)",
				 id, k.k->p.offset);
	}

	if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
		bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
				 le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]),
				 c, snapshot_skiplist_not_normalized,
				 "skiplist not normalized");

		for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
			id = le32_to_cpu(s.v->skip[i]);

			bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent),
					 c, snapshot_skiplist_bad,
					 "bad skiplist node %u", id);
		}
	}
fsck_err:
	return ret;
}

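/*
 * Trigger for snapshot keys: keeps the in-memory c->snapshots table in sync
 * with the btree, and schedules cleanup when a node is marked deleted.
 */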
static int __bch2_mark_snapshot(struct btree_trans *trans,
		       enum btree_id btree, unsigned level,
		       struct bkey_s_c old, struct bkey_s_c new,
		       enum btree_iter_update_trigger_flags flags)
{
	struct bch_fs *c = trans->c;
	struct snapshot_t *t;
	u32 id = new.k->p.offset;
	int ret = 0;

	mutex_lock(&c->snapshot_table_lock);

	t = snapshot_t_mut(c, id);
	if (!t) {
		ret = -BCH_ERR_ENOMEM_mark_snapshot;
		goto err;
	}

	if (new.k->type == KEY_TYPE_snapshot) {
		struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);

		t->live = true;
		t->parent = le32_to_cpu(s.v->parent);
		t->children[0] = le32_to_cpu(s.v->children[0]);
		t->children[1] = le32_to_cpu(s.v->children[1]);
		t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
		t->tree = le32_to_cpu(s.v->tree);

		if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) {
			t->depth = le32_to_cpu(s.v->depth);
			t->skip[0] = le32_to_cpu(s.v->skip[0]);
			t->skip[1] = le32_to_cpu(s.v->skip[1]);
			t->skip[2] = le32_to_cpu(s.v->skip[2]);
		} else {
			t->depth = 0;
			t->skip[0] = 0;
			t->skip[1] = 0;
			t->skip[2] = 0;
		}

		u32 parent = id;

		while ((parent = bch2_snapshot_parent_early(c, parent)) &&
		       parent - id - 1 < IS_ANCESTOR_BITMAP)
			__set_bit(parent - id - 1, t->is_ancestor);

		if (BCH_SNAPSHOT_DELETED(s.v)) {
			set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
			if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots)
				bch2_delete_dead_snapshots_async(c);
		}
	} else {
		memset(t, 0, sizeof(*t));
	}
err:
	mutex_unlock(&c->snapshot_table_lock);
	return ret;
}

int bch2_mark_snapshot(struct btree_trans *trans,
		       enum btree_id btree, unsigned level,
		       struct bkey_s_c old, struct bkey_s new,
		       enum btree_iter_update_trigger_flags flags)
{
	return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags);
}

int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
			 struct bch_snapshot *s)
{
	return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
				       BTREE_ITER_with_updates, snapshot, s);
}

/* fsck: */

static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
{
	return snapshot_t(c, id)->children[child];
}

static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id)
{
	return bch2_snapshot_child(c, id, 0);
}

static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id)
{
	return bch2_snapshot_child(c, id, 1);
}

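/*
 * Depth-first traversal of a snapshot tree: returns the ID of the next node
 * after @id, descending into left children first, or 0 when the walk is done.
 */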
static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
{
	u32 n, parent;

	n = bch2_snapshot_left_child(c, id);
	if (n)
		return n;

	while ((parent = bch2_snapshot_parent(c, id))) {
		n = bch2_snapshot_right_child(c, parent);
		if (n && n != id)
			return n;
		id = parent;
	}

	return 0;
}

static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
{
	u32 id = snapshot_root;
	u32 subvol = 0, s;

	rcu_read_lock();
	while (id) {
		s = snapshot_t(c, id)->subvol;

		if (s && (!subvol || s < subvol))
			subvol = s;

		id = bch2_snapshot_tree_next(c, id);
	}
	rcu_read_unlock();

	return subvol;
}

static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
					    u32 snapshot_root, u32 *subvol_id)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	bool found = false;
	int ret;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
				     0, k, ret) {
		if (k.k->type != KEY_TYPE_subvolume)
			continue;

		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
		if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
			continue;
		if (!BCH_SUBVOLUME_SNAP(s.v)) {
			*subvol_id = s.k->p.offset;
			found = true;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	if (!ret && !found) {
		struct bkey_i_subvolume *u;

		*subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root);

		u = bch2_bkey_get_mut_typed(trans, &iter,
					    BTREE_ID_subvolumes, POS(0, *subvol_id),
					    0, subvolume);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			return ret;

		SET_BCH_SUBVOLUME_SNAP(&u->v, false);
	}

	return ret;
}

static int check_snapshot_tree(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c_snapshot_tree st;
	struct bch_snapshot s;
	struct bch_subvolume subvol;
	struct printbuf buf = PRINTBUF;
	struct btree_iter snapshot_iter = {};
	u32 root_id;
	int ret;

	if (k.k->type != KEY_TYPE_snapshot_tree)
		return 0;

	st = bkey_s_c_to_snapshot_tree(k);
	root_id = le32_to_cpu(st.v->root_snapshot);

	struct bkey_s_c_snapshot snapshot_k =
		bch2_bkey_get_iter_typed(trans, &snapshot_iter, BTREE_ID_snapshots,
					 POS(0, root_id), 0, snapshot);
	ret = bkey_err(snapshot_k);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	if (!ret)
		bkey_val_copy(&s, snapshot_k);

	if (fsck_err_on(ret ||
			root_id != bch2_snapshot_root(c, root_id) ||
			st.k->p.offset != le32_to_cpu(s.tree),
			trans, snapshot_tree_to_missing_snapshot,
			"snapshot tree points to missing/incorrect snapshot:\n %s",
			(bch2_bkey_val_to_text(&buf, c, st.s_c),
			 prt_newline(&buf),
			 ret
			 ? prt_printf(&buf, "(%s)", bch2_err_str(ret))
			 : bch2_bkey_val_to_text(&buf, c, snapshot_k.s_c),
			 buf.buf))) {
		ret = bch2_btree_delete_at(trans, iter, 0);
		goto err;
	}

	if (!st.v->master_subvol)
		goto out;

	ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), false, &subvol);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	if (fsck_err_on(ret,
			trans, snapshot_tree_to_missing_subvol,
			"snapshot tree points to missing subvolume:\n %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
	    fsck_err_on(!bch2_snapshot_is_ancestor(c,
						   le32_to_cpu(subvol.snapshot),
						   root_id),
			trans, snapshot_tree_to_wrong_subvol,
			"snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
	    fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol),
			trans, snapshot_tree_to_snapshot_subvol,
			"snapshot tree points to snapshot subvolume:\n %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
		struct bkey_i_snapshot_tree *u;
		u32 subvol_id;

		ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
		bch_err_fn(c, ret);

		if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */
			ret = 0;
			goto err;
		}

		if (ret)
			goto err;

		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.master_subvol = cpu_to_le32(subvol_id);
		st = snapshot_tree_i_to_s_c(u);
	}
out:
err:
fsck_err:
	bch2_trans_iter_exit(trans, &snapshot_iter);
	printbuf_exit(&buf);
	return ret;
}

/*
 * For each snapshot_tree, make sure it points to the root of a snapshot tree
 * and that snapshot entry points back to it, or delete it.
 *
 * And, make sure it points to a subvolume within that snapshot tree, or correct
 * it to point to the oldest subvolume within that snapshot tree.
 */
int bch2_check_snapshot_trees(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
			BTREE_ID_snapshot_trees, POS_MIN,
			BTREE_ITER_prefetch, k,
			NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		check_snapshot_tree(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Look up snapshot tree for @tree_id and find root,
 * make sure @snap_id is a descendant:
 */
static int snapshot_tree_ptr_good(struct btree_trans *trans,
				  u32 snap_id, u32 tree_id)
{
	struct bch_snapshot_tree s_t;
	int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);

	if (bch2_err_matches(ret, ENOENT))
		return 0;
	if (ret)
		return ret;

	return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot));
}

u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id)
{
	const struct snapshot_t *s;

	if (!id)
		return 0;

	rcu_read_lock();
	s = snapshot_t(c, id);
	if (s->parent)
		id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth));
	rcu_read_unlock();

	return id;
}

static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s)
{
	unsigned i;

	for (i = 0; i < 3; i++)
		if (!s.parent) {
			if (s.skip[i])
				return false;
		} else {
			if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i])))
				return false;
		}

	return true;
}

/*
 * snapshot_tree pointer was incorrect: look up root snapshot node, make sure
 * its snapshot_tree pointer is correct (allocate new one if necessary), then
 * update this node's pointer to root node's pointer:
 */
static int snapshot_tree_ptr_repair(struct btree_trans *trans,
				    struct btree_iter *iter,
				    struct bkey_s_c k,
				    struct bch_snapshot *s)
{
	struct bch_fs *c = trans->c;
	struct btree_iter root_iter;
	struct bch_snapshot_tree s_t;
	struct bkey_s_c_snapshot root;
	struct bkey_i_snapshot *u;
	u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id;
	int ret;

	root = bch2_bkey_get_iter_typed(trans, &root_iter,
			       BTREE_ID_snapshots, POS(0, root_id),
			       BTREE_ITER_with_updates, snapshot);
	ret = bkey_err(root);
	if (ret)
		goto err;

	tree_id = le32_to_cpu(root.v->tree);

	ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) {
		u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u) ?:
			bch2_snapshot_tree_create(trans, root_id,
				bch2_snapshot_tree_oldest_subvol(c, root_id),
				&tree_id);
		if (ret)
			goto err;

		u->v.tree = cpu_to_le32(tree_id);
		if (k.k->p.offset == root_id)
			*s = u->v;
	}

	if (k.k->p.offset != root_id) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.tree = cpu_to_le32(tree_id);
		*s = u->v;
	}
err:
	bch2_trans_iter_exit(trans, &root_iter);
	return ret;
}

static int check_snapshot(struct btree_trans *trans,
			  struct btree_iter *iter,
			  struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bch_snapshot s;
	struct bch_subvolume subvol;
	struct bch_snapshot v;
	struct bkey_i_snapshot *u;
	u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
	u32 real_depth;
	struct printbuf buf = PRINTBUF;
	u32 i, id;
	int ret = 0;

	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	memset(&s, 0, sizeof(s));
	memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k)));

	id = le32_to_cpu(s.parent);
	if (id) {
		ret = bch2_snapshot_lookup(trans, id, &v);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot with nonexistent parent:\n %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		if (ret)
			goto err;

		if (le32_to_cpu(v.children[0]) != k.k->p.offset &&
		    le32_to_cpu(v.children[1]) != k.k->p.offset) {
			bch_err(c, "snapshot parent %u missing pointer to child %llu",
				id, k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	}

	for (i = 0; i < 2 && s.children[i]; i++) {
		id = le32_to_cpu(s.children[i]);

		ret = bch2_snapshot_lookup(trans, id, &v);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot node %llu has nonexistent child %u",
				k.k->p.offset, id);
		if (ret)
			goto err;

		if (le32_to_cpu(v.parent) != k.k->p.offset) {
			bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
				id, le32_to_cpu(v.parent), k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	}

	bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
		!BCH_SNAPSHOT_DELETED(&s);

	if (should_have_subvol) {
		id = le32_to_cpu(s.subvol);
		ret = bch2_subvolume_get(trans, id, false, &subvol);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot points to nonexistent subvolume:\n %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		if (ret)
			goto err;

		if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) {
			bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
				k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	} else {
		if (fsck_err_on(s.subvol,
				trans, snapshot_should_not_have_subvol,
				"snapshot should not point to subvol:\n %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
			u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
			ret = PTR_ERR_OR_ZERO(u);
			if (ret)
				goto err;

			u->v.subvol = 0;
			s = u->v;
		}
	}

	ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree));
	if (ret < 0)
		goto err;

	if (fsck_err_on(!ret,
			trans, snapshot_to_bad_snapshot_tree,
			"snapshot points to missing/incorrect tree:\n %s",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
		if (ret)
			goto err;
	}
	ret = 0;

	real_depth = bch2_snapshot_depth(c, parent_id);

	if (fsck_err_on(le32_to_cpu(s.depth) != real_depth,
			trans, snapshot_bad_depth,
			"snapshot with incorrect depth field, should be %u:\n %s",
			real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.depth = cpu_to_le32(real_depth);
		s = u->v;
	}

	ret = snapshot_skiplist_good(trans, k.k->p.offset, s);
	if (ret < 0)
		goto err;

	if (fsck_err_on(!ret,
			trans, snapshot_bad_skiplist,
			"snapshot with bad skiplist field:\n %s",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		for (i = 0; i < ARRAY_SIZE(u->v.skip); i++)
			u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id));

		bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32);
		s = u->v;
	}
	ret = 0;
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

int bch2_check_snapshots(struct bch_fs *c)
{
	/*
	 * We iterate backwards as checking/fixing the depth field requires that
	 * the parent's depth already be correct:
	 */
	int ret = bch2_trans_run(c,
		for_each_btree_key_reverse_commit(trans, iter,
				BTREE_ID_snapshots, POS_MAX,
				BTREE_ITER_prefetch, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_snapshot(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}

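/*
 * A snapshot node that's referenced but missing: recreate it, creating a
 * snapshot_tree entry as well if no existing tree has this node as its root,
 * and point it back at whichever subvolume references it.
 */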
static int check_snapshot_exists(struct btree_trans *trans, u32 id)
{
	struct bch_fs *c = trans->c;

	if (bch2_snapshot_exists(c, id))
		return 0;

	/* Do we need to reconstruct the snapshot_tree entry as well? */
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;
	u32 tree_id = 0;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN,
				     0, k, ret) {
		if (le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) {
			tree_id = k.k->p.offset;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		return ret;

	if (!tree_id) {
		ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
		if (ret)
			return ret;
	}

	struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot));
	ret = PTR_ERR_OR_ZERO(snapshot);
	if (ret)
		return ret;

	bkey_snapshot_init(&snapshot->k_i);
	snapshot->k.p = POS(0, id);
	snapshot->v.tree = cpu_to_le32(tree_id);
	snapshot->v.btime.lo = cpu_to_le64(bch2_current_time(c));

	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
				     0, k, ret) {
		if (le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) {
			snapshot->v.subvol = cpu_to_le32(k.k->p.offset);
			SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true);
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?:
		bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
				   bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0);
}

/* Figure out which snapshot nodes belong in the same tree: */
struct snapshot_tree_reconstruct {
	enum btree_id btree;
	struct bpos cur_pos;
	snapshot_id_list cur_ids;
	DARRAY(snapshot_id_list) trees;
};
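
/*
 * Reconstruction works by walking each snapshot-aware btree: all the keys at
 * one position but in different snapshots must belong to the same tree, so
 * every position yields a list of snapshot IDs, and lists that share an ID
 * are merged into a single candidate tree.
 */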

static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r)
{
	darray_for_each(r->trees, i)
		darray_exit(i);
	darray_exit(&r->trees);
	darray_exit(&r->cur_ids);
}

static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos)
{
	return r->btree == BTREE_ID_inodes
		? r->cur_pos.offset == pos.offset
		: r->cur_pos.inode == pos.inode;
}

static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r)
{
	darray_for_each(*l, i)
		if (snapshot_list_has_id(r, *i))
			return true;
	return false;
}

static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s)
{
	bool first = true;
	darray_for_each(*s, i) {
		if (!first)
			prt_char(out, ' ');
		first = false;
		prt_printf(out, "%u", *i);
	}
}

static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r)
{
	if (r->cur_ids.nr) {
		darray_for_each(r->trees, i)
			if (snapshot_id_lists_have_common(i, &r->cur_ids)) {
				int ret = snapshot_list_merge(c, i, &r->cur_ids);
				if (ret)
					return ret;
				goto out;
			}
		darray_push(&r->trees, r->cur_ids);
		darray_init(&r->cur_ids);
	}
out:
	r->cur_ids.nr = 0;
	return 0;
}

static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos)
{
	if (!same_snapshot(r, pos))
		snapshot_tree_reconstruct_next(c, r);
	r->cur_pos = pos;
	return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot);
}

int bch2_reconstruct_snapshots(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct printbuf buf = PRINTBUF;
	struct snapshot_tree_reconstruct r = {};
	int ret = 0;

	for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
		if (btree_type_has_snapshots(btree)) {
			r.btree = btree;

			ret = for_each_btree_key(trans, iter, btree, POS_MIN,
					BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({
				get_snapshot_trees(c, &r, k.k->p);
			}));
			if (ret)
				goto err;

			snapshot_tree_reconstruct_next(c, &r);
		}
	}

	darray_for_each(r.trees, t) {
		printbuf_reset(&buf);
		snapshot_id_list_to_text(&buf, t);

		darray_for_each(*t, id) {
			if (fsck_err_on(!bch2_snapshot_exists(c, *id),
					trans, snapshot_node_missing,
					"snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) {
				if (t->nr > 1) {
					bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
					ret = -BCH_ERR_fsck_repair_unimplemented;
					goto err;
				}

				ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
						check_snapshot_exists(trans, *id));
				if (ret)
					goto err;
			}
		}
	}
fsck_err:
err:
	bch2_trans_put(trans);
	snapshot_tree_reconstruct_exit(&r);
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}

int bch2_check_key_has_snapshot(struct btree_trans *trans,
				struct btree_iter *iter,
				struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (fsck_err_on(!bch2_snapshot_exists(c, k.k->p.snapshot),
			trans, bkey_in_missing_snapshot,
			"key in missing snapshot %s, delete?",
			(bch2_btree_id_to_text(&buf, iter->btree_id),
			 prt_char(&buf, ' '),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
		ret = bch2_btree_delete_at(trans, iter,
					   BTREE_UPDATE_internal_snapshot_node) ?: 1;
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

/*
 * Mark a snapshot as deleted, for future cleanup:
 */
int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
{
	struct btree_iter iter;
	struct bkey_i_snapshot *s =
		bch2_bkey_get_mut_typed(trans, &iter,
				    BTREE_ID_snapshots, POS(0, id),
				    0, snapshot);
	int ret = PTR_ERR_OR_ZERO(s);
	if (unlikely(ret)) {
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
					trans->c, "missing snapshot %u", id);
		return ret;
	}

	/* already deleted? */
	if (BCH_SNAPSHOT_DELETED(&s->v))
		goto err;

	SET_BCH_SNAPSHOT_DELETED(&s->v, true);
	SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
	s->v.subvol = 0;
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

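/* Child pointers are kept normalized: children[0] >= children[1]. */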
static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
{
	if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1]))
		swap(s->children[0], s->children[1]);
}

static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
	struct btree_iter c_iter = (struct btree_iter) { NULL };
	struct btree_iter tree_iter = (struct btree_iter) { NULL };
	struct bkey_s_c_snapshot s;
	u32 parent_id, child_id;
	unsigned i;
	int ret = 0;

	s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
				     BTREE_ITER_intent, snapshot);
	ret = bkey_err(s);
	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
				"missing snapshot %u", id);

	if (ret)
		goto err;

	BUG_ON(s.v->children[1]);

	parent_id = le32_to_cpu(s.v->parent);
	child_id = le32_to_cpu(s.v->children[0]);

	if (parent_id) {
		struct bkey_i_snapshot *parent;

		parent = bch2_bkey_get_mut_typed(trans, &p_iter,
				     BTREE_ID_snapshots, POS(0, parent_id),
				     0, snapshot);
		ret = PTR_ERR_OR_ZERO(parent);
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
					"missing snapshot %u", parent_id);
		if (unlikely(ret))
			goto err;

		/* find entry in parent->children for node being deleted */
		for (i = 0; i < 2; i++)
			if (le32_to_cpu(parent->v.children[i]) == id)
				break;

		if (bch2_fs_inconsistent_on(i == 2, c,
					"snapshot %u missing child pointer to %u",
					parent_id, id))
			goto err;

		parent->v.children[i] = cpu_to_le32(child_id);

		normalize_snapshot_child_pointers(&parent->v);
	}

	if (child_id) {
		struct bkey_i_snapshot *child;

		child = bch2_bkey_get_mut_typed(trans, &c_iter,
				     BTREE_ID_snapshots, POS(0, child_id),
				     0, snapshot);
		ret = PTR_ERR_OR_ZERO(child);
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
					"missing snapshot %u", child_id);
		if (unlikely(ret))
			goto err;

		child->v.parent = cpu_to_le32(parent_id);

		if (!child->v.parent) {
			child->v.skip[0] = 0;
			child->v.skip[1] = 0;
			child->v.skip[2] = 0;
		}
	}

	if (!parent_id) {
		/*
		 * We're deleting the root of a snapshot tree: update the
		 * snapshot_tree entry to point to the new root, or delete it if
		 * this is the last snapshot ID in this tree:
		 */
		struct bkey_i_snapshot_tree *s_t;

		BUG_ON(s.v->children[1]);

		s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
				BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)),
				0, snapshot_tree);
		ret = PTR_ERR_OR_ZERO(s_t);
		if (ret)
			goto err;

		if (s.v->children[0]) {
			s_t->v.root_snapshot = s.v->children[0];
		} else {
			s_t->k.type = KEY_TYPE_deleted;
			set_bkey_val_u64s(&s_t->k, 0);
		}
	}

	ret = bch2_btree_delete_at(trans, &iter, 0);
err:
	bch2_trans_iter_exit(trans, &tree_iter);
	bch2_trans_iter_exit(trans, &p_iter);
	bch2_trans_iter_exit(trans, &c_iter);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
			  u32 *new_snapids,
			  u32 *snapshot_subvols,
			  unsigned nr_snapids)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_i_snapshot *n;
	struct bkey_s_c k;
	unsigned i, j;
	u32 depth = bch2_snapshot_depth(c, parent);
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
			     POS_MIN, BTREE_ITER_intent);
	k = bch2_btree_iter_peek(&iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	for (i = 0; i < nr_snapids; i++) {
		k = bch2_btree_iter_prev_slot(&iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		if (!k.k || !k.k->p.offset) {
			ret = -BCH_ERR_ENOSPC_snapshot_create;
			goto err;
		}

		n = bch2_bkey_alloc(trans, &iter, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(n);
		if (ret)
			goto err;

		n->v.flags = 0;
		n->v.parent = cpu_to_le32(parent);
		n->v.subvol = cpu_to_le32(snapshot_subvols[i]);
		n->v.tree = cpu_to_le32(tree);
		n->v.depth = cpu_to_le32(depth);
		n->v.btime.lo = cpu_to_le64(bch2_current_time(c));
		n->v.btime.hi = 0;

		for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
			n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));

		bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
		SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);

		ret = __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
					   bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
		if (ret)
			goto err;

		new_snapids[i] = iter.pos.offset;
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Create new snapshot IDs as children of an existing snapshot ID:
 */
static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent,
					      u32 *new_snapids,
					      u32 *snapshot_subvols,
					      unsigned nr_snapids)
{
	struct btree_iter iter;
	struct bkey_i_snapshot *n_parent;
	int ret = 0;

	n_parent = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_snapshots, POS(0, parent),
			0, snapshot);
	ret = PTR_ERR_OR_ZERO(n_parent);
	if (unlikely(ret)) {
		if (bch2_err_matches(ret, ENOENT))
			bch_err(trans->c, "snapshot %u not found", parent);
		return ret;
	}

	if (n_parent->v.children[0] || n_parent->v.children[1]) {
		bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
		ret = -EINVAL;
		goto err;
	}

	ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree),
			     new_snapids, snapshot_subvols, nr_snapids);
	if (ret)
		goto err;

	n_parent->v.children[0] = cpu_to_le32(new_snapids[0]);
	n_parent->v.children[1] = cpu_to_le32(new_snapids[1]);
	n_parent->v.subvol = 0;
	SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Create a snapshot node that is the root of a new tree:
 */
static int bch2_snapshot_node_create_tree(struct btree_trans *trans,
					  u32 *new_snapids,
					  u32 *snapshot_subvols,
					  unsigned nr_snapids)
{
	struct bkey_i_snapshot_tree *n_tree;
	int ret;

	n_tree = __bch2_snapshot_tree_create(trans);
	ret = PTR_ERR_OR_ZERO(n_tree) ?:
		create_snapids(trans, 0, n_tree->k.p.offset,
			       new_snapids, snapshot_subvols, nr_snapids);
	if (ret)
		return ret;

	n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]);
	n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]);
	return 0;
}

int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
			      u32 *new_snapids,
			      u32 *snapshot_subvols,
			      unsigned nr_snapids)
{
	BUG_ON((parent == 0) != (nr_snapids == 1));
	BUG_ON((parent != 0) != (nr_snapids == 2));

	return parent
		? bch2_snapshot_node_create_children(trans, parent,
				new_snapids, snapshot_subvols, nr_snapids)
		: bch2_snapshot_node_create_tree(trans,
				new_snapids, snapshot_subvols, nr_snapids);

}

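/*
 * Example usage (an illustrative sketch with hypothetical variable names, not
 * copied from a real caller): snapshotting an existing subvolume allocates
 * two children of the subvolume's current snapshot, one for each side:
 *
 *	u32 new_snapids[2];
 *	u32 subvols[2] = { new_subvolid, old_subvolid };
 *
 *	ret = bch2_snapshot_node_create(trans, old_snapshot_id,
 *					new_snapids, subvols, 2);
 */
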
/*
 * If we have an unlinked inode in an internal snapshot node, and the inode
 * really has been deleted in all child snapshots, how does this get cleaned up?
 *
 * first there is the problem of how keys that have been overwritten in all
 * child snapshots get deleted (unimplemented?), but inodes may perhaps be
 * special?
 *
 * also: unlinked inode in internal snapshot appears to not be getting deleted
 * correctly if inode doesn't exist in leaf snapshots
 *
 * solution:
 *
 * for a key in an interior snapshot node that needs work to be done that
 * requires it to be mutated: iterate over all descendant leaf nodes and copy
 * that key to snapshot leaf nodes, where we can mutate it
 */

struct snapshot_interior_delete {
	u32 id;
	u32 live_child;
};
typedef DARRAY(struct snapshot_interior_delete) interior_delete_list;

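/*
 * Dead snapshot nodes come in two flavours: leaves, whose keys are simply
 * deleted, and interior nodes with exactly one live child, whose keys are
 * moved into that child; interior_delete_list records the id -> live_child
 * mapping used for the move.
 */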
static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id)
{
	darray_for_each(*l, i)
		if (i->id == id)
			return i->live_child;
	return 0;
}

static unsigned __live_child(struct snapshot_table *t, u32 id,
			     snapshot_id_list *delete_leaves,
			     interior_delete_list *delete_interior)
{
	struct snapshot_t *s = __snapshot_t(t, id);
	if (!s)
		return 0;

	for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++)
		if (s->children[i] &&
		    !snapshot_list_has_id(delete_leaves, s->children[i]) &&
		    !interior_delete_has_id(delete_interior, s->children[i]))
			return s->children[i];

	for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) {
		u32 live_child = s->children[i]
			? __live_child(t, s->children[i], delete_leaves, delete_interior)
			: 0;
		if (live_child)
			return live_child;
	}

	return 0;
}

static unsigned live_child(struct bch_fs *c, u32 id,
			   snapshot_id_list *delete_leaves,
			   interior_delete_list *delete_interior)
{
	rcu_read_lock();
	u32 ret = __live_child(rcu_dereference(c->snapshots), id,
			       delete_leaves, delete_interior);
	rcu_read_unlock();
	return ret;
}

static int delete_dead_snapshots_process_key(struct btree_trans *trans,
					     struct btree_iter *iter,
					     struct bkey_s_c k,
					     snapshot_id_list *delete_leaves,
					     interior_delete_list *delete_interior)
{
	if (snapshot_list_has_id(delete_leaves, k.k->p.snapshot))
		return bch2_btree_delete_at(trans, iter,
					    BTREE_UPDATE_internal_snapshot_node);

	u32 live_child = interior_delete_has_id(delete_interior, k.k->p.snapshot);
	if (live_child) {
		struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
		int ret = PTR_ERR_OR_ZERO(new);
		if (ret)
			return ret;

		new->k.p.snapshot = live_child;

		struct btree_iter dst_iter;
		struct bkey_s_c dst_k = bch2_bkey_get_iter(trans, &dst_iter,
							   iter->btree_id, new->k.p,
							   BTREE_ITER_all_snapshots|
							   BTREE_ITER_intent);
		ret = bkey_err(dst_k);
		if (ret)
			return ret;

		ret = (bkey_deleted(dst_k.k)
		       ? bch2_trans_update(trans, &dst_iter, new,
					   BTREE_UPDATE_internal_snapshot_node)
		       : 0) ?:
			bch2_btree_delete_at(trans, iter,
					     BTREE_UPDATE_internal_snapshot_node);
		bch2_trans_iter_exit(trans, &dst_iter);
		return ret;
	}

	return 0;
}

/*
 * For a given snapshot, if it doesn't have a subvolume that points to it, and
 * it doesn't have child snapshot nodes - it's now redundant and we can mark it
 * as deleted.
 */
static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k,
					snapshot_id_list *delete_leaves,
					interior_delete_list *delete_interior)
{
	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	struct bch_fs *c = trans->c;
	struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
	unsigned live_children = 0;

	if (BCH_SNAPSHOT_SUBVOL(s.v))
		return 0;

	for (unsigned i = 0; i < 2; i++) {
		u32 child = le32_to_cpu(s.v->children[i]);

		live_children += child &&
			!snapshot_list_has_id(delete_leaves, child);
	}

	if (live_children == 0) {
		return snapshot_list_add(c, delete_leaves, s.k->p.offset);
	} else if (live_children == 1) {
		struct snapshot_interior_delete d = {
			.id = s.k->p.offset,
			.live_child = live_child(c, s.k->p.offset, delete_leaves, delete_interior),
		};

		if (!d.live_child) {
			bch_err(c, "error finding live child of snapshot %u", d.id);
			return -EINVAL;
		}

		return darray_push(delete_interior, d);
	} else {
		return 0;
	}
}

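/*
 * Like bch2_snapshot_nth_parent(), but skips over snapshot IDs that are
 * queued for interior-node deletion:
 */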
static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
						interior_delete_list *skip)
{
	rcu_read_lock();
	while (interior_delete_has_id(skip, id))
		id = __bch2_snapshot_parent(c, id);

	while (n--) {
		do {
			id = __bch2_snapshot_parent(c, id);
		} while (interior_delete_has_id(skip, id));
	}
	rcu_read_unlock();

	return id;
}

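/*
 * When interior nodes are deleted, each surviving node's depth field shrinks
 * by the number of deleted ancestors, and any skiplist entry that referenced
 * a deleted node must be re-rolled against the new ancestor chain:
 */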
static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
					      struct btree_iter *iter, struct bkey_s_c k,
					      interior_delete_list *deleted)
{
	struct bch_fs *c = trans->c;
	u32 nr_deleted_ancestors = 0;
	struct bkey_i_snapshot *s;
	int ret;

	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	if (interior_delete_has_id(deleted, k.k->p.offset))
		return 0;

	s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot);
	ret = PTR_ERR_OR_ZERO(s);
	if (ret)
		return ret;

	darray_for_each(*deleted, i)
		nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, i->id);

	if (!nr_deleted_ancestors)
		return 0;

	le32_add_cpu(&s->v.depth, -nr_deleted_ancestors);

	if (!s->v.depth) {
		s->v.skip[0] = 0;
		s->v.skip[1] = 0;
		s->v.skip[2] = 0;
	} else {
		u32 depth = le32_to_cpu(s->v.depth);
		u32 parent = bch2_snapshot_parent(c, s->k.p.offset);

		for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) {
			u32 id = le32_to_cpu(s->v.skip[j]);

			if (interior_delete_has_id(deleted, id)) {
				id = bch2_snapshot_nth_parent_skip(c,
							parent,
							depth > 1
							? get_random_u32_below(depth - 1)
							: 0,
							deleted);
				s->v.skip[j] = cpu_to_le32(id);
			}
		}

		bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32);
	}

	return bch2_trans_update(trans, iter, &s->k_i, 0);
}

int bch2_delete_dead_snapshots(struct bch_fs *c)
{
	if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
		return 0;

	struct btree_trans *trans = bch2_trans_get(c);
	snapshot_id_list delete_leaves = {};
	interior_delete_list delete_interior = {};
	int ret = 0;

	/*
	 * For every snapshot node: If we have no live children and it's not
	 * pointed to by a subvolume, delete it:
	 */
	ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k,
		check_should_delete_snapshot(trans, k, &delete_leaves, &delete_interior));
	if (!bch2_err_matches(ret, EROFS))
		bch_err_msg(c, ret, "walking snapshots");
	if (ret)
		goto err;

	if (!delete_leaves.nr && !delete_interior.nr)
		goto err;

	{
		struct printbuf buf = PRINTBUF;
		prt_printf(&buf, "deleting leaves");
		darray_for_each(delete_leaves, i)
			prt_printf(&buf, " %u", *i);

		prt_printf(&buf, " interior");
		darray_for_each(delete_interior, i)
			prt_printf(&buf, " %u->%u", i->id, i->live_child);

		ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf));
		printbuf_exit(&buf);
		if (ret)
			goto err;
	}

	for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
		struct disk_reservation res = { 0 };

		if (!btree_type_has_snapshots(btree))
			continue;

		ret = for_each_btree_key_commit(trans, iter,
				btree, POS_MIN,
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				&res, NULL, BCH_TRANS_COMMIT_no_enospc,
			delete_dead_snapshots_process_key(trans, &iter, k,
							  &delete_leaves,
							  &delete_interior));

		bch2_disk_reservation_put(c, &res);

		if (!bch2_err_matches(ret, EROFS))
			bch_err_msg(c, ret, "deleting keys from dying snapshots");
		if (ret)
			goto err;
	}

	darray_for_each(delete_leaves, i) {
		ret = commit_do(trans, NULL, NULL, 0,
			bch2_snapshot_node_delete(trans, *i));
		if (!bch2_err_matches(ret, EROFS))
			bch_err_msg(c, ret, "deleting snapshot %u", *i);
		if (ret)
			goto err;
	}

	/*
	 * Fixing children of deleted snapshots can't be done completely
	 * atomically, if we crash between here and when we delete the interior
	 * nodes some depth fields will be off:
	 */
	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
				  BTREE_ITER_intent, k,
				  NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &delete_interior));
	if (ret)
		goto err;

	darray_for_each(delete_interior, i) {
		ret = commit_do(trans, NULL, NULL, 0,
			bch2_snapshot_node_delete(trans, i->id));
		if (!bch2_err_matches(ret, EROFS))
			bch_err_msg(c, ret, "deleting snapshot %u", i->id);
		if (ret)
			goto err;
	}
err:
	darray_exit(&delete_interior);
	darray_exit(&delete_leaves);
	bch2_trans_put(trans);
	if (!bch2_err_matches(ret, EROFS))
		bch_err_fn(c, ret);
	return ret;
}

void bch2_delete_dead_snapshots_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);

	set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name);

	bch2_delete_dead_snapshots(c);
	bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
}

void bch2_delete_dead_snapshots_async(struct bch_fs *c)
{
	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots))
		return;

	BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));

	if (!queue_work(c->write_ref_wq, &c->snapshot_delete_work))
		bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
}

int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
				       enum btree_id id,
				       struct bpos pos)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	for_each_btree_key_reverse_norestart(trans, iter, id, bpos_predecessor(pos),
					     BTREE_ITER_not_extents|
					     BTREE_ITER_all_snapshots,
					     k, ret) {
		if (!bkey_eq(pos, k.k->p))
			break;

		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
			ret = 1;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}

static bool interior_snapshot_needs_delete(struct bkey_s_c_snapshot snap)
{
	/* If there's one child, it's redundant and keys will be moved to the child */
	return !!snap.v->children[0] + !!snap.v->children[1] == 1;
}

static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
{
	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k);
	if (BCH_SNAPSHOT_DELETED(snap.v) ||
	    interior_snapshot_needs_delete(snap))
		set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags);

	return 0;
}

int bch2_snapshots_read(struct bch_fs *c)
{
	/*
	 * Initializing the is_ancestor bitmaps requires ancestors to already be
	 * initialized - so mark in reverse:
	 */
	int ret = bch2_trans_run(c,
		for_each_btree_key_reverse(trans, iter, BTREE_ID_snapshots,
				   POS_MAX, 0, k,
			__bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
			bch2_check_snapshot_needs_deletion(trans, k)));
	bch_err_fn(c, ret);

	/*
	 * It's important that we check if we need to reconstruct snapshots
	 * before going RW, so we mark that pass as required in the superblock -
	 * otherwise, we could end up deleting keys with missing snapshot nodes
	 * instead
	 */
	BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) &&
	       test_bit(BCH_FS_may_go_rw, &c->flags));

	if (bch2_err_matches(ret, EIO) ||
	    (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)))
		ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots);

	return ret;
}

void bch2_fs_snapshots_exit(struct bch_fs *c)
{
	kvfree(rcu_dereference_protected(c->snapshots, true));
}