// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_buf.h"
#include "btree_key_cache.h"
#include "btree_update.h"
#include "buckets.h"
#include "errcode.h"
#include "error.h"
#include "fs.h"
#include "snapshot.h"

#include <linux/random.h>

/*
 * Snapshot trees:
 *
 * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they
 * exist to provide a stable identifier for the whole lifetime of a snapshot
 * tree.
 */

void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
				struct bkey_s_c k)
{
	struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k);

	prt_printf(out, "subvol %u root snapshot %u",
		   le32_to_cpu(t.v->master_subvol),
		   le32_to_cpu(t.v->root_snapshot));
}

int bch2_snapshot_tree_invalid(const struct bch_fs *c, struct bkey_s_c k,
			       enum bkey_invalid_flags flags,
			       struct printbuf *err)
{
	if (bkey_gt(k.k->p, POS(0, U32_MAX)) ||
	    bkey_lt(k.k->p, POS(0, 1))) {
		prt_printf(err, "bad pos");
		return -BCH_ERR_invalid_bkey;
	}

	return 0;
}

int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
			      struct bch_snapshot_tree *s)
{
	int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
					  BTREE_ITER_WITH_UPDATES, snapshot_tree, s);

	if (bch2_err_matches(ret, ENOENT))
		ret = -BCH_ERR_ENOENT_snapshot_tree;
	return ret;
}

struct bkey_i_snapshot_tree *
__bch2_snapshot_tree_create(struct btree_trans *trans)
{
	struct btree_iter iter;
	int ret = bch2_bkey_get_empty_slot(trans, &iter,
			BTREE_ID_snapshot_trees, POS(0, U32_MAX));
	struct bkey_i_snapshot_tree *s_t;

	if (ret == -BCH_ERR_ENOSPC_btree_slot)
		ret = -BCH_ERR_ENOSPC_snapshot_tree;
	if (ret)
		return ERR_PTR(ret);

	s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree);
	ret = PTR_ERR_OR_ZERO(s_t);
	bch2_trans_iter_exit(trans, &iter);
	return ret ? ERR_PTR(ret) : s_t;
}

static int bch2_snapshot_tree_create(struct btree_trans *trans,
				u32 root_id, u32 subvol_id, u32 *tree_id)
{
	struct bkey_i_snapshot_tree *n_tree =
		__bch2_snapshot_tree_create(trans);

	if (IS_ERR(n_tree))
		return PTR_ERR(n_tree);

	n_tree->v.master_subvol	= cpu_to_le32(subvol_id);
	n_tree->v.root_snapshot	= cpu_to_le32(root_id);
	*tree_id = n_tree->k.p.offset;
	return 0;
}

/* Snapshot nodes: */

static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
{
	struct snapshot_table *t;

	rcu_read_lock();
	t = rcu_dereference(c->snapshots);

	while (id && id < ancestor)
		id = __snapshot_t(t, id)->parent;
	rcu_read_unlock();

	return id == ancestor;
}
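/*
 * Snapshot IDs are allocated so that a node's ID is always less than its
 * parent's; walking towards an ancestor therefore means walking towards
 * higher IDs. get_ancestor_below() takes one step of that walk, using the
 * skiplist (sorted ascending, so skip[2] is the longest jump) to get as
 * close to @ancestor as possible without overshooting it:
 */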
static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
{
	const struct snapshot_t *s = __snapshot_t(t, id);

	if (s->skip[2] <= ancestor)
		return s->skip[2];
	if (s->skip[1] <= ancestor)
		return s->skip[1];
	if (s->skip[0] <= ancestor)
		return s->skip[0];
	return s->parent;
}

bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
{
	struct snapshot_table *t;
	bool ret;

	EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots);

	rcu_read_lock();
	t = rcu_dereference(c->snapshots);

	while (id && id < ancestor - IS_ANCESTOR_BITMAP)
		id = get_ancestor_below(t, id, ancestor);

	if (id && id < ancestor) {
		ret = test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor);

		EBUG_ON(ret != bch2_snapshot_is_ancestor_early(c, id, ancestor));
	} else {
		ret = id == ancestor;
	}

	rcu_read_unlock();

	return ret;
}
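/*
 * The in-memory snapshot table is indexed by (U32_MAX - id): snapshot IDs are
 * allocated downwards from near U32_MAX, so this keeps the table dense.
 * Resizing allocates a larger table and publishes it with RCU, keeping the
 * lockless readers in the is_ancestor paths above safe:
 */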
static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
{
	size_t idx = U32_MAX - id;
	size_t new_size;
	struct snapshot_table *new, *old;

	new_size = max(16UL, roundup_pow_of_two(idx + 1));

	new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL);
	if (!new)
		return NULL;

	old = rcu_dereference_protected(c->snapshots, true);
	if (old)
		memcpy(new->s,
		       rcu_dereference_protected(c->snapshots, true)->s,
		       sizeof(new->s[0]) * c->snapshot_table_size);

	rcu_assign_pointer(c->snapshots, new);
	c->snapshot_table_size = new_size;
	kvfree_rcu_mightsleep(old);

	return &rcu_dereference_protected(c->snapshots, true)->s[idx];
}

static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
{
	size_t idx = U32_MAX - id;

	lockdep_assert_held(&c->snapshot_table_lock);

	if (likely(idx < c->snapshot_table_size))
		return &rcu_dereference_protected(c->snapshots, true)->s[idx];

	return __snapshot_t_mut(c, id);
}

void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
			   struct bkey_s_c k)
{
	struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);

	prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u",
		   BCH_SNAPSHOT_SUBVOL(s.v),
		   BCH_SNAPSHOT_DELETED(s.v),
		   le32_to_cpu(s.v->parent),
		   le32_to_cpu(s.v->children[0]),
		   le32_to_cpu(s.v->children[1]),
		   le32_to_cpu(s.v->subvol),
		   le32_to_cpu(s.v->tree));

	if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth))
		prt_printf(out, " depth %u skiplist %u %u %u",
			   le32_to_cpu(s.v->depth),
			   le32_to_cpu(s.v->skip[0]),
			   le32_to_cpu(s.v->skip[1]),
			   le32_to_cpu(s.v->skip[2]));
}

int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k,
			  enum bkey_invalid_flags flags,
			  struct printbuf *err)
{
	struct bkey_s_c_snapshot s;
	u32 i, id;

	if (bkey_gt(k.k->p, POS(0, U32_MAX)) ||
	    bkey_lt(k.k->p, POS(0, 1))) {
		prt_printf(err, "bad pos");
		return -BCH_ERR_invalid_bkey;
	}

	s = bkey_s_c_to_snapshot(k);

	id = le32_to_cpu(s.v->parent);
	if (id && id <= k.k->p.offset) {
		prt_printf(err, "bad parent node (%u <= %llu)",
			   id, k.k->p.offset);
		return -BCH_ERR_invalid_bkey;
	}

	if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) {
		prt_printf(err, "children not normalized");
		return -BCH_ERR_invalid_bkey;
	}

	if (s.v->children[0] &&
	    s.v->children[0] == s.v->children[1]) {
		prt_printf(err, "duplicate child nodes");
		return -BCH_ERR_invalid_bkey;
	}

	for (i = 0; i < 2; i++) {
		id = le32_to_cpu(s.v->children[i]);

		if (id >= k.k->p.offset) {
			prt_printf(err, "bad child node (%u >= %llu)",
				   id, k.k->p.offset);
			return -BCH_ERR_invalid_bkey;
		}
	}

	if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
		if (le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
		    le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2])) {
			prt_printf(err, "skiplist not normalized");
			return -BCH_ERR_invalid_bkey;
		}

		for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
			id = le32_to_cpu(s.v->skip[i]);

			if ((id && !s.v->parent) ||
			    (id && id <= k.k->p.offset)) {
				prt_printf(err, "bad skiplist node %u", id);
				return -BCH_ERR_invalid_bkey;
			}
		}
	}

	return 0;
}

static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
{
	struct snapshot_t *t = snapshot_t_mut(c, id);
	u32 parent = id;

	while ((parent = bch2_snapshot_parent_early(c, parent)) &&
	       parent - id - 1 < IS_ANCESTOR_BITMAP)
		__set_bit(parent - id - 1, t->is_ancestor);
}

static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
{
	mutex_lock(&c->snapshot_table_lock);
	__set_is_ancestor_bitmap(c, id);
	mutex_unlock(&c->snapshot_table_lock);
}
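/*
 * The trigger for KEY_TYPE_snapshot: keeps the in-memory snapshot table in
 * sync with the snapshots btree, and flags deleted nodes so that the
 * delete_dead_snapshots pass gets run:
 */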
int bch2_mark_snapshot(struct btree_trans *trans,
		       enum btree_id btree, unsigned level,
		       struct bkey_s_c old, struct bkey_s_c new,
		       unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct snapshot_t *t;
	u32 id = new.k->p.offset;
	int ret = 0;

	mutex_lock(&c->snapshot_table_lock);

	t = snapshot_t_mut(c, id);
	if (!t) {
		ret = -BCH_ERR_ENOMEM_mark_snapshot;
		goto err;
	}

	if (new.k->type == KEY_TYPE_snapshot) {
		struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);

		t->parent	= le32_to_cpu(s.v->parent);
		t->children[0]	= le32_to_cpu(s.v->children[0]);
		t->children[1]	= le32_to_cpu(s.v->children[1]);
		t->subvol	= BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
		t->tree		= le32_to_cpu(s.v->tree);

		if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) {
			t->depth	= le32_to_cpu(s.v->depth);
			t->skip[0]	= le32_to_cpu(s.v->skip[0]);
			t->skip[1]	= le32_to_cpu(s.v->skip[1]);
			t->skip[2]	= le32_to_cpu(s.v->skip[2]);
		} else {
			t->depth	= 0;
			t->skip[0]	= 0;
			t->skip[1]	= 0;
			t->skip[2]	= 0;
		}

		__set_is_ancestor_bitmap(c, id);

		if (BCH_SNAPSHOT_DELETED(s.v)) {
			set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
			c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_delete_dead_snapshots);
		}
	} else {
		memset(t, 0, sizeof(*t));
	}
err:
	mutex_unlock(&c->snapshot_table_lock);
	return ret;
}

int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
			 struct bch_snapshot *s)
{
	return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
				       BTREE_ITER_WITH_UPDATES, snapshot, s);
}

static int bch2_snapshot_live(struct btree_trans *trans, u32 id)
{
	struct bch_snapshot v;
	int ret;

	if (!id)
		return 0;

	ret = bch2_snapshot_lookup(trans, id, &v);
	if (bch2_err_matches(ret, ENOENT))
		bch_err(trans->c, "snapshot node %u not found", id);
	if (ret)
		return ret;

	return !BCH_SNAPSHOT_DELETED(&v);
}

/*
 * If @k is a snapshot with just one live child, it's part of a linear chain,
 * which we consider to be an equivalence class; after snapshot deletion
 * cleanup, there should be only a single key at any given position within an
 * equivalence class.
 *
 * This sets the equivalence class of @k to be the child's equivalence class,
 * if it's part of such a linear chain: this correctly sets equivalence classes
 * on startup if we run leaf to root (i.e. in natural key order).
 */
static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	unsigned i, nr_live = 0, live_idx = 0;
	struct bkey_s_c_snapshot snap;
	u32 id = k.k->p.offset, child[2];

	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	snap = bkey_s_c_to_snapshot(k);

	child[0] = le32_to_cpu(snap.v->children[0]);
	child[1] = le32_to_cpu(snap.v->children[1]);

	for (i = 0; i < 2; i++) {
		int ret = bch2_snapshot_live(trans, child[i]);

		if (ret < 0)
			return ret;

		if (ret)
			live_idx = i;
		nr_live += ret;
	}

	mutex_lock(&c->snapshot_table_lock);

	snapshot_t_mut(c, id)->equiv = nr_live == 1
		? snapshot_t_mut(c, child[live_idx])->equiv
		: id;

	mutex_unlock(&c->snapshot_table_lock);

	return 0;
}

/* fsck: */

static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
{
	return snapshot_t(c, id)->children[child];
}

static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id)
{
	return bch2_snapshot_child(c, id, 0);
}

static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id)
{
	return bch2_snapshot_child(c, id, 1);
}
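/*
 * bch2_snapshot_tree_next() returns the next node in depth-first preorder -
 * the first child if there is one, otherwise the right-hand sibling of the
 * nearest ancestor that has one - returning 0 when the traversal is complete.
 * bch2_snapshot_tree_oldest_subvol() uses it to find the subvolume with the
 * smallest ID in a tree, which - since subvolume IDs are allocated in
 * increasing order - should be the oldest:
 */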
static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
{
	u32 n, parent;

	n = bch2_snapshot_left_child(c, id);
	if (n)
		return n;

	while ((parent = bch2_snapshot_parent(c, id))) {
		n = bch2_snapshot_right_child(c, parent);
		if (n && n != id)
			return n;
		id = parent;
	}

	return 0;
}

static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
{
	u32 id = snapshot_root;
	u32 subvol = 0, s;

	while (id) {
		s = snapshot_t(c, id)->subvol;

		if (s && (!subvol || s < subvol))
			subvol = s;

		id = bch2_snapshot_tree_next(c, id);
	}

	return subvol;
}

static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
					    u32 snapshot_root, u32 *subvol_id)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_s_c_subvolume s;
	bool found = false;
	int ret;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
				     0, k, ret) {
		if (k.k->type != KEY_TYPE_subvolume)
			continue;

		s = bkey_s_c_to_subvolume(k);
		if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
			continue;
		if (!BCH_SUBVOLUME_SNAP(s.v)) {
			*subvol_id = s.k->p.offset;
			found = true;
			break;
		}
	}

	bch2_trans_iter_exit(trans, &iter);

	if (!ret && !found) {
		struct bkey_i_subvolume *u;

		*subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root);

		u = bch2_bkey_get_mut_typed(trans, &iter,
					    BTREE_ID_subvolumes, POS(0, *subvol_id),
					    0, subvolume);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			return ret;

		SET_BCH_SUBVOLUME_SNAP(&u->v, false);
	}

	return ret;
}

static int check_snapshot_tree(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c_snapshot_tree st;
	struct bch_snapshot s;
	struct bch_subvolume subvol;
	struct printbuf buf = PRINTBUF;
	u32 root_id;
	int ret;

	if (k.k->type != KEY_TYPE_snapshot_tree)
		return 0;

	st = bkey_s_c_to_snapshot_tree(k);
	root_id = le32_to_cpu(st.v->root_snapshot);

	ret = bch2_snapshot_lookup(trans, root_id, &s);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	if (fsck_err_on(ret ||
			root_id != bch2_snapshot_root(c, root_id) ||
			st.k->p.offset != le32_to_cpu(s.tree),
			c,
			"snapshot tree points to missing/incorrect snapshot:\n  %s",
			(bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
		ret = bch2_btree_delete_at(trans, iter, 0);
		goto err;
	}

	ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol),
				 false, 0, &subvol);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	if (fsck_err_on(ret, c,
			"snapshot tree points to missing subvolume:\n  %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
	    fsck_err_on(!bch2_snapshot_is_ancestor_early(c,
							 le32_to_cpu(subvol.snapshot),
							 root_id),
			c,
			"snapshot tree points to subvolume that does not point to snapshot in this tree:\n  %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
	    fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), c,
			"snapshot tree points to snapshot subvolume:\n  %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
		struct bkey_i_snapshot_tree *u;
		u32 subvol_id;

		ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
		if (ret)
			goto err;

		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.master_subvol = cpu_to_le32(subvol_id);
		st = snapshot_tree_i_to_s_c(u);
	}
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

/*
 * For each snapshot_tree, make sure it points to the root of a snapshot tree
 * and that the snapshot entry points back to it, or delete it.
 *
 * And, make sure it points to a subvolume within that snapshot tree, or correct
 * it to point to the oldest subvolume within that snapshot tree.
 */
int bch2_check_snapshot_trees(struct bch_fs *c)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
			BTREE_ID_snapshot_trees, POS_MIN,
			BTREE_ITER_PREFETCH, k,
			NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
		check_snapshot_tree(trans, &iter, k)));

	if (ret)
		bch_err(c, "error %i checking snapshot trees", ret);
	return ret;
}

/*
 * Look up snapshot tree for @tree_id and find root,
 * make sure @snap_id is a descendant:
 */
static int snapshot_tree_ptr_good(struct btree_trans *trans,
				  u32 snap_id, u32 tree_id)
{
	struct bch_snapshot_tree s_t;
	int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);

	if (bch2_err_matches(ret, ENOENT))
		return 0;
	if (ret)
		return ret;

	return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot));
}
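/*
 * Pick a random ancestor of @id, for use as a new skiplist entry:
 * randomizing skiplist entries (rather than maintaining explicit levels) is
 * what keeps the ancestor walk in __bch2_snapshot_is_ancestor() short:
 */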
u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id)
{
	const struct snapshot_t *s;

	if (!id)
		return 0;

	rcu_read_lock();
	s = snapshot_t(c, id);
	if (s->parent)
		id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth));
	rcu_read_unlock();

	return id;
}

static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s)
{
	unsigned i;

	for (i = 0; i < 3; i++) {
		if (!s.parent) {
			if (s.skip[i])
				return false;
		} else {
			if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i])))
				return false;
		}
	}

	return true;
}

/*
 * snapshot_tree pointer was incorrect: look up root snapshot node, make sure
 * its snapshot_tree pointer is correct (allocate new one if necessary), then
 * update this node's pointer to root node's pointer:
 */
static int snapshot_tree_ptr_repair(struct btree_trans *trans,
				    struct btree_iter *iter,
				    struct bkey_s_c k,
				    struct bch_snapshot *s)
{
	struct bch_fs *c = trans->c;
	struct btree_iter root_iter;
	struct bch_snapshot_tree s_t;
	struct bkey_s_c_snapshot root;
	struct bkey_i_snapshot *u;
	u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id;
	int ret;

	root = bch2_bkey_get_iter_typed(trans, &root_iter,
			       BTREE_ID_snapshots, POS(0, root_id),
			       BTREE_ITER_WITH_UPDATES, snapshot);
	ret = bkey_err(root);
	if (ret)
		goto err;

	tree_id = le32_to_cpu(root.v->tree);

	ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) {
		u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
		ret =   PTR_ERR_OR_ZERO(u) ?:
			bch2_snapshot_tree_create(trans, root_id,
				bch2_snapshot_tree_oldest_subvol(c, root_id),
				&tree_id);
		if (ret)
			goto err;

		u->v.tree = cpu_to_le32(tree_id);
		if (k.k->p.offset == root_id)
			*s = u->v;
	}

	if (k.k->p.offset != root_id) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.tree = cpu_to_le32(tree_id);
		*s = u->v;
	}
err:
	bch2_trans_iter_exit(trans, &root_iter);
	return ret;
}
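/*
 * check_snapshot() verifies a single snapshot node: that its parent and
 * children point back to it, that its subvolume pointer is consistent with
 * BCH_SNAPSHOT_SUBVOL, and that the snapshot_tree, depth and skiplist fields
 * are correct - repairing what it can, depth/skiplist only on filesystems new
 * enough to have those fields:
 */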
static int check_snapshot(struct btree_trans *trans,
			  struct btree_iter *iter,
			  struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bch_snapshot s;
	struct bch_subvolume subvol;
	struct bch_snapshot v;
	struct bkey_i_snapshot *u;
	u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
	u32 real_depth;
	struct printbuf buf = PRINTBUF;
	bool should_have_subvol;
	u32 i, id;
	int ret = 0;

	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	memset(&s, 0, sizeof(s));
	memcpy(&s, k.v, bkey_val_bytes(k.k));

	id = le32_to_cpu(s.parent);
	if (id) {
		ret = bch2_snapshot_lookup(trans, id, &v);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot with nonexistent parent:\n  %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		if (ret)
			goto err;

		if (le32_to_cpu(v.children[0]) != k.k->p.offset &&
		    le32_to_cpu(v.children[1]) != k.k->p.offset) {
			bch_err(c, "snapshot parent %u missing pointer to child %llu",
				id, k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	}

	for (i = 0; i < 2 && s.children[i]; i++) {
		id = le32_to_cpu(s.children[i]);

		ret = bch2_snapshot_lookup(trans, id, &v);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot node %llu has nonexistent child %u",
				k.k->p.offset, id);
		if (ret)
			goto err;

		if (le32_to_cpu(v.parent) != k.k->p.offset) {
			bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
				id, le32_to_cpu(v.parent), k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	}

	should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
		!BCH_SNAPSHOT_DELETED(&s);

	if (should_have_subvol) {
		id = le32_to_cpu(s.subvol);
		ret = bch2_subvolume_get(trans, id, false, 0, &subvol);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot points to nonexistent subvolume:\n  %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		if (ret)
			goto err;

		if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) {
			bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
				k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	} else {
		if (fsck_err_on(s.subvol, c, "snapshot should not point to subvol:\n  %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
			u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
			ret = PTR_ERR_OR_ZERO(u);
			if (ret)
				goto err;

			u->v.subvol = 0;
			s = u->v;
		}
	}

	ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree));
	if (ret < 0)
		goto err;

	if (fsck_err_on(!ret, c, "snapshot points to missing/incorrect tree:\n  %s",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
		if (ret)
			goto err;
	}
	ret = 0;

	real_depth = bch2_snapshot_depth(c, parent_id);

	if (le32_to_cpu(s.depth) != real_depth &&
	    (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists ||
	     fsck_err(c, "snapshot with incorrect depth field, should be %u:\n  %s",
		      real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.depth = cpu_to_le32(real_depth);
		s = u->v;
	}

	ret = snapshot_skiplist_good(trans, k.k->p.offset, s);
	if (ret < 0)
		goto err;

	if (!ret &&
	    (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists ||
	     fsck_err(c, "snapshot with bad skiplist field:\n  %s",
		      (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		for (i = 0; i < ARRAY_SIZE(u->v.skip); i++)
			u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id));

		bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32);
		s = u->v;
	}
	ret = 0;
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

int bch2_check_snapshots(struct bch_fs *c)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	/*
	 * We iterate backwards as checking/fixing the depth field requires that
	 * the parent's depth already be correct:
	 */
	ret = bch2_trans_run(c,
		for_each_btree_key_reverse_commit(trans, iter,
			BTREE_ID_snapshots, POS_MAX,
			BTREE_ITER_PREFETCH, k,
			NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
		check_snapshot(trans, &iter, k)));
	if (ret)
		bch_err_fn(c, ret);
	return ret;
}

/*
 * Mark a snapshot as deleted, for future cleanup:
 */
int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
{
	struct btree_iter iter;
	struct bkey_i_snapshot *s;
	int ret = 0;

	s = bch2_bkey_get_mut_typed(trans, &iter,
				    BTREE_ID_snapshots, POS(0, id),
				    0, snapshot);
	ret = PTR_ERR_OR_ZERO(s);
	if (unlikely(ret)) {
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
					trans->c, "missing snapshot %u", id);
		return ret;
	}

	/* already deleted? */
	if (BCH_SNAPSHOT_DELETED(&s->v))
		goto err;

	SET_BCH_SNAPSHOT_DELETED(&s->v, true);
	SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
	s->v.subvol = 0;
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
{
	if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1]))
		swap(s->children[0], s->children[1]);
}
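/*
 * Delete a snapshot node with at most one live child: splice it out by
 * pointing parent and child at each other, and update (or delete) the
 * snapshot_tree entry if we're deleting the root of a tree:
 */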
static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
	struct btree_iter c_iter = (struct btree_iter) { NULL };
	struct btree_iter tree_iter = (struct btree_iter) { NULL };
	struct bkey_s_c_snapshot s;
	u32 parent_id, child_id;
	unsigned i;
	int ret = 0;

	s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
				     BTREE_ITER_INTENT, snapshot);
	ret = bkey_err(s);
	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
				"missing snapshot %u", id);

	if (ret)
		goto err;

	BUG_ON(s.v->children[1]);

	parent_id = le32_to_cpu(s.v->parent);
	child_id = le32_to_cpu(s.v->children[0]);

	if (parent_id) {
		struct bkey_i_snapshot *parent;

		parent = bch2_bkey_get_mut_typed(trans, &p_iter,
				     BTREE_ID_snapshots, POS(0, parent_id),
				     0, snapshot);
		ret = PTR_ERR_OR_ZERO(parent);
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
					"missing snapshot %u", parent_id);
		if (unlikely(ret))
			goto err;

		/* find entry in parent->children for node being deleted */
		for (i = 0; i < 2; i++)
			if (le32_to_cpu(parent->v.children[i]) == id)
				break;

		if (bch2_fs_inconsistent_on(i == 2, c,
					    "snapshot %u missing child pointer to %u",
					    parent_id, id))
			goto err;

		parent->v.children[i] = cpu_to_le32(child_id);

		normalize_snapshot_child_pointers(&parent->v);
	}

	if (child_id) {
		struct bkey_i_snapshot *child;

		child = bch2_bkey_get_mut_typed(trans, &c_iter,
				    BTREE_ID_snapshots, POS(0, child_id),
				    0, snapshot);
		ret = PTR_ERR_OR_ZERO(child);
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
					"missing snapshot %u", child_id);
		if (unlikely(ret))
			goto err;

		child->v.parent = cpu_to_le32(parent_id);

		if (!child->v.parent) {
			child->v.skip[0] = 0;
			child->v.skip[1] = 0;
			child->v.skip[2] = 0;
		}
	}

	if (!parent_id) {
		/*
		 * We're deleting the root of a snapshot tree: update the
		 * snapshot_tree entry to point to the new root, or delete it if
		 * this is the last snapshot ID in this tree:
		 */
		struct bkey_i_snapshot_tree *s_t;

		BUG_ON(s.v->children[1]);

		s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
				BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)),
				0, snapshot_tree);
		ret = PTR_ERR_OR_ZERO(s_t);
		if (ret)
			goto err;

		if (s.v->children[0]) {
			s_t->v.root_snapshot = s.v->children[0];
		} else {
			s_t->k.type = KEY_TYPE_deleted;
			set_bkey_val_u64s(&s_t->k, 0);
		}
	}

	ret = bch2_btree_delete_at(trans, &iter, 0);
err:
	bch2_trans_iter_exit(trans, &tree_iter);
	bch2_trans_iter_exit(trans, &p_iter);
	bch2_trans_iter_exit(trans, &c_iter);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
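/*
 * Allocate @nr_snapids new snapshot nodes, as children of @parent: new IDs
 * are taken from just below the lowest ID currently in use, which preserves
 * the invariant that a node's ID is always less than its parent's:
 */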
static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
			  u32 *new_snapids,
			  u32 *snapshot_subvols,
			  unsigned nr_snapids)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_i_snapshot *n;
	struct bkey_s_c k;
	unsigned i, j;
	u32 depth = bch2_snapshot_depth(c, parent);
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
			     POS_MIN, BTREE_ITER_INTENT);
	k = bch2_btree_iter_peek(&iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	for (i = 0; i < nr_snapids; i++) {
		k = bch2_btree_iter_prev_slot(&iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		if (!k.k || !k.k->p.offset) {
			ret = -BCH_ERR_ENOSPC_snapshot_create;
			goto err;
		}

		n = bch2_bkey_alloc(trans, &iter, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(n);
		if (ret)
			goto err;

		n->v.flags	= 0;
		n->v.parent	= cpu_to_le32(parent);
		n->v.subvol	= cpu_to_le32(snapshot_subvols[i]);
		n->v.tree	= cpu_to_le32(tree);
		n->v.depth	= cpu_to_le32(depth);

		for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
			n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));

		bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
		SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);

		ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
					 bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
		if (ret)
			goto err;

		new_snapids[i]	= iter.pos.offset;

		mutex_lock(&c->snapshot_table_lock);
		snapshot_t_mut(c, new_snapids[i])->equiv = new_snapids[i];
		mutex_unlock(&c->snapshot_table_lock);
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Create new snapshot IDs as children of an existing snapshot ID:
 */
static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent,
					      u32 *new_snapids,
					      u32 *snapshot_subvols,
					      unsigned nr_snapids)
{
	struct btree_iter iter;
	struct bkey_i_snapshot *n_parent;
	int ret = 0;

	n_parent = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_snapshots, POS(0, parent),
			0, snapshot);
	ret = PTR_ERR_OR_ZERO(n_parent);
	if (unlikely(ret)) {
		if (bch2_err_matches(ret, ENOENT))
			bch_err(trans->c, "snapshot %u not found", parent);
		return ret;
	}

	if (n_parent->v.children[0] || n_parent->v.children[1]) {
		bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
		ret = -EINVAL;
		goto err;
	}

	ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree),
			     new_snapids, snapshot_subvols, nr_snapids);
	if (ret)
		goto err;

	n_parent->v.children[0] = cpu_to_le32(new_snapids[0]);
	n_parent->v.children[1] = cpu_to_le32(new_snapids[1]);
	n_parent->v.subvol = 0;
	SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Create a snapshot node that is the root of a new tree:
 */
static int bch2_snapshot_node_create_tree(struct btree_trans *trans,
					  u32 *new_snapids,
					  u32 *snapshot_subvols,
					  unsigned nr_snapids)
{
	struct bkey_i_snapshot_tree *n_tree;
	int ret;

	n_tree = __bch2_snapshot_tree_create(trans);
	ret =   PTR_ERR_OR_ZERO(n_tree) ?:
		create_snapids(trans, 0, n_tree->k.p.offset,
			       new_snapids, snapshot_subvols, nr_snapids);
	if (ret)
		return ret;

	n_tree->v.master_subvol	= cpu_to_le32(snapshot_subvols[0]);
	n_tree->v.root_snapshot	= cpu_to_le32(new_snapids[0]);
	return 0;
}

int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
			      u32 *new_snapids,
			      u32 *snapshot_subvols,
			      unsigned nr_snapids)
{
	BUG_ON((parent == 0) != (nr_snapids == 1));
	BUG_ON((parent != 0) != (nr_snapids == 2));

	return parent
		? bch2_snapshot_node_create_children(trans, parent,
				new_snapids, snapshot_subvols, nr_snapids)
		: bch2_snapshot_node_create_tree(trans,
				new_snapids, snapshot_subvols, nr_snapids);
}

/*
 * If we have an unlinked inode in an internal snapshot node, and the inode
 * really has been deleted in all child snapshots, how does this get cleaned up?
 *
 * first there is the problem of how keys that have been overwritten in all
 * child snapshots get deleted (unimplemented?), but inodes may perhaps be
 * special?
 *
 * also: unlinked inode in internal snapshot appears to not be getting deleted
 * correctly if inode doesn't exist in leaf snapshots
 *
 * solution:
 *
 * for a key in an interior snapshot node that needs work to be done that
 * requires it to be mutated: iterate over all descendant leaf nodes and copy
 * that key to snapshot leaf nodes, where we can mutate it
 */
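/*
 * One step of the key-level cleanup for snapshot deletion: delete keys in
 * snapshots being deleted, and keys made redundant because a key in the same
 * equivalence class has already been seen at this position:
 */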
static int snapshot_delete_key(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct bkey_s_c k,
			       snapshot_id_list *deleted,
			       snapshot_id_list *equiv_seen,
			       struct bpos *last_pos)
{
	struct bch_fs *c = trans->c;
	u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);

	if (!bkey_eq(k.k->p, *last_pos))
		equiv_seen->nr = 0;
	*last_pos = k.k->p;

	if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
	    snapshot_list_has_id(equiv_seen, equiv)) {
		return bch2_btree_delete_at(trans, iter,
					    BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
	} else {
		return snapshot_list_add(c, equiv_seen, equiv);
	}
}

static int move_key_to_correct_snapshot(struct btree_trans *trans,
					struct btree_iter *iter,
					struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);

	/*
	 * When we have a linear chain of snapshot nodes, we consider
	 * those to form an equivalence class: we're going to collapse
	 * them all down to a single node, and keep the leaf-most node -
	 * which has the same id as the equivalence class id.
	 *
	 * If there are multiple keys in different snapshots at the same
	 * position, we're only going to keep the one in the newest
	 * snapshot - the rest have been overwritten and are redundant,
	 * and for the key we're going to keep we need to move it to the
	 * equivalence class ID if it's not there already.
	 */
	if (equiv != k.k->p.snapshot) {
		struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
		struct btree_iter new_iter;
		int ret;

		ret = PTR_ERR_OR_ZERO(new);
		if (ret)
			return ret;

		new->k.p.snapshot = equiv;

		bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p,
				     BTREE_ITER_ALL_SNAPSHOTS|
				     BTREE_ITER_CACHED|
				     BTREE_ITER_INTENT);

		ret =   bch2_btree_iter_traverse(&new_iter) ?:
			bch2_trans_update(trans, &new_iter, new,
					BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
			bch2_btree_delete_at(trans, iter,
					BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
		bch2_trans_iter_exit(trans, &new_iter);
		if (ret)
			return ret;
	}

	return 0;
}

/*
 * For a given snapshot, if it doesn't have a subvolume that points to it, and
 * it doesn't have child snapshot nodes - it's now redundant and we can mark it
 * as deleted.
 */
static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter,
					  struct bkey_s_c k)
{
	struct bkey_s_c_snapshot snap;
	u32 children[2];
	int ret;

	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	snap = bkey_s_c_to_snapshot(k);
	if (BCH_SNAPSHOT_DELETED(snap.v) ||
	    BCH_SNAPSHOT_SUBVOL(snap.v))
		return 0;

	children[0] = le32_to_cpu(snap.v->children[0]);
	children[1] = le32_to_cpu(snap.v->children[1]);

	ret =   bch2_snapshot_live(trans, children[0]) ?:
		bch2_snapshot_live(trans, children[1]);
	if (ret < 0)
		return ret;

	if (!ret)
		return bch2_snapshot_node_set_deleted(trans, k.k->p.offset);
	return 0;
}
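/*
 * Walk @n parents up from @id, pretending the IDs in @skip don't exist: used
 * below for regenerating skiplist entries that point into the set of
 * snapshots being deleted:
 */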
static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
						snapshot_id_list *skip)
{
	rcu_read_lock();
	while (snapshot_list_has_id(skip, id))
		id = __bch2_snapshot_parent(c, id);

	while (n--) {
		do {
			id = __bch2_snapshot_parent(c, id);
		} while (snapshot_list_has_id(skip, id));
	}
	rcu_read_unlock();

	return id;
}

static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
					      struct btree_iter *iter, struct bkey_s_c k,
					      snapshot_id_list *deleted)
{
	struct bch_fs *c = trans->c;
	u32 nr_deleted_ancestors = 0;
	struct bkey_i_snapshot *s;
	u32 *i;
	int ret;

	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	if (snapshot_list_has_id(deleted, k.k->p.offset))
		return 0;

	s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot);
	ret = PTR_ERR_OR_ZERO(s);
	if (ret)
		return ret;

	darray_for_each(*deleted, i)
		nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, *i);

	if (!nr_deleted_ancestors)
		return 0;

	le32_add_cpu(&s->v.depth, -nr_deleted_ancestors);

	if (!s->v.depth) {
		s->v.skip[0] = 0;
		s->v.skip[1] = 0;
		s->v.skip[2] = 0;
	} else {
		u32 depth = le32_to_cpu(s->v.depth);
		u32 parent = bch2_snapshot_parent(c, s->k.p.offset);

		for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) {
			u32 id = le32_to_cpu(s->v.skip[j]);

			if (snapshot_list_has_id(deleted, id)) {
				id = depth > 1
					? bch2_snapshot_nth_parent_skip(c,
							parent,
							get_random_u32_below(depth - 1),
							deleted)
					: parent;
				s->v.skip[j] = cpu_to_le32(id);
			}
		}

		bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32);
	}

	return bch2_trans_update(trans, iter, &s->k_i, 0);
}
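/*
 * The top level snapshot deletion path:
 *
 *  1. Mark snapshot nodes that are redundant deleted
 *  2. Recompute equivalence classes
 *  3. For each btree with snapshots, delete keys in dead snapshots and move
 *     surviving keys to their equivalence class ID
 *  4. Fix the depth and skiplist fields of nodes whose ancestors are going
 *     away
 *  5. Delete the snapshot nodes themselves
 */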
int bch2_delete_dead_snapshots(struct bch_fs *c)
{
	struct btree_trans *trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_s_c_snapshot snap;
	snapshot_id_list deleted = { 0 };
	snapshot_id_list deleted_interior = { 0 };
	u32 *i, id;
	int ret = 0;

	if (!test_bit(BCH_FS_STARTED, &c->flags)) {
		ret = bch2_fs_read_write_early(c);
		if (ret) {
			bch_err_msg(c, ret, "deleting dead snapshots: error going rw");
			return ret;
		}
	}

	trans = bch2_trans_get(c);

	/*
	 * For every snapshot node: If we have no live children and it's not
	 * pointed to by a subvolume, delete it:
	 */
	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots,
			POS_MIN, 0, k,
			NULL, NULL, 0,
		bch2_delete_redundant_snapshot(trans, &iter, k));
	if (ret) {
		bch_err_msg(c, ret, "deleting redundant snapshots");
		goto err;
	}

	ret = for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
				  POS_MIN, 0, k,
		bch2_snapshot_set_equiv(trans, k));
	if (ret) {
		bch_err_msg(c, ret, "in bch2_snapshot_set_equiv");
		goto err;
	}

	for_each_btree_key(trans, iter, BTREE_ID_snapshots,
			   POS_MIN, 0, k, ret) {
		if (k.k->type != KEY_TYPE_snapshot)
			continue;

		snap = bkey_s_c_to_snapshot(k);
		if (BCH_SNAPSHOT_DELETED(snap.v)) {
			ret = snapshot_list_add(c, &deleted, k.k->p.offset);
			if (ret)
				break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret) {
		bch_err_msg(c, ret, "walking snapshots");
		goto err;
	}

	for (id = 0; id < BTREE_ID_NR; id++) {
		struct bpos last_pos = POS_MIN;
		snapshot_id_list equiv_seen = { 0 };
		struct disk_reservation res = { 0 };

		if (!btree_type_has_snapshots(id))
			continue;

		ret = for_each_btree_key_commit(trans, iter,
				id, POS_MIN,
				BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
				&res, NULL, BTREE_INSERT_NOFAIL,
			snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?:
		      for_each_btree_key_commit(trans, iter,
				id, POS_MIN,
				BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
				&res, NULL, BTREE_INSERT_NOFAIL,
			move_key_to_correct_snapshot(trans, &iter, k));

		bch2_disk_reservation_put(c, &res);
		darray_exit(&equiv_seen);

		if (ret) {
			bch_err_msg(c, ret, "deleting keys from dying snapshots");
			goto err;
		}
	}

	down_write(&c->snapshot_create_lock);

	for_each_btree_key(trans, iter, BTREE_ID_snapshots,
			   POS_MIN, 0, k, ret) {
		u32 snapshot = k.k->p.offset;
		u32 equiv = bch2_snapshot_equiv(c, snapshot);

		if (equiv != snapshot)
			snapshot_list_add(c, &deleted_interior, snapshot);
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		goto err_create_lock;

	/*
	 * Fixing children of deleted snapshots can't be done completely
	 * atomically; if we crash between here and when we delete the interior
	 * nodes, some depth fields will be off:
	 */
	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
				  BTREE_ITER_INTENT, k,
				  NULL, NULL, BTREE_INSERT_NOFAIL,
		bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior));
	if (ret)
		goto err_create_lock;

	darray_for_each(deleted, i) {
		ret = commit_do(trans, NULL, NULL, 0,
			bch2_snapshot_node_delete(trans, *i));
		if (ret) {
			bch_err_msg(c, ret, "deleting snapshot %u", *i);
			goto err_create_lock;
		}
	}

	darray_for_each(deleted_interior, i) {
		ret = commit_do(trans, NULL, NULL, 0,
			bch2_snapshot_node_delete(trans, *i));
		if (ret) {
			bch_err_msg(c, ret, "deleting snapshot %u", *i);
			goto err_create_lock;
		}
	}

	clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
err_create_lock:
	up_write(&c->snapshot_create_lock);
err:
	darray_exit(&deleted_interior);
	darray_exit(&deleted);
	bch2_trans_put(trans);
	if (ret)
		bch_err_fn(c, ret);
	return ret;
}

void bch2_delete_dead_snapshots_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);

	if (test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags))
		bch2_delete_dead_snapshots(c);
	bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
}

void bch2_delete_dead_snapshots_async(struct bch_fs *c)
{
	if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) &&
	    !queue_work(c->write_ref_wq, &c->snapshot_delete_work))
		bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
}

int bch2_delete_dead_snapshots_hook(struct btree_trans *trans,
				    struct btree_trans_commit_hook *h)
{
	struct bch_fs *c = trans->c;

	set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);

	if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_delete_dead_snapshots)
		return 0;

	bch2_delete_dead_snapshots_async(c);
	return 0;
}
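/*
 * Scan backwards for versions of this key in other snapshots: returns 1 if
 * @pos has been overwritten in a descendant snapshot, i.e. if there's a key
 * at the same position in a snapshot that @pos.snapshot is an ancestor of:
 */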
int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
				       enum btree_id id,
				       struct bpos pos)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	bch2_trans_iter_init(trans, &iter, id, pos,
			     BTREE_ITER_NOT_EXTENTS|
			     BTREE_ITER_ALL_SNAPSHOTS);
	while (1) {
		k = bch2_btree_iter_prev(&iter);
		ret = bkey_err(k);
		if (ret)
			break;

		if (!k.k)
			break;

		if (!bkey_eq(pos, k.k->p))
			break;

		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
			ret = 1;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}

static u32 bch2_snapshot_smallest_child(struct bch_fs *c, u32 id)
{
	const struct snapshot_t *s = snapshot_t(c, id);

	return s->children[1] ?: s->children[0];
}

static u32 bch2_snapshot_smallest_descendent(struct bch_fs *c, u32 id)
{
	u32 child;

	while ((child = bch2_snapshot_smallest_child(c, id)))
		id = child;
	return id;
}

static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans,
					       enum btree_id btree,
					       struct bkey_s_c interior_k,
					       u32 leaf_id, struct bpos *new_min_pos)
{
	struct btree_iter iter;
	struct bpos pos = interior_k.k->p;
	struct bkey_s_c k;
	struct bkey_i *new;
	int ret;

	pos.snapshot = leaf_id;

	bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT);
	k = bch2_btree_iter_peek_slot(&iter);
	ret = bkey_err(k);
	if (ret)
		goto out;

	/* key already overwritten in this snapshot? */
	if (k.k->p.snapshot != interior_k.k->p.snapshot)
		goto out;

	if (bpos_eq(*new_min_pos, POS_MIN)) {
		*new_min_pos = k.k->p;
		new_min_pos->snapshot = leaf_id;
	}

	new = bch2_bkey_make_mut_noupdate(trans, interior_k);
	ret = PTR_ERR_OR_ZERO(new);
	if (ret)
		goto out;

	new->k.p.snapshot = leaf_id;
	ret = bch2_trans_update(trans, &iter, new, 0);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
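/*
 * Copy a key in an interior snapshot node to every leaf descendant where it's
 * still visible, so that callers can then safely mutate the interior key -
 * the "solution" described in the comment block above:
 */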
int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans,
					  enum btree_id btree,
					  struct bkey_s_c k,
					  struct bpos *new_min_pos)
{
	struct bch_fs *c = trans->c;
	struct bkey_buf sk;
	u32 restart_count = trans->restart_count;
	int ret = 0;

	bch2_bkey_buf_init(&sk);
	bch2_bkey_buf_reassemble(&sk, c, k);
	k = bkey_i_to_s_c(sk.k);

	*new_min_pos = POS_MIN;

	for (u32 id = bch2_snapshot_smallest_descendent(c, k.k->p.snapshot);
	     id < k.k->p.snapshot;
	     id++) {
		if (!bch2_snapshot_is_ancestor(c, id, k.k->p.snapshot) ||
		    !bch2_snapshot_is_leaf(c, id))
			continue;
again:
		ret =   btree_trans_too_many_iters(trans) ?:
			bch2_propagate_key_to_snapshot_leaf(trans, btree, k, id, new_min_pos) ?:
			bch2_trans_commit(trans, NULL, NULL, 0);
		if (ret && bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
			bch2_trans_begin(trans);
			goto again;
		}

		if (ret)
			break;
	}

	bch2_bkey_buf_exit(&sk, c);

	return ret ?: trans_was_restarted(trans, restart_count);
}
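/*
 * Initial load of the snapshots btree: build the in-memory snapshot table and
 * equivalence classes, then fill in the is_ancestor bitmaps in a second pass,
 * once every node's ancestors are guaranteed to be in the table:
 */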
int bch2_snapshots_read(struct bch_fs *c)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	ret = bch2_trans_run(c,
		for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
				    POS_MIN, 0, k,
			bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
			bch2_snapshot_set_equiv(trans, k)) ?:
		for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
				    POS_MIN, 0, k,
			(set_is_ancestor_bitmap(c, k.k->p.offset), 0)));
	if (ret)
		bch_err_fn(c, ret);
	return ret;
}

void bch2_fs_snapshots_exit(struct bch_fs *c)
{
	kfree(rcu_dereference_protected(c->snapshots, true));
}