// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_key_cache.h"
#include "btree_update.h"
#include "buckets.h"
#include "errcode.h"
#include "error.h"
#include "fs.h"
#include "recovery_passes.h"
#include "snapshot.h"

#include <linux/random.h>

/*
 * Snapshot trees:
 *
 * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they
 * exist to provide a stable identifier for the whole lifetime of a snapshot
 * tree.
 */

void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
				struct bkey_s_c k)
{
	struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k);

	prt_printf(out, "subvol %u root snapshot %u",
		   le32_to_cpu(t.v->master_subvol),
		   le32_to_cpu(t.v->root_snapshot));
}

int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k,
				struct bkey_validate_context from)
{
	int ret = 0;

	bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
			 bkey_lt(k.k->p, POS(0, 1)),
			 c, snapshot_tree_pos_bad,
			 "bad pos");
fsck_err:
	return ret;
}

int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
			      struct bch_snapshot_tree *s)
{
	int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
					  BTREE_ITER_with_updates, snapshot_tree, s);

	if (bch2_err_matches(ret, ENOENT))
		ret = -BCH_ERR_ENOENT_snapshot_tree;
	return ret;
}

struct bkey_i_snapshot_tree *
__bch2_snapshot_tree_create(struct btree_trans *trans)
{
	struct btree_iter iter;
	int ret = bch2_bkey_get_empty_slot(trans, &iter,
					   BTREE_ID_snapshot_trees, POS(0, U32_MAX));
	struct bkey_i_snapshot_tree *s_t;

	if (ret == -BCH_ERR_ENOSPC_btree_slot)
		ret = -BCH_ERR_ENOSPC_snapshot_tree;
	if (ret)
		return ERR_PTR(ret);

	s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree);
	ret = PTR_ERR_OR_ZERO(s_t);
	bch2_trans_iter_exit(trans, &iter);
	return ret ? ERR_PTR(ret) : s_t;
}

static int bch2_snapshot_tree_create(struct btree_trans *trans,
				     u32 root_id, u32 subvol_id, u32 *tree_id)
{
	struct bkey_i_snapshot_tree *n_tree =
		__bch2_snapshot_tree_create(trans);

	if (IS_ERR(n_tree))
		return PTR_ERR(n_tree);

	n_tree->v.master_subvol	= cpu_to_le32(subvol_id);
	n_tree->v.root_snapshot	= cpu_to_le32(root_id);
	*tree_id = n_tree->k.p.offset;
	return 0;
}

/* Snapshot nodes: */

static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor)
{
	while (id && id < ancestor) {
		const struct snapshot_t *s = __snapshot_t(t, id);
		id = s ? s->parent : 0;
	}
	return id == ancestor;
}
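/*
 * Example: snapshot IDs are allocated downwards from U32_MAX, so a node's
 * parent always has a numerically greater ID.  With the parent chain
 * 10 -> 25 -> 100, asking whether 100 is an ancestor of 10 walks 10, 25,
 * 100 and returns true; asking about 50 stops once the walk reaches
 * 100 (>= 50) and returns false.  This linear walk is the fallback used
 * while snapshot metadata is still being checked.
 */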
static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
{
	rcu_read_lock();
	bool ret = __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
	rcu_read_unlock();

	return ret;
}

static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
{
	const struct snapshot_t *s = __snapshot_t(t, id);
	if (!s)
		return 0;

	if (s->skip[2] <= ancestor)
		return s->skip[2];
	if (s->skip[1] <= ancestor)
		return s->skip[1];
	if (s->skip[0] <= ancestor)
		return s->skip[0];
	return s->parent;
}

static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor)
{
	const struct snapshot_t *s = __snapshot_t(t, id);
	if (!s)
		return false;

	return test_bit(ancestor - id - 1, s->is_ancestor);
}

bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
{
	bool ret;

	rcu_read_lock();
	struct snapshot_table *t = rcu_dereference(c->snapshots);

	if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) {
		ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor);
		goto out;
	}

	if (likely(ancestor >= IS_ANCESTOR_BITMAP))
		while (id && id < ancestor - IS_ANCESTOR_BITMAP)
			id = get_ancestor_below(t, id, ancestor);

	ret = id && id < ancestor
		? test_ancestor_bitmap(t, id, ancestor)
		: id == ancestor;

	EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
out:
	rcu_read_unlock();

	return ret;
}

static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
{
	size_t idx = U32_MAX - id;
	struct snapshot_table *new, *old;

	size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1));
	size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]);

	if (unlikely(new_bytes > INT_MAX))
		return NULL;

	new = kvzalloc(new_bytes, GFP_KERNEL);
	if (!new)
		return NULL;

	new->nr = new_size;

	old = rcu_dereference_protected(c->snapshots, true);
	if (old)
		memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr);

	rcu_assign_pointer(c->snapshots, new);
	kvfree_rcu(old, rcu);

	return &rcu_dereference_protected(c->snapshots,
				lockdep_is_held(&c->snapshot_table_lock))->s[idx];
}

static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
{
	size_t idx = U32_MAX - id;
	struct snapshot_table *table =
		rcu_dereference_protected(c->snapshots,
				lockdep_is_held(&c->snapshot_table_lock));

	lockdep_assert_held(&c->snapshot_table_lock);

	if (likely(table && idx < table->nr))
		return &table->s[idx];

	return __snapshot_t_mut(c, id);
}
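/*
 * Example: because IDs count down from U32_MAX, indexing by (U32_MAX - id)
 * keeps live entries packed at the low end of the table: id U32_MAX maps
 * to s[0], U32_MAX - 1 to s[1], and so on.  Growing the table never blocks
 * readers - a bigger copy is swapped in with rcu_assign_pointer() and the
 * old one freed with kvfree_rcu(), so lookups under rcu_read_lock() always
 * see a consistent array.
 */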
void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
			   struct bkey_s_c k)
{
	struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);

	prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u",
		   BCH_SNAPSHOT_SUBVOL(s.v),
		   BCH_SNAPSHOT_DELETED(s.v),
		   le32_to_cpu(s.v->parent),
		   le32_to_cpu(s.v->children[0]),
		   le32_to_cpu(s.v->children[1]),
		   le32_to_cpu(s.v->subvol),
		   le32_to_cpu(s.v->tree));

	if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth))
		prt_printf(out, " depth %u skiplist %u %u %u",
			   le32_to_cpu(s.v->depth),
			   le32_to_cpu(s.v->skip[0]),
			   le32_to_cpu(s.v->skip[1]),
			   le32_to_cpu(s.v->skip[2]));
}

int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k,
			   struct bkey_validate_context from)
{
	struct bkey_s_c_snapshot s;
	u32 i, id;
	int ret = 0;

	bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
			 bkey_lt(k.k->p, POS(0, 1)),
			 c, snapshot_pos_bad,
			 "bad pos");

	s = bkey_s_c_to_snapshot(k);

	id = le32_to_cpu(s.v->parent);
	bkey_fsck_err_on(id && id <= k.k->p.offset,
			 c, snapshot_parent_bad,
			 "bad parent node (%u <= %llu)",
			 id, k.k->p.offset);

	bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]),
			 c, snapshot_children_not_normalized,
			 "children not normalized");

	bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1],
			 c, snapshot_child_duplicate,
			 "duplicate child nodes");

	for (i = 0; i < 2; i++) {
		id = le32_to_cpu(s.v->children[i]);

		bkey_fsck_err_on(id >= k.k->p.offset,
				 c, snapshot_child_bad,
				 "bad child node (%u >= %llu)",
				 id, k.k->p.offset);
	}

	if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
		bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
				 le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]),
				 c, snapshot_skiplist_not_normalized,
				 "skiplist not normalized");

		for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
			id = le32_to_cpu(s.v->skip[i]);

			bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent),
					 c, snapshot_skiplist_bad,
					 "bad skiplist node %u", id);
		}
	}
fsck_err:
	return ret;
}
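/*
 * Example of a snapshot key that passes validation: pos offset 100,
 * parent 200 (parents are strictly greater), children 50, 30 (descending
 * order, no duplicates, both less than 100), skiplist 200, 300, 400
 * (ascending order, each entry either zero or no less than the parent).
 */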
static int __bch2_mark_snapshot(struct btree_trans *trans,
				enum btree_id btree, unsigned level,
				struct bkey_s_c old, struct bkey_s_c new,
				enum btree_iter_update_trigger_flags flags)
{
	struct bch_fs *c = trans->c;
	struct snapshot_t *t;
	u32 id = new.k->p.offset;
	int ret = 0;

	mutex_lock(&c->snapshot_table_lock);

	t = snapshot_t_mut(c, id);
	if (!t) {
		ret = -BCH_ERR_ENOMEM_mark_snapshot;
		goto err;
	}

	if (new.k->type == KEY_TYPE_snapshot) {
		struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);

		t->live		= true;
		t->parent	= le32_to_cpu(s.v->parent);
		t->children[0]	= le32_to_cpu(s.v->children[0]);
		t->children[1]	= le32_to_cpu(s.v->children[1]);
		t->subvol	= BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
		t->tree		= le32_to_cpu(s.v->tree);

		if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) {
			t->depth	= le32_to_cpu(s.v->depth);
			t->skip[0]	= le32_to_cpu(s.v->skip[0]);
			t->skip[1]	= le32_to_cpu(s.v->skip[1]);
			t->skip[2]	= le32_to_cpu(s.v->skip[2]);
		} else {
			t->depth	= 0;
			t->skip[0]	= 0;
			t->skip[1]	= 0;
			t->skip[2]	= 0;
		}

		u32 parent = id;

		while ((parent = bch2_snapshot_parent_early(c, parent)) &&
		       parent - id - 1 < IS_ANCESTOR_BITMAP)
			__set_bit(parent - id - 1, t->is_ancestor);

		if (BCH_SNAPSHOT_DELETED(s.v)) {
			set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
			if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots)
				bch2_delete_dead_snapshots_async(c);
		}
	} else {
		memset(t, 0, sizeof(*t));
	}
err:
	mutex_unlock(&c->snapshot_table_lock);
	return ret;
}

int bch2_mark_snapshot(struct btree_trans *trans,
		       enum btree_id btree, unsigned level,
		       struct bkey_s_c old, struct bkey_s new,
		       enum btree_iter_update_trigger_flags flags)
{
	return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags);
}
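/*
 * Example: when node 90 is marked and its ancestors are 100 and 150, bits
 * 100 - 90 - 1 = 9 and 150 - 90 - 1 = 59 are set in t->is_ancestor, so a
 * later ancestor query against a nearby ancestor is a single test_bit().
 * Ancestors further away than IS_ANCESTOR_BITMAP are first approached via
 * the skiplist in get_ancestor_below().
 */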
int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
			 struct bch_snapshot *s)
{
	return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
				       BTREE_ITER_with_updates, snapshot, s);
}

/* fsck: */

static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
{
	return snapshot_t(c, id)->children[child];
}

static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id)
{
	return bch2_snapshot_child(c, id, 0);
}

static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id)
{
	return bch2_snapshot_child(c, id, 1);
}

static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
{
	u32 n, parent;

	n = bch2_snapshot_left_child(c, id);
	if (n)
		return n;

	while ((parent = bch2_snapshot_parent(c, id))) {
		n = bch2_snapshot_right_child(c, parent);
		if (n && n != id)
			return n;
		id = parent;
	}

	return 0;
}

u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
{
	u32 id = snapshot_root;
	u32 subvol = 0, s;

	rcu_read_lock();
	while (id && bch2_snapshot_exists(c, id)) {
		s = snapshot_t(c, id)->subvol;

		if (s && (!subvol || s < subvol))
			subvol = s;

		id = bch2_snapshot_tree_next(c, id);
	}
	rcu_read_unlock();

	return subvol;
}

static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
					    u32 snapshot_root, u32 *subvol_id)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	bool found = false;
	int ret;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
				     0, k, ret) {
		if (k.k->type != KEY_TYPE_subvolume)
			continue;

		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
		if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
			continue;
		if (!BCH_SUBVOLUME_SNAP(s.v)) {
			*subvol_id = s.k->p.offset;
			found = true;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	if (!ret && !found) {
		struct bkey_i_subvolume *u;

		*subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root);

		u = bch2_bkey_get_mut_typed(trans, &iter,
					    BTREE_ID_subvolumes, POS(0, *subvol_id),
					    0, subvolume);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			return ret;

		SET_BCH_SUBVOLUME_SNAP(&u->v, false);
	}

	return ret;
}
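/*
 * Example: when a tree's master subvolume needs repair,
 * bch2_snapshot_tree_master_subvol() above prefers any non-snapshot
 * subvolume whose snapshot lies inside the tree; failing that it falls
 * back to the tree's oldest subvolume (lowest subvolume ID) and clears
 * its SNAP flag so it can act as the master subvolume.
 */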
static int check_snapshot_tree(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c_snapshot_tree st;
	struct bch_snapshot s;
	struct bch_subvolume subvol;
	struct printbuf buf = PRINTBUF;
	struct btree_iter snapshot_iter = {};
	u32 root_id;
	int ret;

	if (k.k->type != KEY_TYPE_snapshot_tree)
		return 0;

	st = bkey_s_c_to_snapshot_tree(k);
	root_id = le32_to_cpu(st.v->root_snapshot);

	struct bkey_s_c_snapshot snapshot_k =
		bch2_bkey_get_iter_typed(trans, &snapshot_iter, BTREE_ID_snapshots,
					 POS(0, root_id), 0, snapshot);
	ret = bkey_err(snapshot_k);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	if (!ret)
		bkey_val_copy(&s, snapshot_k);

	if (fsck_err_on(ret ||
			root_id != bch2_snapshot_root(c, root_id) ||
			st.k->p.offset != le32_to_cpu(s.tree),
			trans, snapshot_tree_to_missing_snapshot,
			"snapshot tree points to missing/incorrect snapshot:\n%s",
			(bch2_bkey_val_to_text(&buf, c, st.s_c),
			 prt_newline(&buf),
			 ret
			 ? prt_printf(&buf, "(%s)", bch2_err_str(ret))
			 : bch2_bkey_val_to_text(&buf, c, snapshot_k.s_c),
			 buf.buf))) {
		ret = bch2_btree_delete_at(trans, iter, 0);
		goto err;
	}

	if (!st.v->master_subvol)
		goto out;

	ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), false, &subvol);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	if (fsck_err_on(ret,
			trans, snapshot_tree_to_missing_subvol,
			"snapshot tree points to missing subvolume:\n%s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
	    fsck_err_on(!bch2_snapshot_is_ancestor(c,
						   le32_to_cpu(subvol.snapshot),
						   root_id),
			trans, snapshot_tree_to_wrong_subvol,
			"snapshot tree points to subvolume that does not point to snapshot in this tree:\n%s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
	    fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol),
			trans, snapshot_tree_to_snapshot_subvol,
			"snapshot tree points to snapshot subvolume:\n%s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
		struct bkey_i_snapshot_tree *u;
		u32 subvol_id;

		ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
		bch_err_fn(c, ret);

		if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */
			ret = 0;
			goto err;
		}

		if (ret)
			goto err;

		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.master_subvol = cpu_to_le32(subvol_id);
		st = snapshot_tree_i_to_s_c(u);
	}
out:
err:
fsck_err:
	bch2_trans_iter_exit(trans, &snapshot_iter);
	printbuf_exit(&buf);
	return ret;
}

/*
 * For each snapshot_tree, make sure it points to the root of a snapshot tree
 * and that snapshot entry points back to it, or delete it.
 *
 * And, make sure it points to a subvolume within that snapshot tree, or correct
 * it to point to the oldest subvolume within that snapshot tree.
 */
int bch2_check_snapshot_trees(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
			BTREE_ID_snapshot_trees, POS_MIN,
			BTREE_ITER_prefetch, k,
			NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		check_snapshot_tree(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Look up snapshot tree for @tree_id and find root,
 * make sure @snap_id is a descendant:
 */
static int snapshot_tree_ptr_good(struct btree_trans *trans,
				  u32 snap_id, u32 tree_id)
{
	struct bch_snapshot_tree s_t;
	int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);

	if (bch2_err_matches(ret, ENOENT))
		return 0;
	if (ret)
		return ret;

	return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot));
}

u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id)
{
	const struct snapshot_t *s;

	if (!id)
		return 0;

	rcu_read_lock();
	s = snapshot_t(c, id);
	if (s->parent)
		id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth));
	rcu_read_unlock();

	return id;
}
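/*
 * Rough intuition for the skiplist: each new node stores three skip
 * pointers, each picked as a uniformly random ancestor of the parent via
 * get_random_u32_below() - in the spirit of a classic skip list, the
 * random long jumps keep the expected ancestor-walk length small without
 * storing per-node level arrays (an informal analogy, not a proven bound).
 */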
static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s)
{
	unsigned i;

	for (i = 0; i < 3; i++)
		if (!s.parent) {
			if (s.skip[i])
				return false;
		} else {
			if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i])))
				return false;
		}

	return true;
}

/*
 * snapshot_tree pointer was incorrect: look up root snapshot node, make sure
 * its snapshot_tree pointer is correct (allocate new one if necessary), then
 * update this node's pointer to root node's pointer:
 */
static int snapshot_tree_ptr_repair(struct btree_trans *trans,
				    struct btree_iter *iter,
				    struct bkey_s_c k,
				    struct bch_snapshot *s)
{
	struct bch_fs *c = trans->c;
	struct btree_iter root_iter;
	struct bch_snapshot_tree s_t;
	struct bkey_s_c_snapshot root;
	struct bkey_i_snapshot *u;
	u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id;
	int ret;

	root = bch2_bkey_get_iter_typed(trans, &root_iter,
					BTREE_ID_snapshots, POS(0, root_id),
					BTREE_ITER_with_updates, snapshot);
	ret = bkey_err(root);
	if (ret)
		goto err;

	tree_id = le32_to_cpu(root.v->tree);

	ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) {
		u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
		ret =   PTR_ERR_OR_ZERO(u) ?:
			bch2_snapshot_tree_create(trans, root_id,
				bch2_snapshot_tree_oldest_subvol(c, root_id),
				&tree_id);
		if (ret)
			goto err;

		u->v.tree = cpu_to_le32(tree_id);
		if (k.k->p.offset == root_id)
			*s = u->v;
	}

	if (k.k->p.offset != root_id) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.tree = cpu_to_le32(tree_id);
		*s = u->v;
	}
err:
	bch2_trans_iter_exit(trans, &root_iter);
	return ret;
}

static int check_snapshot(struct btree_trans *trans,
			  struct btree_iter *iter,
			  struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bch_snapshot s;
	struct bch_subvolume subvol;
	struct bch_snapshot v;
	struct bkey_i_snapshot *u;
	u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
	u32 real_depth;
	struct printbuf buf = PRINTBUF;
	u32 i, id;
	int ret = 0;

	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	memset(&s, 0, sizeof(s));
	memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k)));

	id = le32_to_cpu(s.parent);
	if (id) {
		ret = bch2_snapshot_lookup(trans, id, &v);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot with nonexistent parent:\n %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		if (ret)
			goto err;

		if (le32_to_cpu(v.children[0]) != k.k->p.offset &&
		    le32_to_cpu(v.children[1]) != k.k->p.offset) {
			bch_err(c, "snapshot parent %u missing pointer to child %llu",
				id, k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	}

	for (i = 0; i < 2 && s.children[i]; i++) {
		id = le32_to_cpu(s.children[i]);

		ret = bch2_snapshot_lookup(trans, id, &v);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot node %llu has nonexistent child %u",
				k.k->p.offset, id);
		if (ret)
			goto err;

		if (le32_to_cpu(v.parent) != k.k->p.offset) {
			bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
				id, le32_to_cpu(v.parent), k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	}

	bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
		!BCH_SNAPSHOT_DELETED(&s);

	if (should_have_subvol) {
		id = le32_to_cpu(s.subvol);
		ret = bch2_subvolume_get(trans, id, false, &subvol);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot points to nonexistent subvolume:\n %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		if (ret)
			goto err;

		if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) {
			bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
				k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	} else {
		if (fsck_err_on(s.subvol,
				trans, snapshot_should_not_have_subvol,
				"snapshot should not point to subvol:\n%s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
			u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
			ret = PTR_ERR_OR_ZERO(u);
			if (ret)
				goto err;

			u->v.subvol = 0;
			s = u->v;
		}
	}

	ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree));
	if (ret < 0)
		goto err;

	if (fsck_err_on(!ret,
			trans, snapshot_to_bad_snapshot_tree,
			"snapshot points to missing/incorrect tree:\n%s",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
		if (ret)
			goto err;
	}
	ret = 0;

	real_depth = bch2_snapshot_depth(c, parent_id);

	if (fsck_err_on(le32_to_cpu(s.depth) != real_depth,
			trans, snapshot_bad_depth,
			"snapshot with incorrect depth field, should be %u:\n%s",
			real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.depth = cpu_to_le32(real_depth);
		s = u->v;
	}

	ret = snapshot_skiplist_good(trans, k.k->p.offset, s);
	if (ret < 0)
		goto err;

	if (fsck_err_on(!ret,
			trans, snapshot_bad_skiplist,
			"snapshot with bad skiplist field:\n%s",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		for (i = 0; i < ARRAY_SIZE(u->v.skip); i++)
			u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id));

		bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32);
		s = u->v;
	}
	ret = 0;
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}
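/*
 * Example: depth is expected to be one more than the parent's depth (and
 * zero at a tree root).  If a node under a parent at depth 3 records
 * depth 7, check_snapshot() above rewrites it to 4; iterating the btree
 * in reverse (see bch2_check_snapshots() below) guarantees the parent was
 * checked first, since parents always have greater IDs.
 */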
int bch2_check_snapshots(struct bch_fs *c)
{
	/*
	 * We iterate backwards as checking/fixing the depth field requires that
	 * the parent's depth already be correct:
	 */
	int ret = bch2_trans_run(c,
		for_each_btree_key_reverse_commit(trans, iter,
			BTREE_ID_snapshots, POS_MAX,
			BTREE_ITER_prefetch, k,
			NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		check_snapshot(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}

static int check_snapshot_exists(struct btree_trans *trans, u32 id)
{
	struct bch_fs *c = trans->c;

	/* Do we need to reconstruct the snapshot_tree entry as well? */
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;
	u32 tree_id = 0;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN,
				     0, k, ret) {
		if (le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) {
			tree_id = k.k->p.offset;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		return ret;

	if (!tree_id) {
		ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
		if (ret)
			return ret;
	}

	struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot));
	ret = PTR_ERR_OR_ZERO(snapshot);
	if (ret)
		return ret;

	bkey_snapshot_init(&snapshot->k_i);
	snapshot->k.p		= POS(0, id);
	snapshot->v.tree	= cpu_to_le32(tree_id);
	snapshot->v.btime.lo	= cpu_to_le64(bch2_current_time(c));

	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
				     0, k, ret) {
		if (le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) {
			snapshot->v.subvol = cpu_to_le32(k.k->p.offset);
			SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true);
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	return  bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?:
		bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
				   bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0);
}
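/*
 * Example: if keys still reference snapshot 42 but its snapshot node is
 * gone, check_snapshot_exists() above recreates it - reusing a surviving
 * snapshot_tree whose root is 42, or creating a fresh tree otherwise -
 * then writes a new snapshot key (btime set to the current time) and
 * points it at any subvolume that still references 42.
 */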
/* Figure out which snapshot nodes belong in the same tree: */
struct snapshot_tree_reconstruct {
	enum btree_id			btree;
	struct bpos			cur_pos;
	snapshot_id_list		cur_ids;
	DARRAY(snapshot_id_list)	trees;
};

static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r)
{
	darray_for_each(r->trees, i)
		darray_exit(i);
	darray_exit(&r->trees);
	darray_exit(&r->cur_ids);
}

static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos)
{
	return r->btree == BTREE_ID_inodes
		? r->cur_pos.offset == pos.offset
		: r->cur_pos.inode == pos.inode;
}

static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r)
{
	darray_for_each(*l, i)
		if (snapshot_list_has_id(r, *i))
			return true;
	return false;
}

static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s)
{
	bool first = true;
	darray_for_each(*s, i) {
		if (!first)
			prt_char(out, ' ');
		first = false;
		prt_printf(out, "%u", *i);
	}
}

static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r)
{
	if (r->cur_ids.nr) {
		darray_for_each(r->trees, i)
			if (snapshot_id_lists_have_common(i, &r->cur_ids)) {
				int ret = snapshot_list_merge(c, i, &r->cur_ids);
				if (ret)
					return ret;
				goto out;
			}
		darray_push(&r->trees, r->cur_ids);
		darray_init(&r->cur_ids);
	}
out:
	r->cur_ids.nr = 0;
	return 0;
}

static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos)
{
	if (!same_snapshot(r, pos))
		snapshot_tree_reconstruct_next(c, r);
	r->cur_pos = pos;
	return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot);
}
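/*
 * Example: scanning a btree, keys for one inode might be seen in
 * snapshots {5, 8} and keys for another inode in {8, 12}.  Because the
 * two lists share ID 8, they are merged into a single candidate tree
 * {5, 8, 12}; fully disjoint lists remain separate trees.  cur_ids
 * accumulates IDs for the current position and is flushed into r->trees
 * whenever the scan moves to a new inode/extent position.
 */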
int bch2_reconstruct_snapshots(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct printbuf buf = PRINTBUF;
	struct snapshot_tree_reconstruct r = {};
	int ret = 0;

	for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
		if (btree_type_has_snapshots(btree)) {
			r.btree = btree;

			ret = for_each_btree_key(trans, iter, btree, POS_MIN,
					BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({
				get_snapshot_trees(c, &r, k.k->p);
			}));
			if (ret)
				goto err;

			snapshot_tree_reconstruct_next(c, &r);
		}
	}

	darray_for_each(r.trees, t) {
		printbuf_reset(&buf);
		snapshot_id_list_to_text(&buf, t);

		darray_for_each(*t, id) {
			if (fsck_err_on(!bch2_snapshot_exists(c, *id),
					trans, snapshot_node_missing,
					"snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) {
				if (t->nr > 1) {
					bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
					ret = -BCH_ERR_fsck_repair_unimplemented;
					goto err;
				}

				ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
						check_snapshot_exists(trans, *id));
				if (ret)
					goto err;
			}
		}
	}
fsck_err:
err:
	bch2_trans_put(trans);
	snapshot_tree_reconstruct_exit(&r);
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}

int bch2_check_key_has_snapshot(struct btree_trans *trans,
				struct btree_iter *iter,
				struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (fsck_err_on(!bch2_snapshot_exists(c, k.k->p.snapshot),
			trans, bkey_in_missing_snapshot,
			"key in missing snapshot %s, delete?",
			(bch2_btree_id_to_text(&buf, iter->btree_id),
			 prt_char(&buf, ' '),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
		ret = bch2_btree_delete_at(trans, iter,
					   BTREE_UPDATE_internal_snapshot_node) ?: 1;
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

/*
 * Mark a snapshot as deleted, for future cleanup:
 */
int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
{
	struct btree_iter iter;
	struct bkey_i_snapshot *s =
		bch2_bkey_get_mut_typed(trans, &iter,
					BTREE_ID_snapshots, POS(0, id),
					0, snapshot);
	int ret = PTR_ERR_OR_ZERO(s);
	if (unlikely(ret)) {
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
					trans->c, "missing snapshot %u", id);
		return ret;
	}

	/* already deleted? */
	if (BCH_SNAPSHOT_DELETED(&s->v))
		goto err;

	SET_BCH_SNAPSHOT_DELETED(&s->v, true);
	SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
	s->v.subvol = 0;
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
{
	if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1]))
		swap(s->children[0], s->children[1]);
}
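/*
 * Example: deleting a node N with parent P and single child C splices N
 * out of the tree:
 *
 *	P		P
 *	|		|
 *	N	->	C
 *	|
 *	C
 *
 * P's child pointer to N is replaced with C and C's parent becomes P; if
 * N was the root, the snapshot_tree entry is repointed at C instead, or
 * deleted outright when N had no children left.
 */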
static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter, p_iter = {};
	struct btree_iter c_iter = {};
	struct btree_iter tree_iter = {};
	struct bkey_s_c_snapshot s;
	u32 parent_id, child_id;
	unsigned i;
	int ret = 0;

	s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
				     BTREE_ITER_intent, snapshot);
	ret = bkey_err(s);
	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
				"missing snapshot %u", id);

	if (ret)
		goto err;

	BUG_ON(s.v->children[1]);

	parent_id = le32_to_cpu(s.v->parent);
	child_id = le32_to_cpu(s.v->children[0]);

	if (parent_id) {
		struct bkey_i_snapshot *parent;

		parent = bch2_bkey_get_mut_typed(trans, &p_iter,
					BTREE_ID_snapshots, POS(0, parent_id),
					0, snapshot);
		ret = PTR_ERR_OR_ZERO(parent);
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
					"missing snapshot %u", parent_id);
		if (unlikely(ret))
			goto err;

		/* find entry in parent->children for node being deleted */
		for (i = 0; i < 2; i++)
			if (le32_to_cpu(parent->v.children[i]) == id)
				break;

		if (bch2_fs_inconsistent_on(i == 2, c,
					"snapshot %u missing child pointer to %u",
					parent_id, id))
			goto err;

		parent->v.children[i] = cpu_to_le32(child_id);

		normalize_snapshot_child_pointers(&parent->v);
	}

	if (child_id) {
		struct bkey_i_snapshot *child;

		child = bch2_bkey_get_mut_typed(trans, &c_iter,
					BTREE_ID_snapshots, POS(0, child_id),
					0, snapshot);
		ret = PTR_ERR_OR_ZERO(child);
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
					"missing snapshot %u", child_id);
		if (unlikely(ret))
			goto err;

		child->v.parent = cpu_to_le32(parent_id);

		if (!child->v.parent) {
			child->v.skip[0] = 0;
			child->v.skip[1] = 0;
			child->v.skip[2] = 0;
		}
	}

	if (!parent_id) {
		/*
		 * We're deleting the root of a snapshot tree: update the
		 * snapshot_tree entry to point to the new root, or delete it if
		 * this is the last snapshot ID in this tree:
		 */
		struct bkey_i_snapshot_tree *s_t;

		BUG_ON(s.v->children[1]);

		s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
				BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)),
				0, snapshot_tree);
		ret = PTR_ERR_OR_ZERO(s_t);
		if (ret)
			goto err;

		if (s.v->children[0]) {
			s_t->v.root_snapshot = s.v->children[0];
		} else {
			s_t->k.type = KEY_TYPE_deleted;
			set_bkey_val_u64s(&s_t->k, 0);
		}
	}

	ret = bch2_btree_delete_at(trans, &iter, 0);
err:
	bch2_trans_iter_exit(trans, &tree_iter);
	bch2_trans_iter_exit(trans, &p_iter);
	bch2_trans_iter_exit(trans, &c_iter);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
			  u32 *new_snapids,
			  u32 *snapshot_subvols,
			  unsigned nr_snapids)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_i_snapshot *n;
	struct bkey_s_c k;
	unsigned i, j;
	u32 depth = bch2_snapshot_depth(c, parent);
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
			     POS_MIN, BTREE_ITER_intent);
	k = bch2_btree_iter_peek(trans, &iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	for (i = 0; i < nr_snapids; i++) {
		k = bch2_btree_iter_prev_slot(trans, &iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		if (!k.k || !k.k->p.offset) {
			ret = -BCH_ERR_ENOSPC_snapshot_create;
			goto err;
		}

		n = bch2_bkey_alloc(trans, &iter, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(n);
		if (ret)
			goto err;

		n->v.flags	= 0;
		n->v.parent	= cpu_to_le32(parent);
		n->v.subvol	= cpu_to_le32(snapshot_subvols[i]);
		n->v.tree	= cpu_to_le32(tree);
		n->v.depth	= cpu_to_le32(depth);
		n->v.btime.lo	= cpu_to_le64(bch2_current_time(c));
		n->v.btime.hi	= 0;

		for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
			n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));

		bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
		SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);

		ret = __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
					   bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
		if (ret)
			goto err;

		new_snapids[i]	= iter.pos.offset;
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Create new snapshot IDs as children of an existing snapshot ID:
 */
static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent,
					      u32 *new_snapids,
					      u32 *snapshot_subvols,
					      unsigned nr_snapids)
{
	struct btree_iter iter;
	struct bkey_i_snapshot *n_parent;
	int ret = 0;

	n_parent = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_snapshots, POS(0, parent),
			0, snapshot);
	ret = PTR_ERR_OR_ZERO(n_parent);
	if (unlikely(ret)) {
		if (bch2_err_matches(ret, ENOENT))
			bch_err(trans->c, "snapshot %u not found", parent);
		return ret;
	}

	if (n_parent->v.children[0] || n_parent->v.children[1]) {
		bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
		ret = -EINVAL;
		goto err;
	}

	ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree),
			     new_snapids, snapshot_subvols, nr_snapids);
	if (ret)
		goto err;

	n_parent->v.children[0] = cpu_to_le32(new_snapids[0]);
	n_parent->v.children[1] = cpu_to_le32(new_snapids[1]);
	n_parent->v.subvol = 0;
	SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
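/*
 * Example: adding children to node N allocates two fresh leaf IDs below N
 * (IDs grow downwards, so both are numerically smaller), then turns N
 * interior: children[] is filled in, and N's subvol pointer is cleared
 * with BCH_SNAPSHOT_SUBVOL unset, since only leaf nodes may be referenced
 * by subvolumes.  Which subvolume each new leaf serves is up to the
 * caller, via snapshot_subvols[].
 */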
/*
 * Create a snapshot node that is the root of a new tree:
 */
static int bch2_snapshot_node_create_tree(struct btree_trans *trans,
					  u32 *new_snapids,
					  u32 *snapshot_subvols,
					  unsigned nr_snapids)
{
	struct bkey_i_snapshot_tree *n_tree;
	int ret;

	n_tree = __bch2_snapshot_tree_create(trans);
	ret =   PTR_ERR_OR_ZERO(n_tree) ?:
		create_snapids(trans, 0, n_tree->k.p.offset,
			       new_snapids, snapshot_subvols, nr_snapids);
	if (ret)
		return ret;

	n_tree->v.master_subvol	= cpu_to_le32(snapshot_subvols[0]);
	n_tree->v.root_snapshot	= cpu_to_le32(new_snapids[0]);
	return 0;
}

int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
			      u32 *new_snapids,
			      u32 *snapshot_subvols,
			      unsigned nr_snapids)
{
	BUG_ON((parent == 0) != (nr_snapids == 1));
	BUG_ON((parent != 0) != (nr_snapids == 2));

	return parent
		? bch2_snapshot_node_create_children(trans, parent,
				new_snapids, snapshot_subvols, nr_snapids)
		: bch2_snapshot_node_create_tree(trans,
				new_snapids, snapshot_subvols, nr_snapids);

}

/*
 * If we have an unlinked inode in an internal snapshot node, and the inode
 * really has been deleted in all child snapshots, how does this get cleaned up?
 *
 * first there is the problem of how keys that have been overwritten in all
 * child snapshots get deleted (unimplemented?), but inodes may perhaps be
 * special?
 *
 * also: unlinked inode in internal snapshot appears to not be getting deleted
 * correctly if inode doesn't exist in leaf snapshots
 *
 * solution:
 *
 * for a key in an interior snapshot node that needs work to be done that
 * requires it to be mutated: iterate over all descendant leaf nodes and copy
 * that key to snapshot leaf nodes, where we can mutate it
 */

struct snapshot_interior_delete {
	u32	id;
	u32	live_child;
};
typedef DARRAY(struct snapshot_interior_delete) interior_delete_list;

static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id)
{
	darray_for_each(*l, i)
		if (i->id == id)
			return i->live_child;
	return 0;
}
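/*
 * Example: when deleting interior node 7 whose children are 3 (also being
 * deleted) and 5 (live), __live_child() below skips 3 and returns 5, so
 * keys visible only through snapshot 7 get moved to 5 rather than lost.
 * If every direct child is dying too, the search recurses until it finds
 * a surviving descendant.
 */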
static unsigned __live_child(struct snapshot_table *t, u32 id,
			     snapshot_id_list *delete_leaves,
			     interior_delete_list *delete_interior)
{
	struct snapshot_t *s = __snapshot_t(t, id);
	if (!s)
		return 0;

	for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++)
		if (s->children[i] &&
		    !snapshot_list_has_id(delete_leaves, s->children[i]) &&
		    !interior_delete_has_id(delete_interior, s->children[i]))
			return s->children[i];

	for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) {
		u32 live_child = s->children[i]
			? __live_child(t, s->children[i], delete_leaves, delete_interior)
			: 0;
		if (live_child)
			return live_child;
	}

	return 0;
}

static unsigned live_child(struct bch_fs *c, u32 id,
			   snapshot_id_list *delete_leaves,
			   interior_delete_list *delete_interior)
{
	rcu_read_lock();
	u32 ret = __live_child(rcu_dereference(c->snapshots), id,
			       delete_leaves, delete_interior);
	rcu_read_unlock();
	return ret;
}

static int delete_dead_snapshots_process_key(struct btree_trans *trans,
					     struct btree_iter *iter,
					     struct bkey_s_c k,
					     snapshot_id_list *delete_leaves,
					     interior_delete_list *delete_interior)
{
	if (snapshot_list_has_id(delete_leaves, k.k->p.snapshot))
		return bch2_btree_delete_at(trans, iter,
					    BTREE_UPDATE_internal_snapshot_node);

	u32 live_child = interior_delete_has_id(delete_interior, k.k->p.snapshot);
	if (live_child) {
		struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
		int ret = PTR_ERR_OR_ZERO(new);
		if (ret)
			return ret;

		new->k.p.snapshot = live_child;

		struct btree_iter dst_iter;
		struct bkey_s_c dst_k = bch2_bkey_get_iter(trans, &dst_iter,
							   iter->btree_id, new->k.p,
							   BTREE_ITER_all_snapshots|
							   BTREE_ITER_intent);
		ret = bkey_err(dst_k);
		if (ret)
			return ret;

		ret =   (bkey_deleted(dst_k.k)
			 ? bch2_trans_update(trans, &dst_iter, new,
					     BTREE_UPDATE_internal_snapshot_node)
			 : 0) ?:
			bch2_btree_delete_at(trans, iter,
					     BTREE_UPDATE_internal_snapshot_node);
		bch2_trans_iter_exit(trans, &dst_iter);
		return ret;
	}

	return 0;
}
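/*
 * Example: while sweeping a btree, a key at snapshot 3 (a deleted leaf)
 * is simply deleted; a key at snapshot 7 (a deleted interior node with
 * live child 5) is rewritten to snapshot 5 - unless snapshot 5 already
 * has its own version of that key, in which case the interior key is
 * just dropped, since the child's version overrides it anyway.
 */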
/*
 * For a given snapshot, if it doesn't have a subvolume that points to it, and
 * it doesn't have child snapshot nodes - it's now redundant and we can mark it
 * as deleted.
 */
static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k,
					snapshot_id_list *delete_leaves,
					interior_delete_list *delete_interior)
{
	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	struct bch_fs *c = trans->c;
	struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
	unsigned live_children = 0;

	if (BCH_SNAPSHOT_SUBVOL(s.v))
		return 0;

	for (unsigned i = 0; i < 2; i++) {
		u32 child = le32_to_cpu(s.v->children[i]);

		live_children += child &&
			!snapshot_list_has_id(delete_leaves, child);
	}

	if (live_children == 0) {
		return snapshot_list_add(c, delete_leaves, s.k->p.offset);
	} else if (live_children == 1) {
		struct snapshot_interior_delete d = {
			.id		= s.k->p.offset,
			.live_child	= live_child(c, s.k->p.offset, delete_leaves, delete_interior),
		};

		if (!d.live_child) {
			bch_err(c, "error finding live child of snapshot %u", d.id);
			return -EINVAL;
		}

		return darray_push(delete_interior, d);
	} else {
		return 0;
	}
}

static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
						interior_delete_list *skip)
{
	rcu_read_lock();
	while (interior_delete_has_id(skip, id))
		id = __bch2_snapshot_parent(c, id);

	while (n--) {
		do {
			id = __bch2_snapshot_parent(c, id);
		} while (interior_delete_has_id(skip, id));
	}
	rcu_read_unlock();

	return id;
}
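/*
 * Example: with the parent chain 10 -> 20 -> 30 -> 40 where 20 and 30 are
 * in the skip list, bch2_snapshot_nth_parent_skip(c, 10, 1, skip) steps
 * through 20 and 30 and yields 40: every parent step is repeated until it
 * lands on a node that isn't being deleted.
 */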
static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
					      struct btree_iter *iter, struct bkey_s_c k,
					      interior_delete_list *deleted)
{
	struct bch_fs *c = trans->c;
	u32 nr_deleted_ancestors = 0;
	struct bkey_i_snapshot *s;
	int ret;

	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	if (interior_delete_has_id(deleted, k.k->p.offset))
		return 0;

	s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot);
	ret = PTR_ERR_OR_ZERO(s);
	if (ret)
		return ret;

	darray_for_each(*deleted, i)
		nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, i->id);

	if (!nr_deleted_ancestors)
		return 0;

	le32_add_cpu(&s->v.depth, -nr_deleted_ancestors);

	if (!s->v.depth) {
		s->v.skip[0] = 0;
		s->v.skip[1] = 0;
		s->v.skip[2] = 0;
	} else {
		u32 depth = le32_to_cpu(s->v.depth);
		u32 parent = bch2_snapshot_parent(c, s->k.p.offset);

		for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) {
			u32 id = le32_to_cpu(s->v.skip[j]);

			if (interior_delete_has_id(deleted, id)) {
				id = bch2_snapshot_nth_parent_skip(c,
							parent,
							depth > 1
							? get_random_u32_below(depth - 1)
							: 0,
							deleted);
				s->v.skip[j] = cpu_to_le32(id);
			}
		}

		bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32);
	}

	return bch2_trans_update(trans, iter, &s->k_i, 0);
}

int bch2_delete_dead_snapshots(struct bch_fs *c)
{
	if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
		return 0;

	struct btree_trans *trans = bch2_trans_get(c);
	snapshot_id_list delete_leaves = {};
	interior_delete_list delete_interior = {};
	int ret = 0;

	/*
	 * For every snapshot node: If we have no live children and it's not
	 * pointed to by a subvolume, delete it:
	 */
	ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k,
		check_should_delete_snapshot(trans, k, &delete_leaves, &delete_interior));
	if (!bch2_err_matches(ret, EROFS))
		bch_err_msg(c, ret, "walking snapshots");
	if (ret)
		goto err;

	if (!delete_leaves.nr && !delete_interior.nr)
		goto err;

	{
		struct printbuf buf = PRINTBUF;
		prt_printf(&buf, "deleting leaves");
		darray_for_each(delete_leaves, i)
			prt_printf(&buf, " %u", *i);

		prt_printf(&buf, " interior");
		darray_for_each(delete_interior, i)
			prt_printf(&buf, " %u->%u", i->id, i->live_child);

		ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf));
		printbuf_exit(&buf);
		if (ret)
			goto err;
	}

	for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
		struct disk_reservation res = { 0 };

		if (!btree_type_has_snapshots(btree))
			continue;

		ret = for_each_btree_key_commit(trans, iter,
				btree, POS_MIN,
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				&res, NULL, BCH_TRANS_COMMIT_no_enospc,
			delete_dead_snapshots_process_key(trans, &iter, k,
							  &delete_leaves,
							  &delete_interior));

		bch2_disk_reservation_put(c, &res);

		if (!bch2_err_matches(ret, EROFS))
			bch_err_msg(c, ret, "deleting keys from dying snapshots");
		if (ret)
			goto err;
	}

	darray_for_each(delete_leaves, i) {
		ret = commit_do(trans, NULL, NULL, 0,
			bch2_snapshot_node_delete(trans, *i));
		if (!bch2_err_matches(ret, EROFS))
			bch_err_msg(c, ret, "deleting snapshot %u", *i);
		if (ret)
			goto err;
	}

	/*
	 * Fixing children of deleted snapshots can't be done completely
	 * atomically, if we crash between here and when we delete the interior
	 * nodes some depth fields will be off:
	 */
	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
				BTREE_ITER_intent, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &delete_interior));
	if (ret)
		goto err;

	darray_for_each(delete_interior, i) {
		ret = commit_do(trans, NULL, NULL, 0,
			bch2_snapshot_node_delete(trans, i->id));
		if (!bch2_err_matches(ret, EROFS))
			bch_err_msg(c, ret, "deleting snapshot %u", i->id);
		if (ret)
			goto err;
	}
err:
	darray_exit(&delete_interior);
	darray_exit(&delete_leaves);
	bch2_trans_put(trans);
	if (!bch2_err_matches(ret, EROFS))
		bch_err_fn(c, ret);
	return ret;
}
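/*
 * Note on the ordering above (informal summary): keys are moved or
 * deleted first, then leaf nodes go, then surviving nodes get their depth
 * and skiplists rewritten, and only then are interior nodes removed -
 * removing an interior node any earlier could leave a child's skip
 * pointers referencing an ID that no longer exists.
 */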
void bch2_delete_dead_snapshots_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);

	set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name);

	bch2_delete_dead_snapshots(c);
	bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
}

void bch2_delete_dead_snapshots_async(struct bch_fs *c)
{
	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots))
		return;

	BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));

	if (!queue_work(c->write_ref_wq, &c->snapshot_delete_work))
		bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
}

int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
				       enum btree_id id,
				       struct bpos pos)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	for_each_btree_key_reverse_norestart(trans, iter, id, bpos_predecessor(pos),
					     BTREE_ITER_not_extents|
					     BTREE_ITER_all_snapshots,
					     k, ret) {
		if (!bkey_eq(pos, k.k->p))
			break;

		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
			ret = 1;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}

static bool interior_snapshot_needs_delete(struct bkey_s_c_snapshot snap)
{
	/* If there's one child, it's redundant and keys will be moved to the child */
	return !!snap.v->children[0] + !!snap.v->children[1] == 1;
}

static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
{
	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k);
	if (BCH_SNAPSHOT_DELETED(snap.v) ||
	    interior_snapshot_needs_delete(snap))
		set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags);

	return 0;
}

int bch2_snapshots_read(struct bch_fs *c)
{
	/*
	 * Initializing the is_ancestor bitmaps requires ancestors to already be
	 * initialized - so mark in reverse:
	 */
	int ret = bch2_trans_run(c,
		for_each_btree_key_reverse(trans, iter, BTREE_ID_snapshots,
					   POS_MAX, 0, k,
			__bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
			bch2_check_snapshot_needs_deletion(trans, k)));
	bch_err_fn(c, ret);

	/*
	 * It's important that we check if we need to reconstruct snapshots
	 * before going RW, so we mark that pass as required in the superblock -
	 * otherwise, we could end up deleting keys with missing snapshot nodes
	 * instead
	 */
	BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) &&
	       test_bit(BCH_FS_may_go_rw, &c->flags));

	if (bch2_err_matches(ret, EIO) ||
	    (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)))
		ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots);

	return ret;
}

void bch2_fs_snapshots_exit(struct bch_fs *c)
{
	kvfree(rcu_dereference_protected(c->snapshots, true));
}