// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_journal_iter.h"
#include "btree_node_scan.h"
#include "btree_update_interior.h"
#include "buckets.h"
#include "error.h"
#include "journal_io.h"
#include "recovery_passes.h"

#include <linux/kthread.h>
#include <linux/min_heap.h>
#include <linux/sort.h>

struct find_btree_nodes_worker {
        struct closure          *cl;
        struct find_btree_nodes *f;
        struct bch_dev          *ca;
};

static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n)
{
        bch2_btree_id_level_to_text(out, n->btree_id, n->level);
        prt_printf(out, " seq=%u journal_seq=%llu cookie=%llx ",
                   n->seq, n->journal_seq, n->cookie);
        bch2_bpos_to_text(out, n->min_key);
        prt_str(out, "-");
        bch2_bpos_to_text(out, n->max_key);

        if (n->range_updated)
                prt_str(out, " range updated");

        for (unsigned i = 0; i < n->nr_ptrs; i++) {
                prt_char(out, ' ');
                bch2_extent_ptr_to_text(out, c, n->ptrs + i);
        }
}

static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes)
{
        printbuf_indent_add(out, 2);
        darray_for_each(nodes, i) {
                found_btree_node_to_text(out, c, i);
                prt_newline(out);
        }
        printbuf_indent_sub(out, 2);
}

static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f)
{
        struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k);

        set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs);
        bp->k.p                 = f->max_key;
        bp->v.seq               = cpu_to_le64(f->cookie);
        bp->v.flags             = 0;
        bp->v.sectors_written   = cpu_to_le16(f->sectors_written);
        bp->v.min_key           = f->min_key;
        SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated);
        memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs);
}

static inline u64 bkey_journal_seq(struct bkey_s_c k)
{
        switch (k.k->type) {
        case KEY_TYPE_inode_v3:
                return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_journal_seq);
        default:
                return 0;
        }
}

static bool found_btree_node_is_readable(struct btree_trans *trans,
                                         struct found_btree_node *f)
{
        struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;

        found_btree_node_to_key(&tmp.k, f);

        struct btree *b = bch2_btree_node_get_noiter(trans, &tmp.k, f->btree_id, f->level, false);
        bool ret = !IS_ERR_OR_NULL(b);
        if (!ret)
                return ret;

        f->sectors_written = b->written;
        f->journal_seq = le64_to_cpu(b->data->keys.journal_seq);

        struct bkey_s_c k;
        struct bkey unpacked;
        struct btree_node_iter iter;
        for_each_btree_node_key_unpack(b, k, &iter, &unpacked)
                f->journal_seq = max(f->journal_seq, bkey_journal_seq(k));

        six_unlock_read(&b->c.lock);

        /*
         * We might update this node's range; if that happens, we need the node
         * to be re-read so the read path can trim keys that are no longer in
         * this node
         */
        if (b != btree_node_root(trans->c, b))
                bch2_btree_node_evict(trans, &tmp.k);
        return ret;
}

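/*
 * Note: the "cookie" is the node's bset sequence number (bn->keys.seq); for
 * the purposes of this scan it is assumed to be shared by all replicas of a
 * given btree node, so sorting candidates by (btree_id, level, cookie) groups
 * replicas of the same node together and lets bch2_scan_for_btree_nodes()
 * merge their pointers into a single entry.
 */
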
static int found_btree_node_cmp_cookie(const void *_l, const void *_r)
{
        const struct found_btree_node *l = _l;
        const struct found_btree_node *r = _r;

        return cmp_int(l->btree_id, r->btree_id) ?:
               cmp_int(l->level, r->level) ?:
               cmp_int(l->cookie, r->cookie);
}

/*
 * Given two found btree nodes, prefer the one with the higher sequence number;
 * if the sequence numbers are equal, prefer the one with the higher
 * journal_seq, i.e. the one that was actually readable:
 */
static int found_btree_node_cmp_time(const struct found_btree_node *l,
                                     const struct found_btree_node *r)
{
        return cmp_int(l->seq, r->seq) ?:
               cmp_int(l->journal_seq, r->journal_seq);
}

static int found_btree_node_cmp_pos(const void *_l, const void *_r)
{
        const struct found_btree_node *l = _l;
        const struct found_btree_node *r = _r;

        return cmp_int(l->btree_id, r->btree_id) ?:
               -cmp_int(l->level, r->level) ?:
               bpos_cmp(l->min_key, r->min_key) ?:
               -found_btree_node_cmp_time(l, r);
}

static inline bool found_btree_node_cmp_pos_less(const void *l, const void *r, void *arg)
{
        return found_btree_node_cmp_pos(l, r) < 0;
}

static inline void found_btree_node_swap(void *_l, void *_r, void *arg)
{
        struct found_btree_node *l = _l;
        struct found_btree_node *r = _r;

        swap(*l, *r);
}

static const struct min_heap_callbacks found_btree_node_heap_cbs = {
        .less = found_btree_node_cmp_pos_less,
        .swp = found_btree_node_swap,
};

static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
                                struct bio *bio, struct btree_node *bn, u64 offset)
{
        struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);

        bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
        bio->bi_iter.bi_sector = offset;
        bch2_bio_map(bio, bn, PAGE_SIZE);

        submit_bio_wait(bio);
        if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
                               "IO error in try_read_btree_node() at %llu: %s",
                               offset, bch2_blk_status_to_str(bio->bi_status)))
                return;

        if (le64_to_cpu(bn->magic) != bset_magic(c))
                return;

        if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) {
                if (!c->chacha20)
                        return;

                struct nonce nonce = btree_nonce(&bn->keys, 0);
                unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;

                bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes);
        }

        if (btree_id_is_alloc(BTREE_NODE_ID(bn)))
                return;

        if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH)
                return;

        if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX)
                return;

        rcu_read_lock();
        struct found_btree_node n = {
                .btree_id       = BTREE_NODE_ID(bn),
                .level          = BTREE_NODE_LEVEL(bn),
                .seq            = BTREE_NODE_SEQ(bn),
                .cookie         = le64_to_cpu(bn->keys.seq),
                .min_key        = bn->min_key,
                .max_key        = bn->max_key,
                .nr_ptrs        = 1,
                .ptrs[0].type   = 1 << BCH_EXTENT_ENTRY_ptr,
                .ptrs[0].offset = offset,
                .ptrs[0].dev    = ca->dev_idx,
                .ptrs[0].gen    = bucket_gen_get(ca, sector_to_bucket(ca, offset)),
        };
        rcu_read_unlock();

        if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) {
                mutex_lock(&f->lock);
                if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) {
                        bch_err(c, "try_read_btree_node() can't handle endian conversion");
                        f->ret = -EINVAL;
                        goto unlock;
                }

                if (darray_push(&f->nodes, n))
                        f->ret = -ENOMEM;
unlock:
                mutex_unlock(&f->lock);
        }
}

/*
 * Worker thread: scan one device for btree nodes, probing every btree node
 * sized slot in every bucket; slots that the member's btree bitmap says can't
 * contain btree data are skipped when the bitmap is known to be valid.
 */
static int read_btree_nodes_worker(void *p)
{
        struct find_btree_nodes_worker *w = p;
        struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes);
        struct bch_dev *ca = w->ca;
        void *buf = (void *) __get_free_page(GFP_KERNEL);
        struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL);
        unsigned long last_print = jiffies;

        if (!buf || !bio) {
                bch_err(c, "read_btree_nodes_worker: error allocating bio/buf");
                w->f->ret = -ENOMEM;
                goto err;
        }

        for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++)
                for (unsigned bucket_offset = 0;
                     bucket_offset + btree_sectors(c) <= ca->mi.bucket_size;
                     bucket_offset += btree_sectors(c)) {
                        if (time_after(jiffies, last_print + HZ * 30)) {
                                u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset;
                                u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size;

                                bch_info(ca, "%s: %2u%% done", __func__,
                                         (unsigned) div64_u64(cur_sector * 100, end_sector));
                                last_print = jiffies;
                        }

                        u64 sector = bucket * ca->mi.bucket_size + bucket_offset;

                        if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap &&
                            !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c)))
                                continue;

                        try_read_btree_node(w->f, ca, bio, buf, sector);
                }
err:
        bio_put(bio);
        free_page((unsigned long) buf);
        /* drop the ref taken for us in read_btree_nodes(): */
        percpu_ref_put(&ca->io_ref);
        closure_put(w->cl);
        kfree(w);
        return 0;
}

static int read_btree_nodes(struct find_btree_nodes *f)
{
        struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
        struct closure cl;
        int ret = 0;

        closure_init_stack(&cl);

        for_each_online_member(c, ca) {
                if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree)))
                        continue;

                struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
                struct task_struct *t;

                if (!w) {
                        percpu_ref_put(&ca->io_ref);
                        ret = -ENOMEM;
                        goto err;
                }

                percpu_ref_get(&ca->io_ref);
                closure_get(&cl);
                w->cl   = &cl;
                w->f    = f;
                w->ca   = ca;

                t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
                ret = PTR_ERR_OR_ZERO(t);
                if (ret) {
                        percpu_ref_put(&ca->io_ref);
                        closure_put(&cl);
                        kfree(w);
                        f->ret = ret;
                        bch_err(c, "error starting kthread: %i", ret);
                        break;
                }
        }
err:
        closure_sync(&cl);
        return f->ret ?: ret;
}

static bool nodes_overlap(const struct found_btree_node *l,
                          const struct found_btree_node *r)
{
        return (l->btree_id     == r->btree_id &&
                l->level        == r->level &&
                bpos_gt(l->max_key, r->min_key));
}

/*
 * Resolve overlaps between the most recently kept node (l) and the remaining
 * candidates on the heap: the newer node (per found_btree_node_cmp_time())
 * keeps its range, the older one is dropped or has its range trimmed and is
 * flagged range_updated so the read path discards keys outside the new range.
 * Ties go to l, the node we've already committed to keeping.
 */
static int handle_overwrites(struct bch_fs *c,
                             struct found_btree_node *l,
                             found_btree_nodes *nodes_heap)
{
        struct found_btree_node *r;

        while ((r = min_heap_peek(nodes_heap)) &&
               nodes_overlap(l, r)) {
                int cmp = found_btree_node_cmp_time(l, r);

                if (cmp > 0) {
                        if (bpos_cmp(l->max_key, r->max_key) >= 0)
                                min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
                        else {
                                r->range_updated = true;
                                r->min_key = bpos_successor(l->max_key);
                                min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL);
                        }
                } else if (cmp < 0) {
                        BUG_ON(bpos_eq(l->min_key, r->min_key));

                        l->max_key = bpos_predecessor(r->min_key);
                        l->range_updated = true;
                } else if (r->level) {
                        min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
                } else {
                        if (bpos_cmp(l->max_key, r->max_key) >= 0)
                                min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
                        else {
                                r->range_updated = true;
                                r->min_key = bpos_successor(l->max_key);
                                min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL);
                        }
                }
        }

        return 0;
}

int bch2_scan_for_btree_nodes(struct bch_fs *c)
{
        struct find_btree_nodes *f = &c->found_btree_nodes;
        struct printbuf buf = PRINTBUF;
        found_btree_nodes nodes_heap = {};
        size_t dst;
        int ret = 0;

        if (f->nodes.nr)
                return 0;

        mutex_init(&f->lock);

        ret = read_btree_nodes(f);
        if (ret)
                return ret;

        if (!f->nodes.nr) {
                bch_err(c, "%s: no btree nodes found", __func__);
                ret = -EINVAL;
                goto err;
        }

        if (0 && c->opts.verbose) {
                printbuf_reset(&buf);
                prt_printf(&buf, "%s: nodes found:\n", __func__);
                found_btree_nodes_to_text(&buf, c, f->nodes);
                bch2_print_string_as_lines(KERN_INFO, buf.buf);
        }

        sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);

        dst = 0;
        darray_for_each(f->nodes, i) {
                struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL;

                if (prev &&
                    prev->cookie == i->cookie) {
                        if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) {
                                bch_err(c, "%s: found too many replicas for btree node", __func__);
                                ret = -EINVAL;
                                goto err;
                        }
                        prev->ptrs[prev->nr_ptrs++] = i->ptrs[0];
                } else {
                        f->nodes.data[dst++] = *i;
                }
        }
        f->nodes.nr = dst;

        sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);

        if (0 && c->opts.verbose) {
                printbuf_reset(&buf);
                prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__);
                found_btree_nodes_to_text(&buf, c, f->nodes);
                bch2_print_string_as_lines(KERN_INFO, buf.buf);
        }

        swap(nodes_heap, f->nodes);

        {
                /* darray must have same layout as a heap */
                min_heap_char real_heap;
                BUILD_BUG_ON(sizeof(nodes_heap.nr) != sizeof(real_heap.nr));
                BUILD_BUG_ON(sizeof(nodes_heap.size) != sizeof(real_heap.size));
                BUILD_BUG_ON(offsetof(found_btree_nodes, nr) != offsetof(min_heap_char, nr));
                BUILD_BUG_ON(offsetof(found_btree_nodes, size) != offsetof(min_heap_char, size));
        }

        min_heapify_all(&nodes_heap, &found_btree_node_heap_cbs, NULL);

        if (nodes_heap.nr) {
                ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap));
                if (ret)
                        goto err;

                min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL);
        }

        while (true) {
                ret = handle_overwrites(c, &darray_last(f->nodes), &nodes_heap);
                if (ret)
                        goto err;

                if (!nodes_heap.nr)
                        break;

                ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap));
                if (ret)
                        goto err;

                min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL);
        }

        for (struct found_btree_node *n = f->nodes.data; n < &darray_last(f->nodes); n++)
                BUG_ON(nodes_overlap(n, n + 1));

        if (0 && c->opts.verbose) {
                printbuf_reset(&buf);
                prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__);
                found_btree_nodes_to_text(&buf, c, f->nodes);
                bch2_print_string_as_lines(KERN_INFO, buf.buf);
        } else {
                bch_info(c, "btree node scan found %zu nodes after overwrites", f->nodes.nr);
        }

        eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
err:
        darray_exit(&nodes_heap);
        printbuf_exit(&buf);
        return ret;
}

static int found_btree_node_range_start_cmp(const void *_l, const void *_r)
{
        const struct found_btree_node *l = _l;
        const struct found_btree_node *r = _r;

        return cmp_int(l->btree_id, r->btree_id) ?:
               -cmp_int(l->level, r->level) ?:
               bpos_cmp(l->max_key, r->min_key);
}

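/*
 * Iterate over scanned nodes overlapping the range described by _search, at
 * _search.btree_id/_search.level: eytzinger0_find_gt() locates the first node
 * (in position order) whose max_key is greater than _search.min_key, and the
 * loop stops once a node no longer starts before _search.max_key, i.e. no
 * longer overlaps the search range.
 */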
#define for_each_found_btree_node_in_range(_f, _search, _idx)                          \
        for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr,        \
                                              sizeof((_f)->nodes.data[0]),             \
                                              found_btree_node_range_start_cmp, &_search); \
             _idx < (_f)->nodes.nr &&                                                  \
             (_f)->nodes.data[_idx].btree_id == _search.btree_id &&                    \
             (_f)->nodes.data[_idx].level == _search.level &&                          \
             bpos_lt((_f)->nodes.data[_idx].min_key, _search.max_key);                 \
             _idx = eytzinger0_next(_idx, (_f)->nodes.nr))

bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b)
{
        struct find_btree_nodes *f = &c->found_btree_nodes;

        struct found_btree_node search = {
                .btree_id       = b->c.btree_id,
                .level          = b->c.level,
                .min_key        = b->data->min_key,
                .max_key        = b->key.k.p,
        };

        for_each_found_btree_node_in_range(f, search, idx)
                if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data))
                        return true;
        return false;
}

bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
{
        struct found_btree_node search = {
                .btree_id       = btree,
                .level          = 0,
                .min_key        = POS_MIN,
                .max_key        = SPOS_MAX,
        };

        for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx)
                return true;
        return false;
}

int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
                           unsigned level, struct bpos node_min, struct bpos node_max)
{
        if (btree_id_is_alloc(btree))
                return 0;

        struct find_btree_nodes *f = &c->found_btree_nodes;

        int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
        if (ret)
                return ret;

        if (c->opts.verbose) {
                struct printbuf buf = PRINTBUF;

                prt_str(&buf, "recovery ");
                bch2_btree_id_level_to_text(&buf, btree, level);
                prt_str(&buf, " ");
                bch2_bpos_to_text(&buf, node_min);
                prt_str(&buf, " - ");
                bch2_bpos_to_text(&buf, node_max);

                bch_info(c, "%s(): %s", __func__, buf.buf);
                printbuf_exit(&buf);
        }

        struct found_btree_node search = {
                .btree_id       = btree,
                .level          = level,
                .min_key        = node_min,
                .max_key        = node_max,
        };

        for_each_found_btree_node_in_range(f, search, idx) {
                struct found_btree_node n = f->nodes.data[idx];

                n.range_updated |= bpos_lt(n.min_key, node_min);
                n.min_key = bpos_max(n.min_key, node_min);

                n.range_updated |= bpos_gt(n.max_key, node_max);
                n.max_key = bpos_min(n.max_key, node_max);

                struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;

                found_btree_node_to_key(&tmp.k, &n);

                struct printbuf buf = PRINTBUF;
                bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k));
                bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
                printbuf_exit(&buf);

                BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k),
                                          (struct bkey_validate_context) {
                                                .from   = BKEY_VALIDATE_btree_node,
                                                .level  = level + 1,
                                                .btree  = btree,
                                          }));

                ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k);
                if (ret)
                        return ret;
        }

        return 0;
}

void bch2_find_btree_nodes_exit(struct find_btree_nodes *f)
{
        darray_exit(&f->nodes);
}