1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "bcachefs.h" 4 #include "btree_cache.h" 5 #include "btree_io.h" 6 #include "btree_journal_iter.h" 7 #include "btree_node_scan.h" 8 #include "btree_update_interior.h" 9 #include "buckets.h" 10 #include "error.h" 11 #include "journal_io.h" 12 #include "recovery_passes.h" 13 14 #include <linux/kthread.h> 15 #include <linux/min_heap.h> 16 #include <linux/sched/sysctl.h> 17 #include <linux/sort.h> 18 19 struct find_btree_nodes_worker { 20 struct closure *cl; 21 struct find_btree_nodes *f; 22 struct bch_dev *ca; 23 }; 24 25 static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n) 26 { 27 bch2_btree_id_level_to_text(out, n->btree_id, n->level); 28 prt_printf(out, " seq=%u journal_seq=%llu cookie=%llx ", 29 n->seq, n->journal_seq, n->cookie); 30 bch2_bpos_to_text(out, n->min_key); 31 prt_str(out, "-"); 32 bch2_bpos_to_text(out, n->max_key); 33 34 if (n->range_updated) 35 prt_str(out, " range updated"); 36 37 for (unsigned i = 0; i < n->nr_ptrs; i++) { 38 prt_char(out, ' '); 39 bch2_extent_ptr_to_text(out, c, n->ptrs + i); 40 } 41 } 42 43 static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes) 44 { 45 printbuf_indent_add(out, 2); 46 darray_for_each(nodes, i) { 47 found_btree_node_to_text(out, c, i); 48 prt_newline(out); 49 } 50 printbuf_indent_sub(out, 2); 51 } 52 53 static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f) 54 { 55 struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k); 56 57 set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs); 58 bp->k.p = f->max_key; 59 bp->v.seq = cpu_to_le64(f->cookie); 60 bp->v.sectors_written = 0; 61 bp->v.flags = 0; 62 bp->v.sectors_written = cpu_to_le16(f->sectors_written); 63 bp->v.min_key = f->min_key; 64 SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated); 65 memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs); 66 } 67 68 static inline u64 bkey_journal_seq(struct bkey_s_c k) 69 { 70 switch (k.k->type) { 71 case KEY_TYPE_inode_v3: 72 return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_journal_seq); 73 default: 74 return 0; 75 } 76 } 77 78 static int found_btree_node_cmp_cookie(const void *_l, const void *_r) 79 { 80 const struct found_btree_node *l = _l; 81 const struct found_btree_node *r = _r; 82 83 return cmp_int(l->btree_id, r->btree_id) ?: 84 cmp_int(l->level, r->level) ?: 85 cmp_int(l->cookie, r->cookie); 86 } 87 88 /* 89 * Given two found btree nodes, if their sequence numbers are equal, take the 90 * one that's readable: 91 */ 92 static int found_btree_node_cmp_time(const struct found_btree_node *l, 93 const struct found_btree_node *r) 94 { 95 return cmp_int(l->seq, r->seq) ?: 96 cmp_int(l->journal_seq, r->journal_seq); 97 } 98 99 static int found_btree_node_cmp_pos(const void *_l, const void *_r) 100 { 101 const struct found_btree_node *l = _l; 102 const struct found_btree_node *r = _r; 103 104 return cmp_int(l->btree_id, r->btree_id) ?: 105 -cmp_int(l->level, r->level) ?: 106 bpos_cmp(l->min_key, r->min_key) ?: 107 -found_btree_node_cmp_time(l, r); 108 } 109 110 static inline bool found_btree_node_cmp_pos_less(const void *l, const void *r, void *arg) 111 { 112 return found_btree_node_cmp_pos(l, r) < 0; 113 } 114 115 static inline void found_btree_node_swap(void *_l, void *_r, void *arg) 116 { 117 struct found_btree_node *l = _l; 118 struct found_btree_node *r = _r; 119 120 swap(*l, *r); 121 } 122 123 static const struct min_heap_callbacks found_btree_node_heap_cbs = { 124 .less = found_btree_node_cmp_pos_less, 125 .swp = found_btree_node_swap, 126 }; 127 128 static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, 129 struct btree *b, struct bio *bio, u64 offset) 130 { 131 struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes); 132 struct btree_node *bn = b->data; 133 134 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ); 135 bio->bi_iter.bi_sector = offset; 136 bch2_bio_map(bio, b->data, c->opts.block_size); 137 138 u64 submit_time = local_clock(); 139 submit_bio_wait(bio); 140 bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status); 141 142 if (bio->bi_status) { 143 bch_err_dev_ratelimited(ca, 144 "IO error in try_read_btree_node() at %llu: %s", 145 offset, bch2_blk_status_to_str(bio->bi_status)); 146 return; 147 } 148 149 if (le64_to_cpu(bn->magic) != bset_magic(c)) 150 return; 151 152 if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) { 153 if (!c->chacha20_key_set) 154 return; 155 156 struct nonce nonce = btree_nonce(&bn->keys, 0); 157 unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; 158 159 bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes); 160 } 161 162 if (btree_id_is_alloc(BTREE_NODE_ID(bn))) 163 return; 164 165 if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH) 166 return; 167 168 if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX) 169 return; 170 171 rcu_read_lock(); 172 struct found_btree_node n = { 173 .btree_id = BTREE_NODE_ID(bn), 174 .level = BTREE_NODE_LEVEL(bn), 175 .seq = BTREE_NODE_SEQ(bn), 176 .cookie = le64_to_cpu(bn->keys.seq), 177 .min_key = bn->min_key, 178 .max_key = bn->max_key, 179 .nr_ptrs = 1, 180 .ptrs[0].type = 1 << BCH_EXTENT_ENTRY_ptr, 181 .ptrs[0].offset = offset, 182 .ptrs[0].dev = ca->dev_idx, 183 .ptrs[0].gen = bucket_gen_get(ca, sector_to_bucket(ca, offset)), 184 }; 185 rcu_read_unlock(); 186 187 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ); 188 bio->bi_iter.bi_sector = offset; 189 bch2_bio_map(bio, b->data, c->opts.btree_node_size); 190 191 submit_time = local_clock(); 192 submit_bio_wait(bio); 193 bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status); 194 195 found_btree_node_to_key(&b->key, &n); 196 197 CLASS(printbuf, buf)(); 198 if (!bch2_btree_node_read_done(c, ca, b, NULL, &buf)) { 199 /* read_done will swap out b->data for another buffer */ 200 bn = b->data; 201 /* 202 * Grab journal_seq here because we want the max journal_seq of 203 * any bset; read_done sorts down to a single set and picks the 204 * max journal_seq 205 */ 206 n.journal_seq = le64_to_cpu(bn->keys.journal_seq), 207 n.sectors_written = b->written; 208 209 mutex_lock(&f->lock); 210 if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) { 211 bch_err(c, "try_read_btree_node() can't handle endian conversion"); 212 f->ret = -EINVAL; 213 goto unlock; 214 } 215 216 if (darray_push(&f->nodes, n)) 217 f->ret = -ENOMEM; 218 unlock: 219 mutex_unlock(&f->lock); 220 } 221 } 222 223 static int read_btree_nodes_worker(void *p) 224 { 225 struct find_btree_nodes_worker *w = p; 226 struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes); 227 struct bch_dev *ca = w->ca; 228 unsigned long last_print = jiffies; 229 struct btree *b = NULL; 230 struct bio *bio = NULL; 231 232 b = __bch2_btree_node_mem_alloc(c); 233 if (!b) { 234 bch_err(c, "read_btree_nodes_worker: error allocating buf"); 235 w->f->ret = -ENOMEM; 236 goto err; 237 } 238 239 bio = bio_alloc(NULL, buf_pages(b->data, c->opts.btree_node_size), 0, GFP_KERNEL); 240 if (!bio) { 241 bch_err(c, "read_btree_nodes_worker: error allocating bio"); 242 w->f->ret = -ENOMEM; 243 goto err; 244 } 245 246 for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++) 247 for (unsigned bucket_offset = 0; 248 bucket_offset + btree_sectors(c) <= ca->mi.bucket_size; 249 bucket_offset += btree_sectors(c)) { 250 if (time_after(jiffies, last_print + HZ * 30)) { 251 u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset; 252 u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size; 253 254 bch_info(ca, "%s: %2u%% done", __func__, 255 (unsigned) div64_u64(cur_sector * 100, end_sector)); 256 last_print = jiffies; 257 } 258 259 u64 sector = bucket * ca->mi.bucket_size + bucket_offset; 260 261 if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap && 262 !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c))) 263 continue; 264 265 try_read_btree_node(w->f, ca, b, bio, sector); 266 } 267 err: 268 if (b) 269 __btree_node_data_free(b); 270 kfree(b); 271 bio_put(bio); 272 enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); 273 closure_put(w->cl); 274 kfree(w); 275 return 0; 276 } 277 278 static int read_btree_nodes(struct find_btree_nodes *f) 279 { 280 struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes); 281 struct closure cl; 282 int ret = 0; 283 284 closure_init_stack(&cl); 285 286 for_each_online_member(c, ca, BCH_DEV_READ_REF_btree_node_scan) { 287 if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree))) 288 continue; 289 290 struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL); 291 if (!w) { 292 enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); 293 ret = -ENOMEM; 294 goto err; 295 } 296 297 w->cl = &cl; 298 w->f = f; 299 w->ca = ca; 300 301 struct task_struct *t = kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name); 302 ret = PTR_ERR_OR_ZERO(t); 303 if (ret) { 304 enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); 305 kfree(w); 306 bch_err_msg(c, ret, "starting kthread"); 307 break; 308 } 309 310 closure_get(&cl); 311 enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); 312 wake_up_process(t); 313 } 314 err: 315 while (closure_sync_timeout(&cl, sysctl_hung_task_timeout_secs * HZ / 2)) 316 ; 317 return f->ret ?: ret; 318 } 319 320 static bool nodes_overlap(const struct found_btree_node *l, 321 const struct found_btree_node *r) 322 { 323 return (l->btree_id == r->btree_id && 324 l->level == r->level && 325 bpos_gt(l->max_key, r->min_key)); 326 } 327 328 static int handle_overwrites(struct bch_fs *c, 329 struct found_btree_node *l, 330 found_btree_nodes *nodes_heap) 331 { 332 struct found_btree_node *r; 333 334 while ((r = min_heap_peek(nodes_heap)) && 335 nodes_overlap(l, r)) { 336 int cmp = found_btree_node_cmp_time(l, r); 337 338 if (cmp > 0) { 339 if (bpos_cmp(l->max_key, r->max_key) >= 0) 340 min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL); 341 else { 342 r->range_updated = true; 343 r->min_key = bpos_successor(l->max_key); 344 r->range_updated = true; 345 min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL); 346 } 347 } else if (cmp < 0) { 348 BUG_ON(bpos_eq(l->min_key, r->min_key)); 349 350 l->max_key = bpos_predecessor(r->min_key); 351 l->range_updated = true; 352 } else if (r->level) { 353 min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL); 354 } else { 355 if (bpos_cmp(l->max_key, r->max_key) >= 0) 356 min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL); 357 else { 358 r->range_updated = true; 359 r->min_key = bpos_successor(l->max_key); 360 r->range_updated = true; 361 min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL); 362 } 363 } 364 365 cond_resched(); 366 } 367 368 return 0; 369 } 370 371 int bch2_scan_for_btree_nodes(struct bch_fs *c) 372 { 373 struct find_btree_nodes *f = &c->found_btree_nodes; 374 struct printbuf buf = PRINTBUF; 375 found_btree_nodes nodes_heap = {}; 376 size_t dst; 377 int ret = 0; 378 379 if (f->nodes.nr) 380 return 0; 381 382 mutex_init(&f->lock); 383 384 ret = read_btree_nodes(f); 385 if (ret) 386 return ret; 387 388 if (!f->nodes.nr) { 389 bch_err(c, "%s: no btree nodes found", __func__); 390 ret = -EINVAL; 391 goto err; 392 } 393 394 if (0 && c->opts.verbose) { 395 printbuf_reset(&buf); 396 prt_printf(&buf, "%s: nodes found:\n", __func__); 397 found_btree_nodes_to_text(&buf, c, f->nodes); 398 bch2_print_str(c, KERN_INFO, buf.buf); 399 } 400 401 sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL); 402 403 dst = 0; 404 darray_for_each(f->nodes, i) { 405 struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL; 406 407 if (prev && 408 prev->cookie == i->cookie) { 409 if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) { 410 bch_err(c, "%s: found too many replicas for btree node", __func__); 411 ret = -EINVAL; 412 goto err; 413 } 414 prev->ptrs[prev->nr_ptrs++] = i->ptrs[0]; 415 } else { 416 f->nodes.data[dst++] = *i; 417 } 418 } 419 f->nodes.nr = dst; 420 421 sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); 422 423 if (0 && c->opts.verbose) { 424 printbuf_reset(&buf); 425 prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__); 426 found_btree_nodes_to_text(&buf, c, f->nodes); 427 bch2_print_str(c, KERN_INFO, buf.buf); 428 } 429 430 swap(nodes_heap, f->nodes); 431 432 { 433 /* darray must have same layout as a heap */ 434 min_heap_char real_heap; 435 BUILD_BUG_ON(sizeof(nodes_heap.nr) != sizeof(real_heap.nr)); 436 BUILD_BUG_ON(sizeof(nodes_heap.size) != sizeof(real_heap.size)); 437 BUILD_BUG_ON(offsetof(found_btree_nodes, nr) != offsetof(min_heap_char, nr)); 438 BUILD_BUG_ON(offsetof(found_btree_nodes, size) != offsetof(min_heap_char, size)); 439 } 440 441 min_heapify_all(&nodes_heap, &found_btree_node_heap_cbs, NULL); 442 443 if (nodes_heap.nr) { 444 ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap)); 445 if (ret) 446 goto err; 447 448 min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL); 449 } 450 451 while (true) { 452 ret = handle_overwrites(c, &darray_last(f->nodes), &nodes_heap); 453 if (ret) 454 goto err; 455 456 if (!nodes_heap.nr) 457 break; 458 459 ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap)); 460 if (ret) 461 goto err; 462 463 min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL); 464 } 465 466 for (struct found_btree_node *n = f->nodes.data; n < &darray_last(f->nodes); n++) 467 BUG_ON(nodes_overlap(n, n + 1)); 468 469 if (0 && c->opts.verbose) { 470 printbuf_reset(&buf); 471 prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__); 472 found_btree_nodes_to_text(&buf, c, f->nodes); 473 bch2_print_str(c, KERN_INFO, buf.buf); 474 } else { 475 bch_info(c, "btree node scan found %zu nodes after overwrites", f->nodes.nr); 476 } 477 478 eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); 479 err: 480 darray_exit(&nodes_heap); 481 printbuf_exit(&buf); 482 return ret; 483 } 484 485 static int found_btree_node_range_start_cmp(const void *_l, const void *_r) 486 { 487 const struct found_btree_node *l = _l; 488 const struct found_btree_node *r = _r; 489 490 return cmp_int(l->btree_id, r->btree_id) ?: 491 -cmp_int(l->level, r->level) ?: 492 bpos_cmp(l->max_key, r->min_key); 493 } 494 495 #define for_each_found_btree_node_in_range(_f, _search, _idx) \ 496 for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr, \ 497 sizeof((_f)->nodes.data[0]), \ 498 found_btree_node_range_start_cmp, &search); \ 499 _idx < (_f)->nodes.nr && \ 500 (_f)->nodes.data[_idx].btree_id == _search.btree_id && \ 501 (_f)->nodes.data[_idx].level == _search.level && \ 502 bpos_lt((_f)->nodes.data[_idx].min_key, _search.max_key); \ 503 _idx = eytzinger0_next(_idx, (_f)->nodes.nr)) 504 505 bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b) 506 { 507 struct find_btree_nodes *f = &c->found_btree_nodes; 508 509 struct found_btree_node search = { 510 .btree_id = b->c.btree_id, 511 .level = b->c.level, 512 .min_key = b->data->min_key, 513 .max_key = b->key.k.p, 514 }; 515 516 for_each_found_btree_node_in_range(f, search, idx) 517 if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data)) 518 return true; 519 return false; 520 } 521 522 int bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree) 523 { 524 int ret = bch2_run_print_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); 525 if (ret) 526 return ret; 527 528 struct found_btree_node search = { 529 .btree_id = btree, 530 .level = 0, 531 .min_key = POS_MIN, 532 .max_key = SPOS_MAX, 533 }; 534 535 for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx) 536 return true; 537 return false; 538 } 539 540 int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, 541 unsigned level, struct bpos node_min, struct bpos node_max) 542 { 543 if (btree_id_is_alloc(btree)) 544 return 0; 545 546 struct find_btree_nodes *f = &c->found_btree_nodes; 547 548 int ret = bch2_run_print_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); 549 if (ret) 550 return ret; 551 552 if (c->opts.verbose) { 553 struct printbuf buf = PRINTBUF; 554 555 prt_str(&buf, "recovery "); 556 bch2_btree_id_level_to_text(&buf, btree, level); 557 prt_str(&buf, " "); 558 bch2_bpos_to_text(&buf, node_min); 559 prt_str(&buf, " - "); 560 bch2_bpos_to_text(&buf, node_max); 561 562 bch_info(c, "%s(): %s", __func__, buf.buf); 563 printbuf_exit(&buf); 564 } 565 566 struct found_btree_node search = { 567 .btree_id = btree, 568 .level = level, 569 .min_key = node_min, 570 .max_key = node_max, 571 }; 572 573 for_each_found_btree_node_in_range(f, search, idx) { 574 struct found_btree_node n = f->nodes.data[idx]; 575 576 n.range_updated |= bpos_lt(n.min_key, node_min); 577 n.min_key = bpos_max(n.min_key, node_min); 578 579 n.range_updated |= bpos_gt(n.max_key, node_max); 580 n.max_key = bpos_min(n.max_key, node_max); 581 582 struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp; 583 584 found_btree_node_to_key(&tmp.k, &n); 585 586 if (c->opts.verbose) { 587 struct printbuf buf = PRINTBUF; 588 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k)); 589 bch_verbose(c, "%s(): recovering %s", __func__, buf.buf); 590 printbuf_exit(&buf); 591 } 592 593 BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k), 594 (struct bkey_validate_context) { 595 .from = BKEY_VALIDATE_btree_node, 596 .level = level + 1, 597 .btree = btree, 598 })); 599 600 ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k); 601 if (ret) 602 return ret; 603 } 604 605 return 0; 606 } 607 608 void bch2_find_btree_nodes_exit(struct find_btree_nodes *f) 609 { 610 darray_exit(&f->nodes); 611 } 612