1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "bcachefs.h" 4 #include "btree_cache.h" 5 #include "btree_io.h" 6 #include "btree_journal_iter.h" 7 #include "btree_node_scan.h" 8 #include "btree_update_interior.h" 9 #include "buckets.h" 10 #include "error.h" 11 #include "journal_io.h" 12 #include "recovery_passes.h" 13 14 #include <linux/kthread.h> 15 #include <linux/sort.h> 16 17 struct find_btree_nodes_worker { 18 struct closure *cl; 19 struct find_btree_nodes *f; 20 struct bch_dev *ca; 21 }; 22 23 static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n) 24 { 25 prt_printf(out, "%s l=%u seq=%u cookie=%llx ", bch2_btree_id_str(n->btree_id), n->level, n->seq, n->cookie); 26 bch2_bpos_to_text(out, n->min_key); 27 prt_str(out, "-"); 28 bch2_bpos_to_text(out, n->max_key); 29 30 if (n->range_updated) 31 prt_str(out, " range updated"); 32 if (n->overwritten) 33 prt_str(out, " overwritten"); 34 35 for (unsigned i = 0; i < n->nr_ptrs; i++) { 36 prt_char(out, ' '); 37 bch2_extent_ptr_to_text(out, c, n->ptrs + i); 38 } 39 } 40 41 static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes) 42 { 43 printbuf_indent_add(out, 2); 44 darray_for_each(nodes, i) { 45 found_btree_node_to_text(out, c, i); 46 prt_newline(out); 47 } 48 printbuf_indent_sub(out, 2); 49 } 50 51 static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f) 52 { 53 struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k); 54 55 set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs); 56 bp->k.p = f->max_key; 57 bp->v.seq = cpu_to_le64(f->cookie); 58 bp->v.sectors_written = 0; 59 bp->v.flags = 0; 60 bp->v.sectors_written = cpu_to_le16(f->sectors_written); 61 bp->v.min_key = f->min_key; 62 SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated); 63 memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs); 64 } 65 66 static bool found_btree_node_is_readable(struct btree_trans *trans, 67 struct found_btree_node *f) 68 { 69 struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } k; 70 71 found_btree_node_to_key(&k.k, f); 72 73 struct btree *b = bch2_btree_node_get_noiter(trans, &k.k, f->btree_id, f->level, false); 74 bool ret = !IS_ERR_OR_NULL(b); 75 if (!ret) 76 return ret; 77 78 f->sectors_written = b->written; 79 six_unlock_read(&b->c.lock); 80 81 /* 82 * We might update this node's range; if that happens, we need the node 83 * to be re-read so the read path can trim keys that are no longer in 84 * this node 85 */ 86 if (b != btree_node_root(trans->c, b)) 87 bch2_btree_node_evict(trans, &k.k); 88 return ret; 89 } 90 91 static int found_btree_node_cmp_cookie(const void *_l, const void *_r) 92 { 93 const struct found_btree_node *l = _l; 94 const struct found_btree_node *r = _r; 95 96 return cmp_int(l->btree_id, r->btree_id) ?: 97 cmp_int(l->level, r->level) ?: 98 cmp_int(l->cookie, r->cookie); 99 } 100 101 /* 102 * Given two found btree nodes, if their sequence numbers are equal, take the 103 * one that's readable: 104 */ 105 static int found_btree_node_cmp_time(const struct found_btree_node *l, 106 const struct found_btree_node *r) 107 { 108 return cmp_int(l->seq, r->seq); 109 } 110 111 static int found_btree_node_cmp_pos(const void *_l, const void *_r) 112 { 113 const struct found_btree_node *l = _l; 114 const struct found_btree_node *r = _r; 115 116 return cmp_int(l->btree_id, r->btree_id) ?: 117 -cmp_int(l->level, r->level) ?: 118 bpos_cmp(l->min_key, r->min_key) ?: 119 -found_btree_node_cmp_time(l, r); 120 } 121 122 static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, 123 struct bio *bio, struct btree_node *bn, u64 offset) 124 { 125 struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes); 126 127 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ); 128 bio->bi_iter.bi_sector = offset; 129 bch2_bio_map(bio, bn, PAGE_SIZE); 130 131 submit_bio_wait(bio); 132 if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read, 133 "IO error in try_read_btree_node() at %llu: %s", 134 offset, bch2_blk_status_to_str(bio->bi_status))) 135 return; 136 137 if (le64_to_cpu(bn->magic) != bset_magic(c)) 138 return; 139 140 if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) { 141 struct nonce nonce = btree_nonce(&bn->keys, 0); 142 unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; 143 144 bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes); 145 } 146 147 if (btree_id_is_alloc(BTREE_NODE_ID(bn))) 148 return; 149 150 if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH) 151 return; 152 153 rcu_read_lock(); 154 struct found_btree_node n = { 155 .btree_id = BTREE_NODE_ID(bn), 156 .level = BTREE_NODE_LEVEL(bn), 157 .seq = BTREE_NODE_SEQ(bn), 158 .cookie = le64_to_cpu(bn->keys.seq), 159 .min_key = bn->min_key, 160 .max_key = bn->max_key, 161 .nr_ptrs = 1, 162 .ptrs[0].type = 1 << BCH_EXTENT_ENTRY_ptr, 163 .ptrs[0].offset = offset, 164 .ptrs[0].dev = ca->dev_idx, 165 .ptrs[0].gen = *bucket_gen(ca, sector_to_bucket(ca, offset)), 166 }; 167 rcu_read_unlock(); 168 169 if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) { 170 mutex_lock(&f->lock); 171 if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) { 172 bch_err(c, "try_read_btree_node() can't handle endian conversion"); 173 f->ret = -EINVAL; 174 goto unlock; 175 } 176 177 if (darray_push(&f->nodes, n)) 178 f->ret = -ENOMEM; 179 unlock: 180 mutex_unlock(&f->lock); 181 } 182 } 183 184 static int read_btree_nodes_worker(void *p) 185 { 186 struct find_btree_nodes_worker *w = p; 187 struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes); 188 struct bch_dev *ca = w->ca; 189 void *buf = (void *) __get_free_page(GFP_KERNEL); 190 struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL); 191 unsigned long last_print = jiffies; 192 193 if (!buf || !bio) { 194 bch_err(c, "read_btree_nodes_worker: error allocating bio/buf"); 195 w->f->ret = -ENOMEM; 196 goto err; 197 } 198 199 for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++) 200 for (unsigned bucket_offset = 0; 201 bucket_offset + btree_sectors(c) <= ca->mi.bucket_size; 202 bucket_offset += btree_sectors(c)) { 203 if (time_after(jiffies, last_print + HZ * 30)) { 204 u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset; 205 u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size; 206 207 bch_info(ca, "%s: %2u%% done", __func__, 208 (unsigned) div64_u64(cur_sector * 100, end_sector)); 209 last_print = jiffies; 210 } 211 212 u64 sector = bucket * ca->mi.bucket_size + bucket_offset; 213 214 if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap && 215 !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c))) 216 continue; 217 218 try_read_btree_node(w->f, ca, bio, buf, sector); 219 } 220 err: 221 bio_put(bio); 222 free_page((unsigned long) buf); 223 percpu_ref_get(&ca->io_ref); 224 closure_put(w->cl); 225 kfree(w); 226 return 0; 227 } 228 229 static int read_btree_nodes(struct find_btree_nodes *f) 230 { 231 struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes); 232 struct closure cl; 233 int ret = 0; 234 235 closure_init_stack(&cl); 236 237 for_each_online_member(c, ca) { 238 if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree))) 239 continue; 240 241 struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL); 242 struct task_struct *t; 243 244 if (!w) { 245 percpu_ref_put(&ca->io_ref); 246 ret = -ENOMEM; 247 goto err; 248 } 249 250 percpu_ref_get(&ca->io_ref); 251 closure_get(&cl); 252 w->cl = &cl; 253 w->f = f; 254 w->ca = ca; 255 256 t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name); 257 ret = IS_ERR_OR_NULL(t); 258 if (ret) { 259 percpu_ref_put(&ca->io_ref); 260 closure_put(&cl); 261 f->ret = ret; 262 bch_err(c, "error starting kthread: %i", ret); 263 break; 264 } 265 } 266 err: 267 closure_sync(&cl); 268 return f->ret ?: ret; 269 } 270 271 static void bubble_up(struct found_btree_node *n, struct found_btree_node *end) 272 { 273 while (n + 1 < end && 274 found_btree_node_cmp_pos(n, n + 1) > 0) { 275 swap(n[0], n[1]); 276 n++; 277 } 278 } 279 280 static int handle_overwrites(struct bch_fs *c, 281 struct found_btree_node *start, 282 struct found_btree_node *end) 283 { 284 struct found_btree_node *n; 285 again: 286 for (n = start + 1; 287 n < end && 288 n->btree_id == start->btree_id && 289 n->level == start->level && 290 bpos_lt(n->min_key, start->max_key); 291 n++) { 292 int cmp = found_btree_node_cmp_time(start, n); 293 294 if (cmp > 0) { 295 if (bpos_cmp(start->max_key, n->max_key) >= 0) 296 n->overwritten = true; 297 else { 298 n->range_updated = true; 299 n->min_key = bpos_successor(start->max_key); 300 n->range_updated = true; 301 bubble_up(n, end); 302 goto again; 303 } 304 } else if (cmp < 0) { 305 BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0); 306 307 start->max_key = bpos_predecessor(n->min_key); 308 start->range_updated = true; 309 } else if (n->level) { 310 n->overwritten = true; 311 } else { 312 struct printbuf buf = PRINTBUF; 313 314 prt_str(&buf, "overlapping btree nodes with same seq! halting\n "); 315 found_btree_node_to_text(&buf, c, start); 316 prt_str(&buf, "\n "); 317 found_btree_node_to_text(&buf, c, n); 318 bch_err(c, "%s", buf.buf); 319 printbuf_exit(&buf); 320 return -BCH_ERR_fsck_repair_unimplemented; 321 } 322 } 323 324 return 0; 325 } 326 327 int bch2_scan_for_btree_nodes(struct bch_fs *c) 328 { 329 struct find_btree_nodes *f = &c->found_btree_nodes; 330 struct printbuf buf = PRINTBUF; 331 size_t dst; 332 int ret = 0; 333 334 if (f->nodes.nr) 335 return 0; 336 337 mutex_init(&f->lock); 338 339 ret = read_btree_nodes(f); 340 if (ret) 341 return ret; 342 343 if (!f->nodes.nr) { 344 bch_err(c, "%s: no btree nodes found", __func__); 345 ret = -EINVAL; 346 goto err; 347 } 348 349 if (0 && c->opts.verbose) { 350 printbuf_reset(&buf); 351 prt_printf(&buf, "%s: nodes found:\n", __func__); 352 found_btree_nodes_to_text(&buf, c, f->nodes); 353 bch2_print_string_as_lines(KERN_INFO, buf.buf); 354 } 355 356 sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL); 357 358 dst = 0; 359 darray_for_each(f->nodes, i) { 360 struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL; 361 362 if (prev && 363 prev->cookie == i->cookie) { 364 if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) { 365 bch_err(c, "%s: found too many replicas for btree node", __func__); 366 ret = -EINVAL; 367 goto err; 368 } 369 prev->ptrs[prev->nr_ptrs++] = i->ptrs[0]; 370 } else { 371 f->nodes.data[dst++] = *i; 372 } 373 } 374 f->nodes.nr = dst; 375 376 sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); 377 378 if (0 && c->opts.verbose) { 379 printbuf_reset(&buf); 380 prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__); 381 found_btree_nodes_to_text(&buf, c, f->nodes); 382 bch2_print_string_as_lines(KERN_INFO, buf.buf); 383 } 384 385 dst = 0; 386 darray_for_each(f->nodes, i) { 387 if (i->overwritten) 388 continue; 389 390 ret = handle_overwrites(c, i, &darray_top(f->nodes)); 391 if (ret) 392 goto err; 393 394 BUG_ON(i->overwritten); 395 f->nodes.data[dst++] = *i; 396 } 397 f->nodes.nr = dst; 398 399 if (c->opts.verbose) { 400 printbuf_reset(&buf); 401 prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__); 402 found_btree_nodes_to_text(&buf, c, f->nodes); 403 bch2_print_string_as_lines(KERN_INFO, buf.buf); 404 } 405 406 eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); 407 err: 408 printbuf_exit(&buf); 409 return ret; 410 } 411 412 static int found_btree_node_range_start_cmp(const void *_l, const void *_r) 413 { 414 const struct found_btree_node *l = _l; 415 const struct found_btree_node *r = _r; 416 417 return cmp_int(l->btree_id, r->btree_id) ?: 418 -cmp_int(l->level, r->level) ?: 419 bpos_cmp(l->max_key, r->min_key); 420 } 421 422 #define for_each_found_btree_node_in_range(_f, _search, _idx) \ 423 for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr, \ 424 sizeof((_f)->nodes.data[0]), \ 425 found_btree_node_range_start_cmp, &search); \ 426 _idx < (_f)->nodes.nr && \ 427 (_f)->nodes.data[_idx].btree_id == _search.btree_id && \ 428 (_f)->nodes.data[_idx].level == _search.level && \ 429 bpos_lt((_f)->nodes.data[_idx].min_key, _search.max_key); \ 430 _idx = eytzinger0_next(_idx, (_f)->nodes.nr)) 431 432 bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b) 433 { 434 struct find_btree_nodes *f = &c->found_btree_nodes; 435 436 struct found_btree_node search = { 437 .btree_id = b->c.btree_id, 438 .level = b->c.level, 439 .min_key = b->data->min_key, 440 .max_key = b->key.k.p, 441 }; 442 443 for_each_found_btree_node_in_range(f, search, idx) 444 if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data)) 445 return true; 446 return false; 447 } 448 449 bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree) 450 { 451 struct found_btree_node search = { 452 .btree_id = btree, 453 .level = 0, 454 .min_key = POS_MIN, 455 .max_key = SPOS_MAX, 456 }; 457 458 for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx) 459 return true; 460 return false; 461 } 462 463 int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, 464 unsigned level, struct bpos node_min, struct bpos node_max) 465 { 466 if (btree_id_is_alloc(btree)) 467 return 0; 468 469 struct find_btree_nodes *f = &c->found_btree_nodes; 470 471 int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); 472 if (ret) 473 return ret; 474 475 if (c->opts.verbose) { 476 struct printbuf buf = PRINTBUF; 477 478 prt_printf(&buf, "recovering %s l=%u ", bch2_btree_id_str(btree), level); 479 bch2_bpos_to_text(&buf, node_min); 480 prt_str(&buf, " - "); 481 bch2_bpos_to_text(&buf, node_max); 482 483 bch_info(c, "%s(): %s", __func__, buf.buf); 484 printbuf_exit(&buf); 485 } 486 487 struct found_btree_node search = { 488 .btree_id = btree, 489 .level = level, 490 .min_key = node_min, 491 .max_key = node_max, 492 }; 493 494 for_each_found_btree_node_in_range(f, search, idx) { 495 struct found_btree_node n = f->nodes.data[idx]; 496 497 n.range_updated |= bpos_lt(n.min_key, node_min); 498 n.min_key = bpos_max(n.min_key, node_min); 499 500 n.range_updated |= bpos_gt(n.max_key, node_max); 501 n.max_key = bpos_min(n.max_key, node_max); 502 503 struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp; 504 505 found_btree_node_to_key(&tmp.k, &n); 506 507 struct printbuf buf = PRINTBUF; 508 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k)); 509 bch_verbose(c, "%s(): recovering %s", __func__, buf.buf); 510 printbuf_exit(&buf); 511 512 BUG_ON(bch2_bkey_invalid(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0, NULL)); 513 514 ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k); 515 if (ret) 516 return ret; 517 } 518 519 return 0; 520 } 521 522 void bch2_find_btree_nodes_exit(struct find_btree_nodes *f) 523 { 524 darray_exit(&f->nodes); 525 } 526