1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "bcachefs.h" 4 #include "btree_cache.h" 5 #include "btree_io.h" 6 #include "btree_journal_iter.h" 7 #include "btree_node_scan.h" 8 #include "btree_update_interior.h" 9 #include "buckets.h" 10 #include "error.h" 11 #include "journal_io.h" 12 #include "recovery_passes.h" 13 14 #include <linux/kthread.h> 15 #include <linux/sort.h> 16 17 struct find_btree_nodes_worker { 18 struct closure *cl; 19 struct find_btree_nodes *f; 20 struct bch_dev *ca; 21 }; 22 23 static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n) 24 { 25 prt_printf(out, "%s l=%u seq=%u journal_seq=%llu cookie=%llx ", 26 bch2_btree_id_str(n->btree_id), n->level, n->seq, 27 n->journal_seq, n->cookie); 28 bch2_bpos_to_text(out, n->min_key); 29 prt_str(out, "-"); 30 bch2_bpos_to_text(out, n->max_key); 31 32 if (n->range_updated) 33 prt_str(out, " range updated"); 34 if (n->overwritten) 35 prt_str(out, " overwritten"); 36 37 for (unsigned i = 0; i < n->nr_ptrs; i++) { 38 prt_char(out, ' '); 39 bch2_extent_ptr_to_text(out, c, n->ptrs + i); 40 } 41 } 42 43 static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes) 44 { 45 printbuf_indent_add(out, 2); 46 darray_for_each(nodes, i) { 47 found_btree_node_to_text(out, c, i); 48 prt_newline(out); 49 } 50 printbuf_indent_sub(out, 2); 51 } 52 53 static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f) 54 { 55 struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k); 56 57 set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs); 58 bp->k.p = f->max_key; 59 bp->v.seq = cpu_to_le64(f->cookie); 60 bp->v.sectors_written = 0; 61 bp->v.flags = 0; 62 bp->v.sectors_written = cpu_to_le16(f->sectors_written); 63 bp->v.min_key = f->min_key; 64 SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated); 65 memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs); 66 } 67 68 static inline u64 bkey_journal_seq(struct bkey_s_c k) 69 { 70 switch (k.k->type) { 71 case KEY_TYPE_inode_v3: 72 return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_journal_seq); 73 default: 74 return 0; 75 } 76 } 77 78 static bool found_btree_node_is_readable(struct btree_trans *trans, 79 struct found_btree_node *f) 80 { 81 struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp; 82 83 found_btree_node_to_key(&tmp.k, f); 84 85 struct btree *b = bch2_btree_node_get_noiter(trans, &tmp.k, f->btree_id, f->level, false); 86 bool ret = !IS_ERR_OR_NULL(b); 87 if (!ret) 88 return ret; 89 90 f->sectors_written = b->written; 91 f->journal_seq = le64_to_cpu(b->data->keys.journal_seq); 92 93 struct bkey_s_c k; 94 struct bkey unpacked; 95 struct btree_node_iter iter; 96 for_each_btree_node_key_unpack(b, k, &iter, &unpacked) 97 f->journal_seq = max(f->journal_seq, bkey_journal_seq(k)); 98 99 six_unlock_read(&b->c.lock); 100 101 /* 102 * We might update this node's range; if that happens, we need the node 103 * to be re-read so the read path can trim keys that are no longer in 104 * this node 105 */ 106 if (b != btree_node_root(trans->c, b)) 107 bch2_btree_node_evict(trans, &tmp.k); 108 return ret; 109 } 110 111 static int found_btree_node_cmp_cookie(const void *_l, const void *_r) 112 { 113 const struct found_btree_node *l = _l; 114 const struct found_btree_node *r = _r; 115 116 return cmp_int(l->btree_id, r->btree_id) ?: 117 cmp_int(l->level, r->level) ?: 118 cmp_int(l->cookie, r->cookie); 119 } 120 121 /* 122 * Given two found btree nodes, if their sequence numbers are equal, take the 123 * one that's readable: 124 */ 125 static int found_btree_node_cmp_time(const struct found_btree_node *l, 126 const struct found_btree_node *r) 127 { 128 return cmp_int(l->seq, r->seq) ?: 129 cmp_int(l->journal_seq, r->journal_seq); 130 } 131 132 static int found_btree_node_cmp_pos(const void *_l, const void *_r) 133 { 134 const struct found_btree_node *l = _l; 135 const struct found_btree_node *r = _r; 136 137 return cmp_int(l->btree_id, r->btree_id) ?: 138 -cmp_int(l->level, r->level) ?: 139 bpos_cmp(l->min_key, r->min_key) ?: 140 -found_btree_node_cmp_time(l, r); 141 } 142 143 static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, 144 struct bio *bio, struct btree_node *bn, u64 offset) 145 { 146 struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes); 147 148 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ); 149 bio->bi_iter.bi_sector = offset; 150 bch2_bio_map(bio, bn, PAGE_SIZE); 151 152 submit_bio_wait(bio); 153 if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read, 154 "IO error in try_read_btree_node() at %llu: %s", 155 offset, bch2_blk_status_to_str(bio->bi_status))) 156 return; 157 158 if (le64_to_cpu(bn->magic) != bset_magic(c)) 159 return; 160 161 if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) { 162 struct nonce nonce = btree_nonce(&bn->keys, 0); 163 unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; 164 165 bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes); 166 } 167 168 if (btree_id_is_alloc(BTREE_NODE_ID(bn))) 169 return; 170 171 if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH) 172 return; 173 174 rcu_read_lock(); 175 struct found_btree_node n = { 176 .btree_id = BTREE_NODE_ID(bn), 177 .level = BTREE_NODE_LEVEL(bn), 178 .seq = BTREE_NODE_SEQ(bn), 179 .cookie = le64_to_cpu(bn->keys.seq), 180 .min_key = bn->min_key, 181 .max_key = bn->max_key, 182 .nr_ptrs = 1, 183 .ptrs[0].type = 1 << BCH_EXTENT_ENTRY_ptr, 184 .ptrs[0].offset = offset, 185 .ptrs[0].dev = ca->dev_idx, 186 .ptrs[0].gen = *bucket_gen(ca, sector_to_bucket(ca, offset)), 187 }; 188 rcu_read_unlock(); 189 190 if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) { 191 mutex_lock(&f->lock); 192 if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) { 193 bch_err(c, "try_read_btree_node() can't handle endian conversion"); 194 f->ret = -EINVAL; 195 goto unlock; 196 } 197 198 if (darray_push(&f->nodes, n)) 199 f->ret = -ENOMEM; 200 unlock: 201 mutex_unlock(&f->lock); 202 } 203 } 204 205 static int read_btree_nodes_worker(void *p) 206 { 207 struct find_btree_nodes_worker *w = p; 208 struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes); 209 struct bch_dev *ca = w->ca; 210 void *buf = (void *) __get_free_page(GFP_KERNEL); 211 struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL); 212 unsigned long last_print = jiffies; 213 214 if (!buf || !bio) { 215 bch_err(c, "read_btree_nodes_worker: error allocating bio/buf"); 216 w->f->ret = -ENOMEM; 217 goto err; 218 } 219 220 for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++) 221 for (unsigned bucket_offset = 0; 222 bucket_offset + btree_sectors(c) <= ca->mi.bucket_size; 223 bucket_offset += btree_sectors(c)) { 224 if (time_after(jiffies, last_print + HZ * 30)) { 225 u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset; 226 u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size; 227 228 bch_info(ca, "%s: %2u%% done", __func__, 229 (unsigned) div64_u64(cur_sector * 100, end_sector)); 230 last_print = jiffies; 231 } 232 233 u64 sector = bucket * ca->mi.bucket_size + bucket_offset; 234 235 if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap && 236 !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c))) 237 continue; 238 239 try_read_btree_node(w->f, ca, bio, buf, sector); 240 } 241 err: 242 bio_put(bio); 243 free_page((unsigned long) buf); 244 percpu_ref_get(&ca->io_ref); 245 closure_put(w->cl); 246 kfree(w); 247 return 0; 248 } 249 250 static int read_btree_nodes(struct find_btree_nodes *f) 251 { 252 struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes); 253 struct closure cl; 254 int ret = 0; 255 256 closure_init_stack(&cl); 257 258 for_each_online_member(c, ca) { 259 if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree))) 260 continue; 261 262 struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL); 263 struct task_struct *t; 264 265 if (!w) { 266 percpu_ref_put(&ca->io_ref); 267 ret = -ENOMEM; 268 goto err; 269 } 270 271 percpu_ref_get(&ca->io_ref); 272 closure_get(&cl); 273 w->cl = &cl; 274 w->f = f; 275 w->ca = ca; 276 277 t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name); 278 ret = IS_ERR_OR_NULL(t); 279 if (ret) { 280 percpu_ref_put(&ca->io_ref); 281 closure_put(&cl); 282 f->ret = ret; 283 bch_err(c, "error starting kthread: %i", ret); 284 break; 285 } 286 } 287 err: 288 closure_sync(&cl); 289 return f->ret ?: ret; 290 } 291 292 static void bubble_up(struct found_btree_node *n, struct found_btree_node *end) 293 { 294 while (n + 1 < end && 295 found_btree_node_cmp_pos(n, n + 1) > 0) { 296 swap(n[0], n[1]); 297 n++; 298 } 299 } 300 301 static int handle_overwrites(struct bch_fs *c, 302 struct found_btree_node *start, 303 struct found_btree_node *end) 304 { 305 struct found_btree_node *n; 306 again: 307 for (n = start + 1; 308 n < end && 309 n->btree_id == start->btree_id && 310 n->level == start->level && 311 bpos_lt(n->min_key, start->max_key); 312 n++) { 313 int cmp = found_btree_node_cmp_time(start, n); 314 315 if (cmp > 0) { 316 if (bpos_cmp(start->max_key, n->max_key) >= 0) 317 n->overwritten = true; 318 else { 319 n->range_updated = true; 320 n->min_key = bpos_successor(start->max_key); 321 n->range_updated = true; 322 bubble_up(n, end); 323 goto again; 324 } 325 } else if (cmp < 0) { 326 BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0); 327 328 start->max_key = bpos_predecessor(n->min_key); 329 start->range_updated = true; 330 } else if (n->level) { 331 n->overwritten = true; 332 } else { 333 if (bpos_cmp(start->max_key, n->max_key) >= 0) 334 n->overwritten = true; 335 else { 336 n->range_updated = true; 337 n->min_key = bpos_successor(start->max_key); 338 n->range_updated = true; 339 bubble_up(n, end); 340 goto again; 341 } 342 } 343 } 344 345 return 0; 346 } 347 348 int bch2_scan_for_btree_nodes(struct bch_fs *c) 349 { 350 struct find_btree_nodes *f = &c->found_btree_nodes; 351 struct printbuf buf = PRINTBUF; 352 size_t dst; 353 int ret = 0; 354 355 if (f->nodes.nr) 356 return 0; 357 358 mutex_init(&f->lock); 359 360 ret = read_btree_nodes(f); 361 if (ret) 362 return ret; 363 364 if (!f->nodes.nr) { 365 bch_err(c, "%s: no btree nodes found", __func__); 366 ret = -EINVAL; 367 goto err; 368 } 369 370 if (0 && c->opts.verbose) { 371 printbuf_reset(&buf); 372 prt_printf(&buf, "%s: nodes found:\n", __func__); 373 found_btree_nodes_to_text(&buf, c, f->nodes); 374 bch2_print_string_as_lines(KERN_INFO, buf.buf); 375 } 376 377 sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL); 378 379 dst = 0; 380 darray_for_each(f->nodes, i) { 381 struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL; 382 383 if (prev && 384 prev->cookie == i->cookie) { 385 if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) { 386 bch_err(c, "%s: found too many replicas for btree node", __func__); 387 ret = -EINVAL; 388 goto err; 389 } 390 prev->ptrs[prev->nr_ptrs++] = i->ptrs[0]; 391 } else { 392 f->nodes.data[dst++] = *i; 393 } 394 } 395 f->nodes.nr = dst; 396 397 sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); 398 399 if (0 && c->opts.verbose) { 400 printbuf_reset(&buf); 401 prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__); 402 found_btree_nodes_to_text(&buf, c, f->nodes); 403 bch2_print_string_as_lines(KERN_INFO, buf.buf); 404 } 405 406 dst = 0; 407 darray_for_each(f->nodes, i) { 408 if (i->overwritten) 409 continue; 410 411 ret = handle_overwrites(c, i, &darray_top(f->nodes)); 412 if (ret) 413 goto err; 414 415 BUG_ON(i->overwritten); 416 f->nodes.data[dst++] = *i; 417 } 418 f->nodes.nr = dst; 419 420 if (c->opts.verbose) { 421 printbuf_reset(&buf); 422 prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__); 423 found_btree_nodes_to_text(&buf, c, f->nodes); 424 bch2_print_string_as_lines(KERN_INFO, buf.buf); 425 } 426 427 eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); 428 err: 429 printbuf_exit(&buf); 430 return ret; 431 } 432 433 static int found_btree_node_range_start_cmp(const void *_l, const void *_r) 434 { 435 const struct found_btree_node *l = _l; 436 const struct found_btree_node *r = _r; 437 438 return cmp_int(l->btree_id, r->btree_id) ?: 439 -cmp_int(l->level, r->level) ?: 440 bpos_cmp(l->max_key, r->min_key); 441 } 442 443 #define for_each_found_btree_node_in_range(_f, _search, _idx) \ 444 for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr, \ 445 sizeof((_f)->nodes.data[0]), \ 446 found_btree_node_range_start_cmp, &search); \ 447 _idx < (_f)->nodes.nr && \ 448 (_f)->nodes.data[_idx].btree_id == _search.btree_id && \ 449 (_f)->nodes.data[_idx].level == _search.level && \ 450 bpos_lt((_f)->nodes.data[_idx].min_key, _search.max_key); \ 451 _idx = eytzinger0_next(_idx, (_f)->nodes.nr)) 452 453 bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b) 454 { 455 struct find_btree_nodes *f = &c->found_btree_nodes; 456 457 struct found_btree_node search = { 458 .btree_id = b->c.btree_id, 459 .level = b->c.level, 460 .min_key = b->data->min_key, 461 .max_key = b->key.k.p, 462 }; 463 464 for_each_found_btree_node_in_range(f, search, idx) 465 if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data)) 466 return true; 467 return false; 468 } 469 470 bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree) 471 { 472 struct found_btree_node search = { 473 .btree_id = btree, 474 .level = 0, 475 .min_key = POS_MIN, 476 .max_key = SPOS_MAX, 477 }; 478 479 for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx) 480 return true; 481 return false; 482 } 483 484 int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, 485 unsigned level, struct bpos node_min, struct bpos node_max) 486 { 487 if (btree_id_is_alloc(btree)) 488 return 0; 489 490 struct find_btree_nodes *f = &c->found_btree_nodes; 491 492 int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); 493 if (ret) 494 return ret; 495 496 if (c->opts.verbose) { 497 struct printbuf buf = PRINTBUF; 498 499 prt_printf(&buf, "recovering %s l=%u ", bch2_btree_id_str(btree), level); 500 bch2_bpos_to_text(&buf, node_min); 501 prt_str(&buf, " - "); 502 bch2_bpos_to_text(&buf, node_max); 503 504 bch_info(c, "%s(): %s", __func__, buf.buf); 505 printbuf_exit(&buf); 506 } 507 508 struct found_btree_node search = { 509 .btree_id = btree, 510 .level = level, 511 .min_key = node_min, 512 .max_key = node_max, 513 }; 514 515 for_each_found_btree_node_in_range(f, search, idx) { 516 struct found_btree_node n = f->nodes.data[idx]; 517 518 n.range_updated |= bpos_lt(n.min_key, node_min); 519 n.min_key = bpos_max(n.min_key, node_min); 520 521 n.range_updated |= bpos_gt(n.max_key, node_max); 522 n.max_key = bpos_min(n.max_key, node_max); 523 524 struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp; 525 526 found_btree_node_to_key(&tmp.k, &n); 527 528 struct printbuf buf = PRINTBUF; 529 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k)); 530 bch_verbose(c, "%s(): recovering %s", __func__, buf.buf); 531 printbuf_exit(&buf); 532 533 BUG_ON(bch2_bkey_invalid(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0, NULL)); 534 535 ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k); 536 if (ret) 537 return ret; 538 } 539 540 return 0; 541 } 542 543 void bch2_find_btree_nodes_exit(struct find_btree_nodes *f) 544 { 545 darray_exit(&f->nodes); 546 } 547