1 // SPDX-License-Identifier: GPL-2.0 2 #include "bcachefs.h" 3 #include "bbpos.h" 4 #include "alloc_background.h" 5 #include "backpointers.h" 6 #include "bkey_buf.h" 7 #include "btree_cache.h" 8 #include "btree_update.h" 9 #include "btree_update_interior.h" 10 #include "btree_write_buffer.h" 11 #include "checksum.h" 12 #include "disk_accounting.h" 13 #include "error.h" 14 15 #include <linux/mm.h> 16 17 int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, 18 struct bkey_validate_context from) 19 { 20 struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); 21 int ret = 0; 22 23 bkey_fsck_err_on(bp.v->level > BTREE_MAX_DEPTH, 24 c, backpointer_level_bad, 25 "backpointer level bad: %u >= %u", 26 bp.v->level, BTREE_MAX_DEPTH); 27 28 bkey_fsck_err_on(bp.k->p.inode == BCH_SB_MEMBER_INVALID, 29 c, backpointer_dev_bad, 30 "backpointer for BCH_SB_MEMBER_INVALID"); 31 fsck_err: 32 return ret; 33 } 34 35 void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) 36 { 37 struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); 38 39 rcu_read_lock(); 40 struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode); 41 if (ca) { 42 u32 bucket_offset; 43 struct bpos bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset); 44 rcu_read_unlock(); 45 prt_printf(out, "bucket=%llu:%llu:%u ", bucket.inode, bucket.offset, bucket_offset); 46 } else { 47 rcu_read_unlock(); 48 prt_printf(out, "sector=%llu:%llu ", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT); 49 } 50 51 bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level); 52 prt_printf(out, " suboffset=%u len=%u gen=%u pos=", 53 (u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), 54 bp.v->bucket_len, 55 bp.v->bucket_gen); 56 bch2_bpos_to_text(out, bp.v->pos); 57 } 58 59 void bch2_backpointer_swab(struct bkey_s k) 60 { 61 struct bkey_s_backpointer bp = bkey_s_to_backpointer(k); 62 63 bp.v->bucket_len = swab32(bp.v->bucket_len); 64 bch2_bpos_swab(&bp.v->pos); 65 } 66 67 static bool extent_matches_bp(struct bch_fs *c, 68 enum btree_id btree_id, unsigned level, 69 struct bkey_s_c k, 70 struct bkey_s_c_backpointer bp) 71 { 72 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 73 const union bch_extent_entry *entry; 74 struct extent_ptr_decoded p; 75 76 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { 77 struct bkey_i_backpointer bp2; 78 bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp2); 79 80 if (bpos_eq(bp.k->p, bp2.k.p) && 81 !memcmp(bp.v, &bp2.v, sizeof(bp2.v))) 82 return true; 83 } 84 85 return false; 86 } 87 88 static noinline int backpointer_mod_err(struct btree_trans *trans, 89 struct bkey_s_c orig_k, 90 struct bkey_i_backpointer *new_bp, 91 struct bkey_s_c found_bp, 92 bool insert) 93 { 94 struct bch_fs *c = trans->c; 95 struct printbuf buf = PRINTBUF; 96 97 if (insert) { 98 prt_printf(&buf, "existing backpointer found when inserting "); 99 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i)); 100 prt_newline(&buf); 101 printbuf_indent_add(&buf, 2); 102 103 prt_printf(&buf, "found "); 104 bch2_bkey_val_to_text(&buf, c, found_bp); 105 prt_newline(&buf); 106 107 prt_printf(&buf, "for "); 108 bch2_bkey_val_to_text(&buf, c, orig_k); 109 110 bch_err(c, "%s", buf.buf); 111 } else if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { 112 prt_printf(&buf, "backpointer not found when deleting\n"); 113 printbuf_indent_add(&buf, 2); 114 115 prt_printf(&buf, "searching for "); 116 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i)); 117 prt_newline(&buf); 118 119 prt_printf(&buf, "got "); 120 bch2_bkey_val_to_text(&buf, c, found_bp); 121 prt_newline(&buf); 122 123 prt_printf(&buf, "for "); 124 bch2_bkey_val_to_text(&buf, c, orig_k); 125 126 bch_err(c, "%s", buf.buf); 127 } 128 129 printbuf_exit(&buf); 130 131 if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { 132 return bch2_inconsistent_error(c) ? BCH_ERR_erofs_unfixed_errors : 0; 133 } else { 134 return 0; 135 } 136 } 137 138 int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, 139 struct bkey_s_c orig_k, 140 struct bkey_i_backpointer *bp, 141 bool insert) 142 { 143 struct btree_iter bp_iter; 144 struct bkey_s_c k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, 145 bp->k.p, 146 BTREE_ITER_intent| 147 BTREE_ITER_slots| 148 BTREE_ITER_with_updates); 149 int ret = bkey_err(k); 150 if (ret) 151 return ret; 152 153 if (insert 154 ? k.k->type 155 : (k.k->type != KEY_TYPE_backpointer || 156 memcmp(bkey_s_c_to_backpointer(k).v, &bp->v, sizeof(bp->v)))) { 157 ret = backpointer_mod_err(trans, orig_k, bp, k, insert); 158 if (ret) 159 goto err; 160 } 161 162 if (!insert) { 163 bp->k.type = KEY_TYPE_deleted; 164 set_bkey_val_u64s(&bp->k, 0); 165 } 166 167 ret = bch2_trans_update(trans, &bp_iter, &bp->k_i, 0); 168 err: 169 bch2_trans_iter_exit(trans, &bp_iter); 170 return ret; 171 } 172 173 static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos) 174 { 175 return (likely(!bch2_backpointers_no_use_write_buffer) 176 ? bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, pos) 177 : bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0)) ?: 178 bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); 179 } 180 181 static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans, 182 struct bkey_s_c visiting_k, 183 struct bkey_buf *last_flushed) 184 { 185 return likely(!bch2_backpointers_no_use_write_buffer) 186 ? bch2_btree_write_buffer_maybe_flush(trans, visiting_k, last_flushed) 187 : 0; 188 } 189 190 static int backpointer_target_not_found(struct btree_trans *trans, 191 struct bkey_s_c_backpointer bp, 192 struct bkey_s_c target_k, 193 struct bkey_buf *last_flushed) 194 { 195 struct bch_fs *c = trans->c; 196 struct printbuf buf = PRINTBUF; 197 int ret = 0; 198 199 /* 200 * If we're using the btree write buffer, the backpointer we were 201 * looking at may have already been deleted - failure to find what it 202 * pointed to is not an error: 203 */ 204 ret = last_flushed 205 ? bch2_backpointers_maybe_flush(trans, bp.s_c, last_flushed) 206 : 0; 207 if (ret) 208 return ret; 209 210 prt_printf(&buf, "backpointer doesn't match %s it points to:\n ", 211 bp.v->level ? "btree node" : "extent"); 212 bch2_bkey_val_to_text(&buf, c, bp.s_c); 213 214 prt_printf(&buf, "\n "); 215 bch2_bkey_val_to_text(&buf, c, target_k); 216 217 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(target_k); 218 const union bch_extent_entry *entry; 219 struct extent_ptr_decoded p; 220 bkey_for_each_ptr_decode(target_k.k, ptrs, p, entry) 221 if (p.ptr.dev == bp.k->p.inode) { 222 prt_printf(&buf, "\n "); 223 struct bkey_i_backpointer bp2; 224 bch2_extent_ptr_to_bp(c, bp.v->btree_id, bp.v->level, target_k, p, entry, &bp2); 225 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp2.k_i)); 226 } 227 228 if (fsck_err(trans, backpointer_to_missing_ptr, 229 "%s", buf.buf)) 230 ret = bch2_backpointer_del(trans, bp.k->p); 231 fsck_err: 232 printbuf_exit(&buf); 233 return ret; 234 } 235 236 struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, 237 struct bkey_s_c_backpointer bp, 238 struct btree_iter *iter, 239 unsigned iter_flags, 240 struct bkey_buf *last_flushed) 241 { 242 struct bch_fs *c = trans->c; 243 244 if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c))) 245 return bkey_s_c_null; 246 247 if (likely(!bp.v->level)) { 248 bch2_trans_node_iter_init(trans, iter, 249 bp.v->btree_id, 250 bp.v->pos, 251 0, 0, 252 iter_flags); 253 struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); 254 if (bkey_err(k)) { 255 bch2_trans_iter_exit(trans, iter); 256 return k; 257 } 258 259 if (k.k && 260 extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp)) 261 return k; 262 263 bch2_trans_iter_exit(trans, iter); 264 int ret = backpointer_target_not_found(trans, bp, k, last_flushed); 265 return ret ? bkey_s_c_err(ret) : bkey_s_c_null; 266 } else { 267 struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed); 268 if (IS_ERR_OR_NULL(b)) 269 return ((struct bkey_s_c) { .k = ERR_CAST(b) }); 270 271 return bkey_i_to_s_c(&b->key); 272 } 273 } 274 275 struct btree *bch2_backpointer_get_node(struct btree_trans *trans, 276 struct bkey_s_c_backpointer bp, 277 struct btree_iter *iter, 278 struct bkey_buf *last_flushed) 279 { 280 struct bch_fs *c = trans->c; 281 282 BUG_ON(!bp.v->level); 283 284 bch2_trans_node_iter_init(trans, iter, 285 bp.v->btree_id, 286 bp.v->pos, 287 0, 288 bp.v->level - 1, 289 0); 290 struct btree *b = bch2_btree_iter_peek_node(iter); 291 if (IS_ERR_OR_NULL(b)) 292 goto err; 293 294 BUG_ON(b->c.level != bp.v->level - 1); 295 296 if (extent_matches_bp(c, bp.v->btree_id, bp.v->level, 297 bkey_i_to_s_c(&b->key), bp)) 298 return b; 299 300 if (btree_node_will_make_reachable(b)) { 301 b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); 302 } else { 303 int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), last_flushed); 304 b = ret ? ERR_PTR(ret) : NULL; 305 } 306 err: 307 bch2_trans_iter_exit(trans, iter); 308 return b; 309 } 310 311 static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, struct bkey_s_c k, 312 struct bkey_buf *last_flushed) 313 { 314 if (k.k->type != KEY_TYPE_backpointer) 315 return 0; 316 317 struct bch_fs *c = trans->c; 318 struct btree_iter alloc_iter = { NULL }; 319 struct bkey_s_c alloc_k; 320 struct printbuf buf = PRINTBUF; 321 int ret = 0; 322 323 struct bpos bucket; 324 if (!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) { 325 ret = bch2_backpointers_maybe_flush(trans, k, last_flushed); 326 if (ret) 327 goto out; 328 329 if (fsck_err(trans, backpointer_to_missing_device, 330 "backpointer for missing device:\n%s", 331 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) 332 ret = bch2_backpointer_del(trans, k.k->p); 333 goto out; 334 } 335 336 alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, bucket, 0); 337 ret = bkey_err(alloc_k); 338 if (ret) 339 goto out; 340 341 if (alloc_k.k->type != KEY_TYPE_alloc_v4) { 342 ret = bch2_backpointers_maybe_flush(trans, k, last_flushed); 343 if (ret) 344 goto out; 345 346 if (fsck_err(trans, backpointer_to_missing_alloc, 347 "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", 348 alloc_iter.pos.inode, alloc_iter.pos.offset, 349 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) 350 ret = bch2_backpointer_del(trans, k.k->p); 351 } 352 out: 353 fsck_err: 354 bch2_trans_iter_exit(trans, &alloc_iter); 355 printbuf_exit(&buf); 356 return ret; 357 } 358 359 /* verify that every backpointer has a corresponding alloc key */ 360 int bch2_check_btree_backpointers(struct bch_fs *c) 361 { 362 struct bkey_buf last_flushed; 363 bch2_bkey_buf_init(&last_flushed); 364 bkey_init(&last_flushed.k->k); 365 366 int ret = bch2_trans_run(c, 367 for_each_btree_key_commit(trans, iter, 368 BTREE_ID_backpointers, POS_MIN, 0, k, 369 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 370 bch2_check_backpointer_has_valid_bucket(trans, k, &last_flushed))); 371 372 bch2_bkey_buf_exit(&last_flushed, c); 373 bch_err_fn(c, ret); 374 return ret; 375 } 376 377 struct extents_to_bp_state { 378 struct bpos bp_start; 379 struct bpos bp_end; 380 struct bkey_buf last_flushed; 381 }; 382 383 static int drop_dev_and_update(struct btree_trans *trans, enum btree_id btree, 384 struct bkey_s_c extent, unsigned dev) 385 { 386 struct bkey_i *n = bch2_bkey_make_mut_noupdate(trans, extent); 387 int ret = PTR_ERR_OR_ZERO(n); 388 if (ret) 389 return ret; 390 391 bch2_bkey_drop_device(bkey_i_to_s(n), dev); 392 return bch2_btree_insert_trans(trans, btree, n, 0); 393 } 394 395 static int check_extent_checksum(struct btree_trans *trans, 396 enum btree_id btree, struct bkey_s_c extent, 397 enum btree_id o_btree, struct bkey_s_c extent2, unsigned dev) 398 { 399 struct bch_fs *c = trans->c; 400 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(extent); 401 const union bch_extent_entry *entry; 402 struct extent_ptr_decoded p; 403 struct printbuf buf = PRINTBUF; 404 void *data_buf = NULL; 405 struct bio *bio = NULL; 406 size_t bytes; 407 int ret = 0; 408 409 if (bkey_is_btree_ptr(extent.k)) 410 return false; 411 412 bkey_for_each_ptr_decode(extent.k, ptrs, p, entry) 413 if (p.ptr.dev == dev) 414 goto found; 415 BUG(); 416 found: 417 if (!p.crc.csum_type) 418 return false; 419 420 bytes = p.crc.compressed_size << 9; 421 422 struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ); 423 if (!ca) 424 return false; 425 426 data_buf = kvmalloc(bytes, GFP_KERNEL); 427 if (!data_buf) { 428 ret = -ENOMEM; 429 goto err; 430 } 431 432 bio = bio_alloc(ca->disk_sb.bdev, buf_pages(data_buf, bytes), REQ_OP_READ, GFP_KERNEL); 433 bio->bi_iter.bi_sector = p.ptr.offset; 434 bch2_bio_map(bio, data_buf, bytes); 435 ret = submit_bio_wait(bio); 436 if (ret) 437 goto err; 438 439 prt_str(&buf, "extents pointing to same space, but first extent checksum bad:"); 440 prt_printf(&buf, "\n "); 441 bch2_btree_id_to_text(&buf, btree); 442 prt_str(&buf, " "); 443 bch2_bkey_val_to_text(&buf, c, extent); 444 prt_printf(&buf, "\n "); 445 bch2_btree_id_to_text(&buf, o_btree); 446 prt_str(&buf, " "); 447 bch2_bkey_val_to_text(&buf, c, extent2); 448 449 struct nonce nonce = extent_nonce(extent.k->bversion, p.crc); 450 struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes); 451 if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum), 452 trans, dup_backpointer_to_bad_csum_extent, 453 "%s", buf.buf)) 454 ret = drop_dev_and_update(trans, btree, extent, dev) ?: 1; 455 fsck_err: 456 err: 457 if (bio) 458 bio_put(bio); 459 kvfree(data_buf); 460 percpu_ref_put(&ca->io_ref); 461 printbuf_exit(&buf); 462 return ret; 463 } 464 465 static int check_bp_exists(struct btree_trans *trans, 466 struct extents_to_bp_state *s, 467 struct bkey_i_backpointer *bp, 468 struct bkey_s_c orig_k) 469 { 470 struct bch_fs *c = trans->c; 471 struct btree_iter other_extent_iter = {}; 472 struct printbuf buf = PRINTBUF; 473 474 if (bpos_lt(bp->k.p, s->bp_start) || 475 bpos_gt(bp->k.p, s->bp_end)) 476 return 0; 477 478 struct btree_iter bp_iter; 479 struct bkey_s_c bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp->k.p, 0); 480 int ret = bkey_err(bp_k); 481 if (ret) 482 goto err; 483 484 if (bp_k.k->type != KEY_TYPE_backpointer || 485 memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp->v, sizeof(bp->v))) { 486 ret = bch2_btree_write_buffer_maybe_flush(trans, orig_k, &s->last_flushed); 487 if (ret) 488 goto err; 489 490 goto check_existing_bp; 491 } 492 out: 493 err: 494 fsck_err: 495 bch2_trans_iter_exit(trans, &other_extent_iter); 496 bch2_trans_iter_exit(trans, &bp_iter); 497 printbuf_exit(&buf); 498 return ret; 499 check_existing_bp: 500 /* Do we have a backpointer for a different extent? */ 501 if (bp_k.k->type != KEY_TYPE_backpointer) 502 goto missing; 503 504 struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k); 505 506 struct bkey_s_c other_extent = 507 bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL); 508 ret = bkey_err(other_extent); 509 if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) 510 ret = 0; 511 if (ret) 512 goto err; 513 514 if (!other_extent.k) 515 goto missing; 516 517 if (bch2_extents_match(orig_k, other_extent)) { 518 printbuf_reset(&buf); 519 prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n "); 520 bch2_bkey_val_to_text(&buf, c, orig_k); 521 prt_str(&buf, "\n "); 522 bch2_bkey_val_to_text(&buf, c, other_extent); 523 bch_err(c, "%s", buf.buf); 524 525 if (other_extent.k->size <= orig_k.k->size) { 526 ret = drop_dev_and_update(trans, other_bp.v->btree_id, 527 other_extent, bp->k.p.inode); 528 if (ret) 529 goto err; 530 goto out; 531 } else { 532 ret = drop_dev_and_update(trans, bp->v.btree_id, orig_k, bp->k.p.inode); 533 if (ret) 534 goto err; 535 goto missing; 536 } 537 } 538 539 ret = check_extent_checksum(trans, 540 other_bp.v->btree_id, other_extent, 541 bp->v.btree_id, orig_k, 542 bp->k.p.inode); 543 if (ret < 0) 544 goto err; 545 if (ret) { 546 ret = 0; 547 goto missing; 548 } 549 550 ret = check_extent_checksum(trans, bp->v.btree_id, orig_k, 551 other_bp.v->btree_id, other_extent, bp->k.p.inode); 552 if (ret < 0) 553 goto err; 554 if (ret) { 555 ret = 0; 556 goto out; 557 } 558 559 printbuf_reset(&buf); 560 prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bp->k.p.inode); 561 bch2_bkey_val_to_text(&buf, c, orig_k); 562 prt_str(&buf, "\n "); 563 bch2_bkey_val_to_text(&buf, c, other_extent); 564 bch_err(c, "%s", buf.buf); 565 ret = -BCH_ERR_fsck_repair_unimplemented; 566 goto err; 567 missing: 568 printbuf_reset(&buf); 569 prt_str(&buf, "missing backpointer\n for: "); 570 bch2_bkey_val_to_text(&buf, c, orig_k); 571 prt_printf(&buf, "\n want: "); 572 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp->k_i)); 573 prt_printf(&buf, "\n got: "); 574 bch2_bkey_val_to_text(&buf, c, bp_k); 575 576 if (fsck_err(trans, ptr_to_missing_backpointer, "%s", buf.buf)) 577 ret = bch2_bucket_backpointer_mod(trans, orig_k, bp, true); 578 579 goto out; 580 } 581 582 static int check_extent_to_backpointers(struct btree_trans *trans, 583 struct extents_to_bp_state *s, 584 enum btree_id btree, unsigned level, 585 struct bkey_s_c k) 586 { 587 struct bch_fs *c = trans->c; 588 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 589 const union bch_extent_entry *entry; 590 struct extent_ptr_decoded p; 591 592 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { 593 if (p.ptr.cached) 594 continue; 595 596 if (p.ptr.dev == BCH_SB_MEMBER_INVALID) 597 continue; 598 599 rcu_read_lock(); 600 struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); 601 bool check = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_mismatches); 602 bool empty = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_empty); 603 rcu_read_unlock(); 604 605 if (check || empty) { 606 struct bkey_i_backpointer bp; 607 bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp); 608 609 int ret = check 610 ? check_bp_exists(trans, s, &bp, k) 611 : bch2_bucket_backpointer_mod(trans, k, &bp, true); 612 if (ret) 613 return ret; 614 } 615 } 616 617 return 0; 618 } 619 620 static int check_btree_root_to_backpointers(struct btree_trans *trans, 621 struct extents_to_bp_state *s, 622 enum btree_id btree_id, 623 int *level) 624 { 625 struct bch_fs *c = trans->c; 626 struct btree_iter iter; 627 struct btree *b; 628 struct bkey_s_c k; 629 int ret; 630 retry: 631 bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 632 0, bch2_btree_id_root(c, btree_id)->b->c.level, 0); 633 b = bch2_btree_iter_peek_node(&iter); 634 ret = PTR_ERR_OR_ZERO(b); 635 if (ret) 636 goto err; 637 638 if (b != btree_node_root(c, b)) { 639 bch2_trans_iter_exit(trans, &iter); 640 goto retry; 641 } 642 643 *level = b->c.level; 644 645 k = bkey_i_to_s_c(&b->key); 646 ret = check_extent_to_backpointers(trans, s, btree_id, b->c.level + 1, k); 647 err: 648 bch2_trans_iter_exit(trans, &iter); 649 return ret; 650 } 651 652 static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp) 653 { 654 return (struct bbpos) { 655 .btree = bp.btree_id, 656 .pos = bp.pos, 657 }; 658 } 659 660 static u64 mem_may_pin_bytes(struct bch_fs *c) 661 { 662 struct sysinfo i; 663 si_meminfo(&i); 664 665 u64 mem_bytes = i.totalram * i.mem_unit; 666 return div_u64(mem_bytes * c->opts.fsck_memory_usage_percent, 100); 667 } 668 669 static size_t btree_nodes_fit_in_ram(struct bch_fs *c) 670 { 671 return div_u64(mem_may_pin_bytes(c), c->opts.btree_node_size); 672 } 673 674 static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, 675 u64 btree_leaf_mask, 676 u64 btree_interior_mask, 677 struct bbpos start, struct bbpos *end) 678 { 679 struct bch_fs *c = trans->c; 680 s64 mem_may_pin = mem_may_pin_bytes(c); 681 int ret = 0; 682 683 bch2_btree_cache_unpin(c); 684 685 btree_interior_mask |= btree_leaf_mask; 686 687 c->btree_cache.pinned_nodes_mask[0] = btree_leaf_mask; 688 c->btree_cache.pinned_nodes_mask[1] = btree_interior_mask; 689 c->btree_cache.pinned_nodes_start = start; 690 c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX; 691 692 for (enum btree_id btree = start.btree; 693 btree < BTREE_ID_NR && !ret; 694 btree++) { 695 unsigned depth = (BIT_ULL(btree) & btree_leaf_mask) ? 0 : 1; 696 697 if (!(BIT_ULL(btree) & btree_leaf_mask) && 698 !(BIT_ULL(btree) & btree_interior_mask)) 699 continue; 700 701 ret = __for_each_btree_node(trans, iter, btree, 702 btree == start.btree ? start.pos : POS_MIN, 703 0, depth, BTREE_ITER_prefetch, b, ({ 704 mem_may_pin -= btree_buf_bytes(b); 705 if (mem_may_pin <= 0) { 706 c->btree_cache.pinned_nodes_end = *end = 707 BBPOS(btree, b->key.k.p); 708 break; 709 } 710 bch2_node_pin(c, b); 711 0; 712 })); 713 } 714 715 return ret; 716 } 717 718 struct progress_indicator_state { 719 unsigned long next_print; 720 u64 nodes_seen; 721 u64 nodes_total; 722 struct btree *last_node; 723 }; 724 725 static inline void progress_init(struct progress_indicator_state *s, 726 struct bch_fs *c, 727 u64 btree_id_mask) 728 { 729 memset(s, 0, sizeof(*s)); 730 731 s->next_print = jiffies + HZ * 10; 732 733 for (unsigned i = 0; i < BTREE_ID_NR; i++) { 734 if (!(btree_id_mask & BIT_ULL(i))) 735 continue; 736 737 struct disk_accounting_pos acc = { 738 .type = BCH_DISK_ACCOUNTING_btree, 739 .btree.id = i, 740 }; 741 742 u64 v; 743 bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1); 744 s->nodes_total += div64_ul(v, btree_sectors(c)); 745 } 746 } 747 748 static inline bool progress_update_p(struct progress_indicator_state *s) 749 { 750 bool ret = time_after_eq(jiffies, s->next_print); 751 752 if (ret) 753 s->next_print = jiffies + HZ * 10; 754 return ret; 755 } 756 757 static void progress_update_iter(struct btree_trans *trans, 758 struct progress_indicator_state *s, 759 struct btree_iter *iter, 760 const char *msg) 761 { 762 struct bch_fs *c = trans->c; 763 struct btree *b = path_l(btree_iter_path(trans, iter))->b; 764 765 s->nodes_seen += b != s->last_node; 766 s->last_node = b; 767 768 if (progress_update_p(s)) { 769 struct printbuf buf = PRINTBUF; 770 unsigned percent = s->nodes_total 771 ? div64_u64(s->nodes_seen * 100, s->nodes_total) 772 : 0; 773 774 prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ", 775 msg, percent, s->nodes_seen, s->nodes_total); 776 bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos)); 777 778 bch_info(c, "%s", buf.buf); 779 printbuf_exit(&buf); 780 } 781 } 782 783 static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, 784 struct extents_to_bp_state *s) 785 { 786 struct bch_fs *c = trans->c; 787 struct progress_indicator_state progress; 788 int ret = 0; 789 790 progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink)); 791 792 for (enum btree_id btree_id = 0; 793 btree_id < btree_id_nr_alive(c); 794 btree_id++) { 795 int level, depth = btree_type_has_ptrs(btree_id) ? 0 : 1; 796 797 ret = commit_do(trans, NULL, NULL, 798 BCH_TRANS_COMMIT_no_enospc, 799 check_btree_root_to_backpointers(trans, s, btree_id, &level)); 800 if (ret) 801 return ret; 802 803 while (level >= depth) { 804 struct btree_iter iter; 805 bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, level, 806 BTREE_ITER_prefetch); 807 808 ret = for_each_btree_key_continue(trans, iter, 0, k, ({ 809 progress_update_iter(trans, &progress, &iter, "extents_to_backpointers"); 810 check_extent_to_backpointers(trans, s, btree_id, level, k) ?: 811 bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); 812 })); 813 if (ret) 814 return ret; 815 816 --level; 817 } 818 } 819 820 return 0; 821 } 822 823 enum alloc_sector_counter { 824 ALLOC_dirty, 825 ALLOC_cached, 826 ALLOC_stripe, 827 ALLOC_SECTORS_NR 828 }; 829 830 static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t) 831 { 832 switch (t) { 833 case BCH_DATA_btree: 834 case BCH_DATA_user: 835 return ALLOC_dirty; 836 case BCH_DATA_cached: 837 return ALLOC_cached; 838 case BCH_DATA_stripe: 839 return ALLOC_stripe; 840 default: 841 BUG(); 842 } 843 } 844 845 static int check_bucket_backpointers_to_extents(struct btree_trans *, struct bch_dev *, struct bpos); 846 847 static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct bkey_s_c alloc_k, 848 struct bkey_buf *last_flushed) 849 { 850 struct bch_fs *c = trans->c; 851 struct bch_alloc_v4 a_convert; 852 const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); 853 bool need_commit = false; 854 855 if (a->data_type == BCH_DATA_sb || 856 a->data_type == BCH_DATA_journal || 857 a->data_type == BCH_DATA_parity) 858 return 0; 859 860 u32 sectors[ALLOC_SECTORS_NR]; 861 memset(sectors, 0, sizeof(sectors)); 862 863 struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(trans->c, alloc_k.k->p); 864 if (!ca) 865 return 0; 866 867 struct btree_iter iter; 868 struct bkey_s_c bp_k; 869 int ret = 0; 870 for_each_btree_key_max_norestart(trans, iter, BTREE_ID_backpointers, 871 bucket_pos_to_bp_start(ca, alloc_k.k->p), 872 bucket_pos_to_bp_end(ca, alloc_k.k->p), 0, bp_k, ret) { 873 if (bp_k.k->type != KEY_TYPE_backpointer) 874 continue; 875 876 struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); 877 878 if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen && 879 (bp.v->bucket_gen != a->gen || 880 bp.v->pad)) { 881 ret = bch2_backpointer_del(trans, bp_k.k->p); 882 if (ret) 883 break; 884 885 need_commit = true; 886 continue; 887 } 888 889 if (bp.v->bucket_gen != a->gen) 890 continue; 891 892 sectors[data_type_to_alloc_counter(bp.v->data_type)] += bp.v->bucket_len; 893 }; 894 bch2_trans_iter_exit(trans, &iter); 895 if (ret) 896 goto err; 897 898 if (need_commit) { 899 ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); 900 if (ret) 901 goto err; 902 } 903 904 /* Cached pointers don't have backpointers: */ 905 906 if (sectors[ALLOC_dirty] != a->dirty_sectors || 907 sectors[ALLOC_stripe] != a->stripe_sectors) { 908 if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) { 909 ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed); 910 if (ret) 911 goto err; 912 } 913 914 if (sectors[ALLOC_dirty] > a->dirty_sectors || 915 sectors[ALLOC_stripe] > a->stripe_sectors) { 916 ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?: 917 -BCH_ERR_transaction_restart_nested; 918 goto err; 919 } 920 921 if (!sectors[ALLOC_dirty] && 922 !sectors[ALLOC_stripe]) 923 __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_empty); 924 else 925 __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_mismatches); 926 } 927 err: 928 bch2_dev_put(ca); 929 return ret; 930 } 931 932 static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k) 933 { 934 switch (k.k->type) { 935 case KEY_TYPE_btree_ptr_v2: { 936 bool ret = false; 937 938 rcu_read_lock(); 939 struct bpos pos = bkey_s_c_to_btree_ptr_v2(k).v->min_key; 940 while (pos.inode <= k.k->p.inode) { 941 if (pos.inode >= c->sb.nr_devices) 942 break; 943 944 struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode); 945 if (!ca) 946 goto next; 947 948 struct bpos bucket = bp_pos_to_bucket(ca, pos); 949 bucket.offset = find_next_bit(ca->bucket_backpointer_mismatches, 950 ca->mi.nbuckets, bucket.offset); 951 if (bucket.offset == ca->mi.nbuckets) 952 goto next; 953 954 ret = bpos_le(bucket_pos_to_bp_end(ca, bucket), k.k->p); 955 if (ret) 956 break; 957 next: 958 pos = SPOS(pos.inode + 1, 0, 0); 959 } 960 rcu_read_unlock(); 961 962 return ret; 963 } 964 case KEY_TYPE_btree_ptr: 965 return true; 966 default: 967 return false; 968 } 969 } 970 971 static int btree_node_get_and_pin(struct btree_trans *trans, struct bkey_i *k, 972 enum btree_id btree, unsigned level) 973 { 974 struct btree_iter iter; 975 bch2_trans_node_iter_init(trans, &iter, btree, k->k.p, 0, level, 0); 976 struct btree *b = bch2_btree_iter_peek_node(&iter); 977 int ret = PTR_ERR_OR_ZERO(b); 978 if (ret) 979 goto err; 980 981 if (b) 982 bch2_node_pin(trans->c, b); 983 err: 984 bch2_trans_iter_exit(trans, &iter); 985 return ret; 986 } 987 988 static int bch2_pin_backpointer_nodes_with_missing(struct btree_trans *trans, 989 struct bpos start, struct bpos *end) 990 { 991 struct bch_fs *c = trans->c; 992 int ret = 0; 993 994 struct bkey_buf tmp; 995 bch2_bkey_buf_init(&tmp); 996 997 bch2_btree_cache_unpin(c); 998 999 *end = SPOS_MAX; 1000 1001 s64 mem_may_pin = mem_may_pin_bytes(c); 1002 struct btree_iter iter; 1003 bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start, 1004 0, 1, BTREE_ITER_prefetch); 1005 ret = for_each_btree_key_continue(trans, iter, 0, k, ({ 1006 if (!backpointer_node_has_missing(c, k)) 1007 continue; 1008 1009 mem_may_pin -= c->opts.btree_node_size; 1010 if (mem_may_pin <= 0) 1011 break; 1012 1013 bch2_bkey_buf_reassemble(&tmp, c, k); 1014 struct btree_path *path = btree_iter_path(trans, &iter); 1015 1016 BUG_ON(path->level != 1); 1017 1018 bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, path->level - 1); 1019 })); 1020 if (ret) 1021 return ret; 1022 1023 struct bpos pinned = SPOS_MAX; 1024 mem_may_pin = mem_may_pin_bytes(c); 1025 bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start, 1026 0, 1, BTREE_ITER_prefetch); 1027 ret = for_each_btree_key_continue(trans, iter, 0, k, ({ 1028 if (!backpointer_node_has_missing(c, k)) 1029 continue; 1030 1031 mem_may_pin -= c->opts.btree_node_size; 1032 if (mem_may_pin <= 0) { 1033 *end = pinned; 1034 break; 1035 } 1036 1037 bch2_bkey_buf_reassemble(&tmp, c, k); 1038 struct btree_path *path = btree_iter_path(trans, &iter); 1039 1040 BUG_ON(path->level != 1); 1041 1042 int ret2 = btree_node_get_and_pin(trans, tmp.k, path->btree_id, path->level - 1); 1043 1044 if (!ret2) 1045 pinned = tmp.k->k.p; 1046 1047 ret; 1048 })); 1049 if (ret) 1050 return ret; 1051 1052 return ret; 1053 } 1054 1055 int bch2_check_extents_to_backpointers(struct bch_fs *c) 1056 { 1057 int ret = 0; 1058 1059 /* 1060 * Can't allow devices to come/go/resize while we have bucket bitmaps 1061 * allocated 1062 */ 1063 lockdep_assert_held(&c->state_lock); 1064 1065 for_each_member_device(c, ca) { 1066 BUG_ON(ca->bucket_backpointer_mismatches); 1067 ca->bucket_backpointer_mismatches = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), 1068 sizeof(unsigned long), 1069 GFP_KERNEL); 1070 ca->bucket_backpointer_empty = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), 1071 sizeof(unsigned long), 1072 GFP_KERNEL); 1073 if (!ca->bucket_backpointer_mismatches || 1074 !ca->bucket_backpointer_empty) { 1075 bch2_dev_put(ca); 1076 ret = -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap; 1077 goto err_free_bitmaps; 1078 } 1079 } 1080 1081 struct btree_trans *trans = bch2_trans_get(c); 1082 struct extents_to_bp_state s = { .bp_start = POS_MIN }; 1083 1084 bch2_bkey_buf_init(&s.last_flushed); 1085 bkey_init(&s.last_flushed.k->k); 1086 1087 ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, 1088 POS_MIN, BTREE_ITER_prefetch, k, ({ 1089 check_bucket_backpointer_mismatch(trans, k, &s.last_flushed); 1090 })); 1091 if (ret) 1092 goto err; 1093 1094 u64 nr_buckets = 0, nr_mismatches = 0, nr_empty = 0; 1095 for_each_member_device(c, ca) { 1096 nr_buckets += ca->mi.nbuckets; 1097 nr_mismatches += bitmap_weight(ca->bucket_backpointer_mismatches, ca->mi.nbuckets); 1098 nr_empty += bitmap_weight(ca->bucket_backpointer_empty, ca->mi.nbuckets); 1099 } 1100 1101 if (!nr_mismatches && !nr_empty) 1102 goto err; 1103 1104 bch_info(c, "scanning for missing backpointers in %llu/%llu buckets", 1105 nr_mismatches + nr_empty, nr_buckets); 1106 1107 while (1) { 1108 ret = bch2_pin_backpointer_nodes_with_missing(trans, s.bp_start, &s.bp_end); 1109 if (ret) 1110 break; 1111 1112 if ( bpos_eq(s.bp_start, POS_MIN) && 1113 !bpos_eq(s.bp_end, SPOS_MAX)) 1114 bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", 1115 __func__, btree_nodes_fit_in_ram(c)); 1116 1117 if (!bpos_eq(s.bp_start, POS_MIN) || 1118 !bpos_eq(s.bp_end, SPOS_MAX)) { 1119 struct printbuf buf = PRINTBUF; 1120 1121 prt_str(&buf, "check_extents_to_backpointers(): "); 1122 bch2_bpos_to_text(&buf, s.bp_start); 1123 prt_str(&buf, "-"); 1124 bch2_bpos_to_text(&buf, s.bp_end); 1125 1126 bch_verbose(c, "%s", buf.buf); 1127 printbuf_exit(&buf); 1128 } 1129 1130 ret = bch2_check_extents_to_backpointers_pass(trans, &s); 1131 if (ret || bpos_eq(s.bp_end, SPOS_MAX)) 1132 break; 1133 1134 s.bp_start = bpos_successor(s.bp_end); 1135 } 1136 err: 1137 bch2_trans_put(trans); 1138 bch2_bkey_buf_exit(&s.last_flushed, c); 1139 bch2_btree_cache_unpin(c); 1140 err_free_bitmaps: 1141 for_each_member_device(c, ca) { 1142 kvfree(ca->bucket_backpointer_empty); 1143 ca->bucket_backpointer_empty = NULL; 1144 kvfree(ca->bucket_backpointer_mismatches); 1145 ca->bucket_backpointer_mismatches = NULL; 1146 } 1147 1148 bch_err_fn(c, ret); 1149 return ret; 1150 } 1151 1152 static int check_one_backpointer(struct btree_trans *trans, 1153 struct bbpos start, 1154 struct bbpos end, 1155 struct bkey_s_c bp_k, 1156 struct bkey_buf *last_flushed) 1157 { 1158 if (bp_k.k->type != KEY_TYPE_backpointer) 1159 return 0; 1160 1161 struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); 1162 struct bbpos pos = bp_to_bbpos(*bp.v); 1163 1164 if (bbpos_cmp(pos, start) < 0 || 1165 bbpos_cmp(pos, end) > 0) 1166 return 0; 1167 1168 struct btree_iter iter; 1169 struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, 0, last_flushed); 1170 int ret = bkey_err(k); 1171 if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) 1172 return 0; 1173 if (ret) 1174 return ret; 1175 1176 bch2_trans_iter_exit(trans, &iter); 1177 return ret; 1178 } 1179 1180 static int check_bucket_backpointers_to_extents(struct btree_trans *trans, 1181 struct bch_dev *ca, struct bpos bucket) 1182 { 1183 u32 restart_count = trans->restart_count; 1184 struct bkey_buf last_flushed; 1185 bch2_bkey_buf_init(&last_flushed); 1186 bkey_init(&last_flushed.k->k); 1187 1188 int ret = for_each_btree_key_max(trans, iter, BTREE_ID_backpointers, 1189 bucket_pos_to_bp_start(ca, bucket), 1190 bucket_pos_to_bp_end(ca, bucket), 1191 0, k, 1192 check_one_backpointer(trans, BBPOS_MIN, BBPOS_MAX, k, &last_flushed) 1193 ); 1194 1195 bch2_bkey_buf_exit(&last_flushed, trans->c); 1196 return ret ?: trans_was_restarted(trans, restart_count); 1197 } 1198 1199 static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, 1200 struct bbpos start, 1201 struct bbpos end) 1202 { 1203 struct bch_fs *c = trans->c; 1204 struct bkey_buf last_flushed; 1205 struct progress_indicator_state progress; 1206 1207 bch2_bkey_buf_init(&last_flushed); 1208 bkey_init(&last_flushed.k->k); 1209 progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers)); 1210 1211 int ret = for_each_btree_key(trans, iter, BTREE_ID_backpointers, 1212 POS_MIN, BTREE_ITER_prefetch, k, ({ 1213 progress_update_iter(trans, &progress, &iter, "backpointers_to_extents"); 1214 check_one_backpointer(trans, start, end, k, &last_flushed); 1215 })); 1216 1217 bch2_bkey_buf_exit(&last_flushed, c); 1218 return ret; 1219 } 1220 1221 int bch2_check_backpointers_to_extents(struct bch_fs *c) 1222 { 1223 struct btree_trans *trans = bch2_trans_get(c); 1224 struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end; 1225 int ret; 1226 1227 while (1) { 1228 ret = bch2_get_btree_in_memory_pos(trans, 1229 BIT_ULL(BTREE_ID_extents)| 1230 BIT_ULL(BTREE_ID_reflink), 1231 ~0, 1232 start, &end); 1233 if (ret) 1234 break; 1235 1236 if (!bbpos_cmp(start, BBPOS_MIN) && 1237 bbpos_cmp(end, BBPOS_MAX)) 1238 bch_verbose(c, "%s(): extents do not fit in ram, running in multiple passes with %zu nodes per pass", 1239 __func__, btree_nodes_fit_in_ram(c)); 1240 1241 if (bbpos_cmp(start, BBPOS_MIN) || 1242 bbpos_cmp(end, BBPOS_MAX)) { 1243 struct printbuf buf = PRINTBUF; 1244 1245 prt_str(&buf, "check_backpointers_to_extents(): "); 1246 bch2_bbpos_to_text(&buf, start); 1247 prt_str(&buf, "-"); 1248 bch2_bbpos_to_text(&buf, end); 1249 1250 bch_verbose(c, "%s", buf.buf); 1251 printbuf_exit(&buf); 1252 } 1253 1254 ret = bch2_check_backpointers_to_extents_pass(trans, start, end); 1255 if (ret || !bbpos_cmp(end, BBPOS_MAX)) 1256 break; 1257 1258 start = bbpos_successor(end); 1259 } 1260 bch2_trans_put(trans); 1261 1262 bch2_btree_cache_unpin(c); 1263 1264 bch_err_fn(c, ret); 1265 return ret; 1266 } 1267