// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "bbpos.h"
#include "alloc_background.h"
#include "backpointers.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "checksum.h"
#include "disk_accounting.h"
#include "error.h"
#include "progress.h"

#include <linux/mm.h>

int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k,
			      struct bkey_validate_context from)
{
	struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
	int ret = 0;

	bkey_fsck_err_on(bp.v->level > BTREE_MAX_DEPTH,
			 c, backpointer_level_bad,
			 "backpointer level bad: %u > %u",
			 bp.v->level, BTREE_MAX_DEPTH);

	bkey_fsck_err_on(bp.k->p.inode == BCH_SB_MEMBER_INVALID,
			 c, backpointer_dev_bad,
			 "backpointer for BCH_SB_MEMBER_INVALID");
fsck_err:
	return ret;
}

void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);

	rcu_read_lock();
	struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode);
	if (ca) {
		u32 bucket_offset;
		struct bpos bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset);
		rcu_read_unlock();
		prt_printf(out, "bucket=%llu:%llu:%u ", bucket.inode, bucket.offset, bucket_offset);
	} else {
		rcu_read_unlock();
		prt_printf(out, "sector=%llu:%llu ", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT);
	}

	bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level);
	prt_str(out, " data_type=");
	bch2_prt_data_type(out, bp.v->data_type);
	prt_printf(out, " suboffset=%u len=%u gen=%u pos=",
		   (u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT),
		   bp.v->bucket_len,
		   bp.v->bucket_gen);
	bch2_bpos_to_text(out, bp.v->pos);
}

void bch2_backpointer_swab(struct bkey_s k)
{
	struct bkey_s_backpointer bp = bkey_s_to_backpointer(k);

	bp.v->bucket_len = swab32(bp.v->bucket_len);
	bch2_bpos_swab(&bp.v->pos);
}

static bool extent_matches_bp(struct bch_fs *c,
			      enum btree_id btree_id, unsigned level,
			      struct bkey_s_c k,
			      struct bkey_s_c_backpointer bp)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		struct bkey_i_backpointer bp2;
		bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp2);

		if (bpos_eq(bp.k->p, bp2.k.p) &&
		    !memcmp(bp.v, &bp2.v, sizeof(bp2.v)))
			return true;
	}

	return false;
}

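/*
 * Report an unexpected state in the backpointers btree: an existing
 * backpointer found when inserting, or a missing/mismatched backpointer when
 * deleting. Once the check_extents_to_backpointers recovery pass has already
 * run (and thus won't repair this later), it is flagged as a filesystem
 * inconsistency.
 */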
static noinline int backpointer_mod_err(struct btree_trans *trans,
					struct bkey_s_c orig_k,
					struct bkey_i_backpointer *new_bp,
					struct bkey_s_c found_bp,
					bool insert)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (insert) {
		prt_printf(&buf, "existing backpointer found when inserting ");
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i));
		prt_newline(&buf);
		printbuf_indent_add(&buf, 2);

		prt_printf(&buf, "found ");
		bch2_bkey_val_to_text(&buf, c, found_bp);
		prt_newline(&buf);

		prt_printf(&buf, "for ");
		bch2_bkey_val_to_text(&buf, c, orig_k);
	} else if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) {
		prt_printf(&buf, "backpointer not found when deleting\n");
		printbuf_indent_add(&buf, 2);

		prt_printf(&buf, "searching for ");
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i));
		prt_newline(&buf);

		prt_printf(&buf, "got ");
		bch2_bkey_val_to_text(&buf, c, found_bp);
		prt_newline(&buf);

		prt_printf(&buf, "for ");
		bch2_bkey_val_to_text(&buf, c, orig_k);
	}

	if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers &&
	    __bch2_inconsistent_error(c, &buf))
		ret = -BCH_ERR_erofs_unfixed_errors;

	bch_err(c, "%s", buf.buf);
	printbuf_exit(&buf);
	return ret;
}

int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans,
					      struct bkey_s_c orig_k,
					      struct bkey_i_backpointer *bp,
					      bool insert)
{
	struct btree_iter bp_iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
					       bp->k.p,
					       BTREE_ITER_intent|
					       BTREE_ITER_slots|
					       BTREE_ITER_with_updates);
	int ret = bkey_err(k);
	if (ret)
		return ret;

	if (insert
	    ? k.k->type
	    : (k.k->type != KEY_TYPE_backpointer ||
	       memcmp(bkey_s_c_to_backpointer(k).v, &bp->v, sizeof(bp->v)))) {
		ret = backpointer_mod_err(trans, orig_k, bp, k, insert);
		if (ret)
			goto err;
	}

	if (!insert) {
		bp->k.type = KEY_TYPE_deleted;
		set_bkey_val_u64s(&bp->k, 0);
	}

	ret = bch2_trans_update(trans, &bp_iter, &bp->k_i, 0);
err:
	bch2_trans_iter_exit(trans, &bp_iter);
	return ret;
}

static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos)
{
	return (likely(!bch2_backpointers_no_use_write_buffer)
		? bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, pos)
		: bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0)) ?:
		bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}

static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans,
						struct bkey_s_c visiting_k,
						struct bkey_buf *last_flushed)
{
	return likely(!bch2_backpointers_no_use_write_buffer)
		? bch2_btree_write_buffer_maybe_flush(trans, visiting_k, last_flushed)
		: 0;
}

static int backpointer_target_not_found(struct btree_trans *trans,
					struct bkey_s_c_backpointer bp,
					struct bkey_s_c target_k,
					struct bkey_buf *last_flushed)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	/*
	 * If we're using the btree write buffer, the backpointer we were
	 * looking at may have already been deleted - failure to find what it
	 * pointed to is not an error:
	 */
	ret = last_flushed
		? bch2_backpointers_maybe_flush(trans, bp.s_c, last_flushed)
		: 0;
	if (ret)
		return ret;

	prt_printf(&buf, "backpointer doesn't match %s it points to:\n",
		   bp.v->level ? "btree node" : "extent");
	bch2_bkey_val_to_text(&buf, c, bp.s_c);

	prt_newline(&buf);
	bch2_bkey_val_to_text(&buf, c, target_k);

	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(target_k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	bkey_for_each_ptr_decode(target_k.k, ptrs, p, entry)
		if (p.ptr.dev == bp.k->p.inode) {
			prt_newline(&buf);
			struct bkey_i_backpointer bp2;
			bch2_extent_ptr_to_bp(c, bp.v->btree_id, bp.v->level, target_k, p, entry, &bp2);
			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp2.k_i));
		}

	if (fsck_err(trans, backpointer_to_missing_ptr,
		     "%s", buf.buf))
		ret = bch2_backpointer_del(trans, bp.k->p);
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

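/*
 * Given a backpointer, return the extent or btree node key it points to.
 * Returns bkey_s_c_null if no matching key was found - which, when the btree
 * write buffer is in use, may just mean the backpointer was stale and has
 * since been deleted.
 */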
"btree node" : "extent"); 214 bch2_bkey_val_to_text(&buf, c, bp.s_c); 215 216 prt_newline(&buf); 217 bch2_bkey_val_to_text(&buf, c, target_k); 218 219 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(target_k); 220 const union bch_extent_entry *entry; 221 struct extent_ptr_decoded p; 222 bkey_for_each_ptr_decode(target_k.k, ptrs, p, entry) 223 if (p.ptr.dev == bp.k->p.inode) { 224 prt_newline(&buf); 225 struct bkey_i_backpointer bp2; 226 bch2_extent_ptr_to_bp(c, bp.v->btree_id, bp.v->level, target_k, p, entry, &bp2); 227 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp2.k_i)); 228 } 229 230 if (fsck_err(trans, backpointer_to_missing_ptr, 231 "%s", buf.buf)) 232 ret = bch2_backpointer_del(trans, bp.k->p); 233 fsck_err: 234 printbuf_exit(&buf); 235 return ret; 236 } 237 238 struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, 239 struct bkey_s_c_backpointer bp, 240 struct btree_iter *iter, 241 unsigned iter_flags, 242 struct bkey_buf *last_flushed) 243 { 244 struct bch_fs *c = trans->c; 245 246 if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c))) 247 return bkey_s_c_null; 248 249 bch2_trans_node_iter_init(trans, iter, 250 bp.v->btree_id, 251 bp.v->pos, 252 0, 253 bp.v->level, 254 iter_flags); 255 struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter); 256 if (bkey_err(k)) { 257 bch2_trans_iter_exit(trans, iter); 258 return k; 259 } 260 261 /* 262 * peek_slot() doesn't normally return NULL - except when we ask for a 263 * key at a btree level that doesn't exist. 264 * 265 * We may want to revisit this and change peek_slot(): 266 */ 267 if (!k.k) { 268 bkey_init(&iter->k); 269 iter->k.p = bp.v->pos; 270 k.k = &iter->k; 271 } 272 273 if (k.k && 274 extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp)) 275 return k; 276 277 bch2_trans_iter_exit(trans, iter); 278 279 if (!bp.v->level) { 280 int ret = backpointer_target_not_found(trans, bp, k, last_flushed); 281 return ret ? bkey_s_c_err(ret) : bkey_s_c_null; 282 } else { 283 struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed); 284 if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node)) 285 return bkey_s_c_null; 286 if (IS_ERR_OR_NULL(b)) 287 return ((struct bkey_s_c) { .k = ERR_CAST(b) }); 288 289 return bkey_i_to_s_c(&b->key); 290 } 291 } 292 293 struct btree *bch2_backpointer_get_node(struct btree_trans *trans, 294 struct bkey_s_c_backpointer bp, 295 struct btree_iter *iter, 296 struct bkey_buf *last_flushed) 297 { 298 struct bch_fs *c = trans->c; 299 300 BUG_ON(!bp.v->level); 301 302 bch2_trans_node_iter_init(trans, iter, 303 bp.v->btree_id, 304 bp.v->pos, 305 0, 306 bp.v->level - 1, 307 0); 308 struct btree *b = bch2_btree_iter_peek_node(trans, iter); 309 if (IS_ERR_OR_NULL(b)) 310 goto err; 311 312 BUG_ON(b->c.level != bp.v->level - 1); 313 314 if (extent_matches_bp(c, bp.v->btree_id, bp.v->level, 315 bkey_i_to_s_c(&b->key), bp)) 316 return b; 317 318 if (btree_node_will_make_reachable(b)) { 319 b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); 320 } else { 321 int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), last_flushed); 322 b = ret ? 
struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
					struct bkey_s_c_backpointer bp,
					struct btree_iter *iter,
					struct bkey_buf *last_flushed)
{
	struct bch_fs *c = trans->c;

	BUG_ON(!bp.v->level);

	bch2_trans_node_iter_init(trans, iter,
				  bp.v->btree_id,
				  bp.v->pos,
				  0,
				  bp.v->level - 1,
				  0);
	struct btree *b = bch2_btree_iter_peek_node(trans, iter);
	if (IS_ERR_OR_NULL(b))
		goto err;

	BUG_ON(b->c.level != bp.v->level - 1);

	if (extent_matches_bp(c, bp.v->btree_id, bp.v->level,
			      bkey_i_to_s_c(&b->key), bp))
		return b;

	if (btree_node_will_make_reachable(b)) {
		b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
	} else {
		int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), last_flushed);
		b = ret ? ERR_PTR(ret) : NULL;
	}
err:
	bch2_trans_iter_exit(trans, iter);
	return b;
}

static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, struct bkey_s_c k,
						   struct bkey_buf *last_flushed)
{
	if (k.k->type != KEY_TYPE_backpointer)
		return 0;

	struct bch_fs *c = trans->c;
	struct btree_iter alloc_iter = {};
	struct bkey_s_c alloc_k;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	struct bpos bucket;
	if (!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) {
		ret = bch2_backpointers_maybe_flush(trans, k, last_flushed);
		if (ret)
			goto out;

		if (fsck_err(trans, backpointer_to_missing_device,
			     "backpointer for missing device:\n%s",
			     (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
			ret = bch2_backpointer_del(trans, k.k->p);
		goto out;
	}

	alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, bucket, 0);
	ret = bkey_err(alloc_k);
	if (ret)
		goto out;

	if (alloc_k.k->type != KEY_TYPE_alloc_v4) {
		ret = bch2_backpointers_maybe_flush(trans, k, last_flushed);
		if (ret)
			goto out;

		if (fsck_err(trans, backpointer_to_missing_alloc,
			     "backpointer for nonexistent alloc key: %llu:%llu:0\n%s",
			     alloc_iter.pos.inode, alloc_iter.pos.offset,
			     (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
			ret = bch2_backpointer_del(trans, k.k->p);
	}
out:
fsck_err:
	bch2_trans_iter_exit(trans, &alloc_iter);
	printbuf_exit(&buf);
	return ret;
}

/* verify that every backpointer has a corresponding alloc key */
int bch2_check_btree_backpointers(struct bch_fs *c)
{
	struct bkey_buf last_flushed;
	bch2_bkey_buf_init(&last_flushed);
	bkey_init(&last_flushed.k->k);

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
			BTREE_ID_backpointers, POS_MIN, 0, k,
			NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		bch2_check_backpointer_has_valid_bucket(trans, k, &last_flushed)));

	bch2_bkey_buf_exit(&last_flushed, c);
	bch_err_fn(c, ret);
	return ret;
}

struct extents_to_bp_state {
	struct bpos	bp_start;
	struct bpos	bp_end;
	struct bkey_buf	last_flushed;
};

static int drop_dev_and_update(struct btree_trans *trans, enum btree_id btree,
			       struct bkey_s_c extent, unsigned dev)
{
	struct bkey_i *n = bch2_bkey_make_mut_noupdate(trans, extent);
	int ret = PTR_ERR_OR_ZERO(n);
	if (ret)
		return ret;

	bch2_bkey_drop_device(bkey_i_to_s(n), dev);
	return bch2_btree_insert_trans(trans, btree, n, 0);
}

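/*
 * Two extents claim the same space: read @extent back and verify its
 * checksum. Returns 1 if the checksum was bad and the device was dropped from
 * the extent, 0 if the checksum matched or couldn't be verified, negative on
 * error.
 */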
static int check_extent_checksum(struct btree_trans *trans,
				 enum btree_id btree, struct bkey_s_c extent,
				 enum btree_id o_btree, struct bkey_s_c extent2, unsigned dev)
{
	struct bch_fs *c = trans->c;
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(extent);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	struct printbuf buf = PRINTBUF;
	void *data_buf = NULL;
	struct bio *bio = NULL;
	size_t bytes;
	int ret = 0;

	if (bkey_is_btree_ptr(extent.k))
		return false;

	bkey_for_each_ptr_decode(extent.k, ptrs, p, entry)
		if (p.ptr.dev == dev)
			goto found;
	BUG();
found:
	if (!p.crc.csum_type)
		return false;

	bytes = p.crc.compressed_size << 9;

	struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ);
	if (!ca)
		return false;

	data_buf = kvmalloc(bytes, GFP_KERNEL);
	if (!data_buf) {
		ret = -ENOMEM;
		goto err;
	}

	bio = bio_alloc(ca->disk_sb.bdev, buf_pages(data_buf, bytes), REQ_OP_READ, GFP_KERNEL);
	bio->bi_iter.bi_sector = p.ptr.offset;
	bch2_bio_map(bio, data_buf, bytes);
	ret = submit_bio_wait(bio);
	if (ret)
		goto err;

	prt_printf(&buf, "extents pointing to same space, but first extent checksum bad:\n");
	bch2_btree_id_to_text(&buf, btree);
	prt_str(&buf, " ");
	bch2_bkey_val_to_text(&buf, c, extent);
	prt_newline(&buf);
	bch2_btree_id_to_text(&buf, o_btree);
	prt_str(&buf, " ");
	bch2_bkey_val_to_text(&buf, c, extent2);

	struct nonce nonce = extent_nonce(extent.k->bversion, p.crc);
	struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes);
	if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum),
			trans, dup_backpointer_to_bad_csum_extent,
			"%s", buf.buf))
		ret = drop_dev_and_update(trans, btree, extent, dev) ?: 1;
fsck_err:
err:
	if (bio)
		bio_put(bio);
	kvfree(data_buf);
	percpu_ref_put(&ca->io_ref[READ]);
	printbuf_exit(&buf);
	return ret;
}

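/*
 * Verify that a backpointer matching @bp exists for @orig_k. If the slot is
 * instead occupied by a backpointer to a different extent, try to work out
 * which of the two extents is stale (via pointer generation numbers and
 * checksums); if the backpointer is simply missing, recreate it.
 */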
static int check_bp_exists(struct btree_trans *trans,
			   struct extents_to_bp_state *s,
			   struct bkey_i_backpointer *bp,
			   struct bkey_s_c orig_k)
{
	struct bch_fs *c = trans->c;
	struct btree_iter other_extent_iter = {};
	struct printbuf buf = PRINTBUF;

	if (bpos_lt(bp->k.p, s->bp_start) ||
	    bpos_gt(bp->k.p, s->bp_end))
		return 0;

	struct btree_iter bp_iter;
	struct bkey_s_c bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp->k.p, 0);
	int ret = bkey_err(bp_k);
	if (ret)
		goto err;

	if (bp_k.k->type != KEY_TYPE_backpointer ||
	    memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp->v, sizeof(bp->v))) {
		ret = bch2_btree_write_buffer_maybe_flush(trans, orig_k, &s->last_flushed);
		if (ret)
			goto err;

		goto check_existing_bp;
	}
out:
err:
fsck_err:
	bch2_trans_iter_exit(trans, &other_extent_iter);
	bch2_trans_iter_exit(trans, &bp_iter);
	printbuf_exit(&buf);
	return ret;
check_existing_bp:
	/* Do we have a backpointer for a different extent? */
	if (bp_k.k->type != KEY_TYPE_backpointer)
		goto missing;

	struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k);

	struct bkey_s_c other_extent =
		bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL);
	ret = bkey_err(other_extent);
	if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
		ret = 0;
	if (ret)
		goto err;

	if (!other_extent.k)
		goto missing;

	rcu_read_lock();
	struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp->k.p.inode);
	if (ca) {
		struct bkey_ptrs_c other_extent_ptrs = bch2_bkey_ptrs_c(other_extent);
		bkey_for_each_ptr(other_extent_ptrs, ptr)
			if (ptr->dev == bp->k.p.inode &&
			    dev_ptr_stale_rcu(ca, ptr)) {
				/*
				 * Drop the RCU read lock before doing
				 * transactional updates:
				 */
				rcu_read_unlock();
				ret = drop_dev_and_update(trans, other_bp.v->btree_id,
							  other_extent, bp->k.p.inode);
				if (ret)
					goto err;
				goto out;
			}
	}
	rcu_read_unlock();

	if (bch2_extents_match(orig_k, other_extent)) {
		printbuf_reset(&buf);
		prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n");
		bch2_bkey_val_to_text(&buf, c, orig_k);
		prt_newline(&buf);
		bch2_bkey_val_to_text(&buf, c, other_extent);
		bch_err(c, "%s", buf.buf);

		if (other_extent.k->size <= orig_k.k->size) {
			ret = drop_dev_and_update(trans, other_bp.v->btree_id,
						  other_extent, bp->k.p.inode);
			if (ret)
				goto err;
			goto out;
		} else {
			ret = drop_dev_and_update(trans, bp->v.btree_id, orig_k, bp->k.p.inode);
			if (ret)
				goto err;
			goto missing;
		}
	}

	ret = check_extent_checksum(trans,
				    other_bp.v->btree_id, other_extent,
				    bp->v.btree_id, orig_k,
				    bp->k.p.inode);
	if (ret < 0)
		goto err;
	if (ret) {
		ret = 0;
		goto missing;
	}

	ret = check_extent_checksum(trans, bp->v.btree_id, orig_k,
				    other_bp.v->btree_id, other_extent, bp->k.p.inode);
	if (ret < 0)
		goto err;
	if (ret) {
		ret = 0;
		goto out;
	}

	printbuf_reset(&buf);
	prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n", bp->k.p.inode);
	bch2_bkey_val_to_text(&buf, c, orig_k);
	prt_newline(&buf);
	bch2_bkey_val_to_text(&buf, c, other_extent);
	bch_err(c, "%s", buf.buf);
	ret = -BCH_ERR_fsck_repair_unimplemented;
	goto err;
missing:
	printbuf_reset(&buf);
	prt_str(&buf, "missing backpointer\nfor: ");
	bch2_bkey_val_to_text(&buf, c, orig_k);
	prt_printf(&buf, "\nwant: ");
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp->k_i));
	prt_printf(&buf, "\ngot: ");
	bch2_bkey_val_to_text(&buf, c, bp_k);

	if (fsck_err(trans, ptr_to_missing_backpointer, "%s", buf.buf))
		ret = bch2_bucket_backpointer_mod(trans, orig_k, bp, true);

	goto out;
}

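/*
 * Check/create backpointers for each pointer in an extent or btree node key -
 * but only for buckets flagged in the per-device mismatch or empty bitmaps,
 * and never for stale cached pointers. Buckets known to have no backpointers
 * at all can skip the existence check and create the backpointer directly.
 */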
static int check_extent_to_backpointers(struct btree_trans *trans,
					struct extents_to_bp_state *s,
					enum btree_id btree, unsigned level,
					struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		if (p.ptr.dev == BCH_SB_MEMBER_INVALID)
			continue;

		rcu_read_lock();
		struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev);
		bool check = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_mismatches);
		bool empty = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_empty);

		bool stale = p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr));
		rcu_read_unlock();

		if ((check || empty) && !stale) {
			struct bkey_i_backpointer bp;
			bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp);

			int ret = check
				? check_bp_exists(trans, s, &bp, k)
				: bch2_bucket_backpointer_mod(trans, k, &bp, true);
			if (ret)
				return ret;
		}
	}

	return 0;
}

static int check_btree_root_to_backpointers(struct btree_trans *trans,
					    struct extents_to_bp_state *s,
					    enum btree_id btree_id,
					    int *level)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct btree *b;
	struct bkey_s_c k;
	int ret;
retry:
	bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN,
				  0, bch2_btree_id_root(c, btree_id)->b->c.level, 0);
	b = bch2_btree_iter_peek_node(trans, &iter);
	ret = PTR_ERR_OR_ZERO(b);
	if (ret)
		goto err;

	if (b != btree_node_root(c, b)) {
		bch2_trans_iter_exit(trans, &iter);
		goto retry;
	}

	*level = b->c.level;

	k = bkey_i_to_s_c(&b->key);
	ret = check_extent_to_backpointers(trans, s, btree_id, b->c.level + 1, k);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp)
{
	return (struct bbpos) {
		.btree	= bp.btree_id,
		.pos	= bp.pos,
	};
}

static u64 mem_may_pin_bytes(struct bch_fs *c)
{
	struct sysinfo i;
	si_meminfo(&i);

	u64 mem_bytes = i.totalram * i.mem_unit;
	return div_u64(mem_bytes * c->opts.fsck_memory_usage_percent, 100);
}

static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
{
	return div_u64(mem_may_pin_bytes(c), c->opts.btree_node_size);
}

static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
					u64 btree_leaf_mask,
					u64 btree_interior_mask,
					struct bbpos start, struct bbpos *end)
{
	struct bch_fs *c = trans->c;
	s64 mem_may_pin = mem_may_pin_bytes(c);
	int ret = 0;

	bch2_btree_cache_unpin(c);

	btree_interior_mask |= btree_leaf_mask;

	c->btree_cache.pinned_nodes_mask[0] = btree_leaf_mask;
	c->btree_cache.pinned_nodes_mask[1] = btree_interior_mask;
	c->btree_cache.pinned_nodes_start = start;
	c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX;

	for (enum btree_id btree = start.btree;
	     btree < BTREE_ID_NR && !ret;
	     btree++) {
		unsigned depth = (BIT_ULL(btree) & btree_leaf_mask) ? 0 : 1;

		if (!(BIT_ULL(btree) & btree_leaf_mask) &&
		    !(BIT_ULL(btree) & btree_interior_mask))
			continue;

		ret = __for_each_btree_node(trans, iter, btree,
				      btree == start.btree ? start.pos : POS_MIN,
				      0, depth, BTREE_ITER_prefetch, b, ({
			mem_may_pin -= btree_buf_bytes(b);
			if (mem_may_pin <= 0) {
				c->btree_cache.pinned_nodes_end = *end =
					BBPOS(btree, b->key.k.p);
				break;
			}
			bch2_node_pin(c, b);
			0;
		}));
	}

	return ret;
}

static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
						   struct extents_to_bp_state *s)
{
	struct bch_fs *c = trans->c;
	struct progress_indicator_state progress;
	int ret = 0;

	bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink));

	for (enum btree_id btree_id = 0;
	     btree_id < btree_id_nr_alive(c);
	     btree_id++) {
		int level, depth = btree_type_has_ptrs(btree_id) ? 0 : 1;

		ret = commit_do(trans, NULL, NULL,
				BCH_TRANS_COMMIT_no_enospc,
				check_btree_root_to_backpointers(trans, s, btree_id, &level));
		if (ret)
			return ret;

		while (level >= depth) {
			struct btree_iter iter;
			bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, level,
						  BTREE_ITER_prefetch);

			ret = for_each_btree_key_continue(trans, iter, 0, k, ({
				bch2_progress_update_iter(trans, &progress, &iter, "extents_to_backpointers");
				check_extent_to_backpointers(trans, s, btree_id, level, k) ?:
				bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
			}));
			if (ret)
				return ret;

			--level;
		}
	}

	return 0;
}

enum alloc_sector_counter {
	ALLOC_dirty,
	ALLOC_cached,
	ALLOC_stripe,
	ALLOC_SECTORS_NR
};

static int data_type_to_alloc_counter(enum bch_data_type t)
{
	switch (t) {
	case BCH_DATA_btree:
	case BCH_DATA_user:
		return ALLOC_dirty;
	case BCH_DATA_cached:
		return ALLOC_cached;
	case BCH_DATA_stripe:
	case BCH_DATA_parity:
		return ALLOC_stripe;
	default:
		return -1;
	}
}

static int check_bucket_backpointers_to_extents(struct btree_trans *, struct bch_dev *, struct bpos);

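/*
 * Sum up the backpointers for one bucket and compare against the alloc key's
 * sector counts: buckets that disagree are flagged in the per-device mismatch
 * bitmap (or the empty bitmap, if no live backpointers were found at all), so
 * that the extents -> backpointers pass only has to look at flagged buckets.
 */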
static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct bkey_s_c alloc_k,
					     struct bkey_buf *last_flushed)
{
	struct bch_fs *c = trans->c;
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert);
	bool need_commit = false;

	if (a->data_type == BCH_DATA_sb ||
	    a->data_type == BCH_DATA_journal ||
	    a->data_type == BCH_DATA_parity)
		return 0;

	u32 sectors[ALLOC_SECTORS_NR];
	memset(sectors, 0, sizeof(sectors));

	struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(trans->c, alloc_k.k->p);
	if (!ca)
		return 0;

	struct btree_iter iter;
	struct bkey_s_c bp_k;
	int ret = 0;
	for_each_btree_key_max_norestart(trans, iter, BTREE_ID_backpointers,
				bucket_pos_to_bp_start(ca, alloc_k.k->p),
				bucket_pos_to_bp_end(ca, alloc_k.k->p), 0, bp_k, ret) {
		if (bp_k.k->type != KEY_TYPE_backpointer)
			continue;

		struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k);

		if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen &&
		    (bp.v->bucket_gen != a->gen ||
		     bp.v->pad)) {
			ret = bch2_backpointer_del(trans, bp_k.k->p);
			if (ret)
				break;

			need_commit = true;
			continue;
		}

		if (bp.v->bucket_gen != a->gen)
			continue;

		int alloc_counter = data_type_to_alloc_counter(bp.v->data_type);
		if (alloc_counter < 0)
			continue;

		sectors[alloc_counter] += bp.v->bucket_len;
	}
	bch2_trans_iter_exit(trans, &iter);
	if (ret)
		goto err;

	if (need_commit) {
		ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
		if (ret)
			goto err;
	}

	if (sectors[ALLOC_dirty] != a->dirty_sectors ||
	    sectors[ALLOC_cached] != a->cached_sectors ||
	    sectors[ALLOC_stripe] != a->stripe_sectors) {
		if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) {
			ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed);
			if (ret)
				goto err;
		}

		if (sectors[ALLOC_dirty] > a->dirty_sectors ||
		    sectors[ALLOC_cached] > a->cached_sectors ||
		    sectors[ALLOC_stripe] > a->stripe_sectors) {
			ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?:
				-BCH_ERR_transaction_restart_nested;
			goto err;
		}

		if (!sectors[ALLOC_dirty] &&
		    !sectors[ALLOC_stripe] &&
		    !sectors[ALLOC_cached])
			__set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_empty);
		else
			__set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_mismatches);
	}
err:
	bch2_dev_put(ca);
	return ret;
}

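/*
 * Does the backpointers-btree leaf node referenced by this key cover any
 * buckets flagged as having mismatches? btree_ptr_v2 keys carry min_key, so
 * the node's key range can be compared against the per-device bitmaps; plain
 * btree_ptr keys can't be checked and are conservatively assumed to.
 */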
static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k)
{
	switch (k.k->type) {
	case KEY_TYPE_btree_ptr_v2: {
		bool ret = false;

		rcu_read_lock();
		struct bpos pos = bkey_s_c_to_btree_ptr_v2(k).v->min_key;
		while (pos.inode <= k.k->p.inode) {
			if (pos.inode >= c->sb.nr_devices)
				break;

			struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode);
			if (!ca)
				goto next;

			struct bpos bucket = bp_pos_to_bucket(ca, pos);
			bucket.offset = find_next_bit(ca->bucket_backpointer_mismatches,
						      ca->mi.nbuckets, bucket.offset);
			if (bucket.offset == ca->mi.nbuckets)
				goto next;

			ret = bpos_le(bucket_pos_to_bp_end(ca, bucket), k.k->p);
			if (ret)
				break;
next:
			pos = SPOS(pos.inode + 1, 0, 0);
		}
		rcu_read_unlock();

		return ret;
	}
	case KEY_TYPE_btree_ptr:
		return true;
	default:
		return false;
	}
}

static int btree_node_get_and_pin(struct btree_trans *trans, struct bkey_i *k,
				  enum btree_id btree, unsigned level)
{
	struct btree_iter iter;
	bch2_trans_node_iter_init(trans, &iter, btree, k->k.p, 0, level, 0);
	struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
	int ret = PTR_ERR_OR_ZERO(b);
	if (ret)
		goto err;

	if (b)
		bch2_node_pin(trans->c, b);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static int bch2_pin_backpointer_nodes_with_missing(struct btree_trans *trans,
						   struct bpos start, struct bpos *end)
{
	struct bch_fs *c = trans->c;
	int ret = 0;

	struct bkey_buf tmp;
	bch2_bkey_buf_init(&tmp);

	bch2_btree_cache_unpin(c);

	*end = SPOS_MAX;

	s64 mem_may_pin = mem_may_pin_bytes(c);
	struct btree_iter iter;
	bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start,
				  0, 1, BTREE_ITER_prefetch);
	ret = for_each_btree_key_continue(trans, iter, 0, k, ({
		if (!backpointer_node_has_missing(c, k))
			continue;

		mem_may_pin -= c->opts.btree_node_size;
		if (mem_may_pin <= 0)
			break;

		bch2_bkey_buf_reassemble(&tmp, c, k);
		struct btree_path *path = btree_iter_path(trans, &iter);

		BUG_ON(path->level != 1);

		bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, path->level - 1);
	}));
	if (ret)
		goto err;

	struct bpos pinned = SPOS_MAX;
	mem_may_pin = mem_may_pin_bytes(c);
	bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start,
				  0, 1, BTREE_ITER_prefetch);
	ret = for_each_btree_key_continue(trans, iter, 0, k, ({
		if (!backpointer_node_has_missing(c, k))
			continue;

		mem_may_pin -= c->opts.btree_node_size;
		if (mem_may_pin <= 0) {
			*end = pinned;
			break;
		}

		bch2_bkey_buf_reassemble(&tmp, c, k);
		struct btree_path *path = btree_iter_path(trans, &iter);

		BUG_ON(path->level != 1);

		int ret2 = btree_node_get_and_pin(trans, tmp.k, path->btree_id, path->level - 1);

		if (!ret2)
			pinned = tmp.k->k.p;

		/* failure to pin isn't fatal - just don't advance @pinned: */
		0;
	}));
err:
	bch2_bkey_buf_exit(&tmp, c);
	return ret;
}

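/*
 * The main extents -> backpointers pass: build the per-device mismatch/empty
 * bitmaps, then check every extent and btree node pointer against the
 * backpointers btree - splitting the backpointers keyspace into multiple
 * ranges if the nodes to be pinned don't fit in the fsck memory budget.
 */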
int bch2_check_extents_to_backpointers(struct bch_fs *c)
{
	int ret = 0;

	/*
	 * Can't allow devices to come/go/resize while we have bucket bitmaps
	 * allocated
	 */
	down_read(&c->state_lock);

	for_each_member_device(c, ca) {
		BUG_ON(ca->bucket_backpointer_mismatches);
		ca->bucket_backpointer_mismatches = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets),
							     sizeof(unsigned long),
							     GFP_KERNEL);
		ca->bucket_backpointer_empty = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets),
							sizeof(unsigned long),
							GFP_KERNEL);
		if (!ca->bucket_backpointer_mismatches ||
		    !ca->bucket_backpointer_empty) {
			bch2_dev_put(ca);
			ret = -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap;
			goto err_free_bitmaps;
		}
	}

	struct btree_trans *trans = bch2_trans_get(c);
	struct extents_to_bp_state s = { .bp_start = POS_MIN };

	bch2_bkey_buf_init(&s.last_flushed);
	bkey_init(&s.last_flushed.k->k);

	ret = for_each_btree_key(trans, iter, BTREE_ID_alloc,
				 POS_MIN, BTREE_ITER_prefetch, k, ({
		check_bucket_backpointer_mismatch(trans, k, &s.last_flushed);
	}));
	if (ret)
		goto err;

	u64 nr_buckets = 0, nr_mismatches = 0, nr_empty = 0;
	for_each_member_device(c, ca) {
		nr_buckets	+= ca->mi.nbuckets;
		nr_mismatches	+= bitmap_weight(ca->bucket_backpointer_mismatches, ca->mi.nbuckets);
		nr_empty	+= bitmap_weight(ca->bucket_backpointer_empty, ca->mi.nbuckets);
	}

	if (!nr_mismatches && !nr_empty)
		goto err;

	bch_info(c, "scanning for missing backpointers in %llu/%llu buckets",
		 nr_mismatches + nr_empty, nr_buckets);

	while (1) {
		ret = bch2_pin_backpointer_nodes_with_missing(trans, s.bp_start, &s.bp_end);
		if (ret)
			break;

		if ( bpos_eq(s.bp_start, POS_MIN) &&
		    !bpos_eq(s.bp_end, SPOS_MAX))
			bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass",
				    __func__, btree_nodes_fit_in_ram(c));

		if (!bpos_eq(s.bp_start, POS_MIN) ||
		    !bpos_eq(s.bp_end, SPOS_MAX)) {
			struct printbuf buf = PRINTBUF;

			prt_str(&buf, "check_extents_to_backpointers(): ");
			bch2_bpos_to_text(&buf, s.bp_start);
			prt_str(&buf, "-");
			bch2_bpos_to_text(&buf, s.bp_end);

			bch_verbose(c, "%s", buf.buf);
			printbuf_exit(&buf);
		}

		ret = bch2_check_extents_to_backpointers_pass(trans, &s);
		if (ret || bpos_eq(s.bp_end, SPOS_MAX))
			break;

		s.bp_start = bpos_successor(s.bp_end);
	}
err:
	bch2_trans_put(trans);
	bch2_bkey_buf_exit(&s.last_flushed, c);
	bch2_btree_cache_unpin(c);
err_free_bitmaps:
	for_each_member_device(c, ca) {
		kvfree(ca->bucket_backpointer_empty);
		ca->bucket_backpointer_empty = NULL;
		kvfree(ca->bucket_backpointer_mismatches);
		ca->bucket_backpointer_mismatches = NULL;
	}

	up_read(&c->state_lock);
	bch_err_fn(c, ret);
	return ret;
}

static int check_one_backpointer(struct btree_trans *trans,
				 struct bbpos start,
				 struct bbpos end,
				 struct bkey_s_c bp_k,
				 struct bkey_buf *last_flushed)
{
	if (bp_k.k->type != KEY_TYPE_backpointer)
		return 0;

	struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k);
	struct bbpos pos = bp_to_bbpos(*bp.v);

	if (bbpos_cmp(pos, start) < 0 ||
	    bbpos_cmp(pos, end) > 0)
		return 0;

	struct btree_iter iter;
	struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, 0, last_flushed);
	int ret = bkey_err(k);
	if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
		return 0;
	if (ret)
		return ret;

	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

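/*
 * Check every backpointer for a single bucket: used when
 * check_bucket_backpointer_mismatch() finds more backpointer sectors than the
 * alloc key has, meaning stale backpointers need to be deleted.
 */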
static int check_bucket_backpointers_to_extents(struct btree_trans *trans,
						struct bch_dev *ca, struct bpos bucket)
{
	u32 restart_count = trans->restart_count;
	struct bkey_buf last_flushed;
	bch2_bkey_buf_init(&last_flushed);
	bkey_init(&last_flushed.k->k);

	int ret = for_each_btree_key_max(trans, iter, BTREE_ID_backpointers,
				      bucket_pos_to_bp_start(ca, bucket),
				      bucket_pos_to_bp_end(ca, bucket),
				      0, k,
		check_one_backpointer(trans, BBPOS_MIN, BBPOS_MAX, k, &last_flushed)
	);

	bch2_bkey_buf_exit(&last_flushed, trans->c);
	return ret ?: trans_was_restarted(trans, restart_count);
}

static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
						   struct bbpos start,
						   struct bbpos end)
{
	struct bch_fs *c = trans->c;
	struct bkey_buf last_flushed;
	struct progress_indicator_state progress;

	bch2_bkey_buf_init(&last_flushed);
	bkey_init(&last_flushed.k->k);
	bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers));

	int ret = for_each_btree_key(trans, iter, BTREE_ID_backpointers,
				     POS_MIN, BTREE_ITER_prefetch, k, ({
		bch2_progress_update_iter(trans, &progress, &iter, "backpointers_to_extents");
		check_one_backpointer(trans, start, end, k, &last_flushed);
	}));

	bch2_bkey_buf_exit(&last_flushed, c);
	return ret;
}

int bch2_check_backpointers_to_extents(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end;
	int ret;

	while (1) {
		ret = bch2_get_btree_in_memory_pos(trans,
						   BIT_ULL(BTREE_ID_extents)|
						   BIT_ULL(BTREE_ID_reflink),
						   ~0,
						   start, &end);
		if (ret)
			break;

		if (!bbpos_cmp(start, BBPOS_MIN) &&
		     bbpos_cmp(end, BBPOS_MAX))
			bch_verbose(c, "%s(): extents do not fit in ram, running in multiple passes with %zu nodes per pass",
				    __func__, btree_nodes_fit_in_ram(c));

		if (bbpos_cmp(start, BBPOS_MIN) ||
		    bbpos_cmp(end, BBPOS_MAX)) {
			struct printbuf buf = PRINTBUF;

			prt_str(&buf, "check_backpointers_to_extents(): ");
			bch2_bbpos_to_text(&buf, start);
			prt_str(&buf, "-");
			bch2_bbpos_to_text(&buf, end);

			bch_verbose(c, "%s", buf.buf);
			printbuf_exit(&buf);
		}

		ret = bch2_check_backpointers_to_extents_pass(trans, start, end);
		if (ret || !bbpos_cmp(end, BBPOS_MAX))
			break;

		start = bbpos_successor(end);
	}
	bch2_trans_put(trans);

	bch2_btree_cache_unpin(c);

	bch_err_fn(c, ret);
	return ret;
}