1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "bcachefs.h" 4 #include "bkey_buf.h" 5 #include "bkey_methods.h" 6 #include "bkey_sort.h" 7 #include "btree_cache.h" 8 #include "btree_io.h" 9 #include "btree_iter.h" 10 #include "btree_locking.h" 11 #include "btree_update.h" 12 #include "btree_update_interior.h" 13 #include "buckets.h" 14 #include "checksum.h" 15 #include "debug.h" 16 #include "error.h" 17 #include "extents.h" 18 #include "io_write.h" 19 #include "journal_reclaim.h" 20 #include "journal_seq_blacklist.h" 21 #include "recovery.h" 22 #include "super-io.h" 23 #include "trace.h" 24 25 #include <linux/sched/mm.h> 26 27 static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn) 28 { 29 bch2_btree_id_level_to_text(out, BTREE_NODE_ID(bn), BTREE_NODE_LEVEL(bn)); 30 prt_printf(out, " seq %llx %llu\n", bn->keys.seq, BTREE_NODE_SEQ(bn)); 31 prt_str(out, "min: "); 32 bch2_bpos_to_text(out, bn->min_key); 33 prt_newline(out); 34 prt_str(out, "max: "); 35 bch2_bpos_to_text(out, bn->max_key); 36 } 37 38 void bch2_btree_node_io_unlock(struct btree *b) 39 { 40 EBUG_ON(!btree_node_write_in_flight(b)); 41 42 clear_btree_node_write_in_flight_inner(b); 43 clear_btree_node_write_in_flight(b); 44 wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); 45 } 46 47 void bch2_btree_node_io_lock(struct btree *b) 48 { 49 wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight, 50 TASK_UNINTERRUPTIBLE); 51 } 52 53 void __bch2_btree_node_wait_on_read(struct btree *b) 54 { 55 wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, 56 TASK_UNINTERRUPTIBLE); 57 } 58 59 void __bch2_btree_node_wait_on_write(struct btree *b) 60 { 61 wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, 62 TASK_UNINTERRUPTIBLE); 63 } 64 65 void bch2_btree_node_wait_on_read(struct btree *b) 66 { 67 wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, 68 TASK_UNINTERRUPTIBLE); 69 } 70 71 void bch2_btree_node_wait_on_write(struct btree *b) 72 { 73 wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, 74 TASK_UNINTERRUPTIBLE); 75 } 76 77 static void verify_no_dups(struct btree *b, 78 struct bkey_packed *start, 79 struct bkey_packed *end) 80 { 81 #ifdef CONFIG_BCACHEFS_DEBUG 82 struct bkey_packed *k, *p; 83 84 if (start == end) 85 return; 86 87 for (p = start, k = bkey_p_next(start); 88 k != end; 89 p = k, k = bkey_p_next(k)) { 90 struct bkey l = bkey_unpack_key(b, p); 91 struct bkey r = bkey_unpack_key(b, k); 92 93 BUG_ON(bpos_ge(l.p, bkey_start_pos(&r))); 94 } 95 #endif 96 } 97 98 static void set_needs_whiteout(struct bset *i, int v) 99 { 100 struct bkey_packed *k; 101 102 for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) 103 k->needs_whiteout = v; 104 } 105 106 static void btree_bounce_free(struct bch_fs *c, size_t size, 107 bool used_mempool, void *p) 108 { 109 if (used_mempool) 110 mempool_free(p, &c->btree_bounce_pool); 111 else 112 kvfree(p); 113 } 114 115 static void *btree_bounce_alloc(struct bch_fs *c, size_t size, 116 bool *used_mempool) 117 { 118 unsigned flags = memalloc_nofs_save(); 119 void *p; 120 121 BUG_ON(size > c->opts.btree_node_size); 122 123 *used_mempool = false; 124 p = kvmalloc(size, __GFP_NOWARN|GFP_NOWAIT); 125 if (!p) { 126 *used_mempool = true; 127 p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); 128 } 129 memalloc_nofs_restore(flags); 130 return p; 131 } 132 133 static void sort_bkey_ptrs(const struct btree *bt, 134 struct bkey_packed **ptrs, unsigned nr) 135 { 136 unsigned n = nr, a = nr / 2, b, c, d; 137 138 if (!a) 139 return; 140 141 /* Heap sort: see 
lib/sort.c: */ 142 while (1) { 143 if (a) 144 a--; 145 else if (--n) 146 swap(ptrs[0], ptrs[n]); 147 else 148 break; 149 150 for (b = a; c = 2 * b + 1, (d = c + 1) < n;) 151 b = bch2_bkey_cmp_packed(bt, 152 ptrs[c], 153 ptrs[d]) >= 0 ? c : d; 154 if (d == n) 155 b = c; 156 157 while (b != a && 158 bch2_bkey_cmp_packed(bt, 159 ptrs[a], 160 ptrs[b]) >= 0) 161 b = (b - 1) / 2; 162 c = b; 163 while (b != a) { 164 b = (b - 1) / 2; 165 swap(ptrs[b], ptrs[c]); 166 } 167 } 168 } 169 170 static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) 171 { 172 struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k; 173 bool used_mempool = false; 174 size_t bytes = b->whiteout_u64s * sizeof(u64); 175 176 if (!b->whiteout_u64s) 177 return; 178 179 new_whiteouts = btree_bounce_alloc(c, bytes, &used_mempool); 180 181 ptrs = ptrs_end = ((void *) new_whiteouts + bytes); 182 183 for (k = unwritten_whiteouts_start(b); 184 k != unwritten_whiteouts_end(b); 185 k = bkey_p_next(k)) 186 *--ptrs = k; 187 188 sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs); 189 190 k = new_whiteouts; 191 192 while (ptrs != ptrs_end) { 193 bkey_p_copy(k, *ptrs); 194 k = bkey_p_next(k); 195 ptrs++; 196 } 197 198 verify_no_dups(b, new_whiteouts, 199 (void *) ((u64 *) new_whiteouts + b->whiteout_u64s)); 200 201 memcpy_u64s(unwritten_whiteouts_start(b), 202 new_whiteouts, b->whiteout_u64s); 203 204 btree_bounce_free(c, bytes, used_mempool, new_whiteouts); 205 } 206 207 static bool should_compact_bset(struct btree *b, struct bset_tree *t, 208 bool compacting, enum compact_mode mode) 209 { 210 if (!bset_dead_u64s(b, t)) 211 return false; 212 213 switch (mode) { 214 case COMPACT_LAZY: 215 return should_compact_bset_lazy(b, t) || 216 (compacting && !bset_written(b, bset(b, t))); 217 case COMPACT_ALL: 218 return true; 219 default: 220 BUG(); 221 } 222 } 223 224 static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) 225 { 226 bool ret = false; 227 228 for_each_bset(b, t) { 229 struct bset *i = bset(b, t); 230 struct bkey_packed *k, *n, *out, *start, *end; 231 struct btree_node_entry *src = NULL, *dst = NULL; 232 233 if (t != b->set && !bset_written(b, i)) { 234 src = container_of(i, struct btree_node_entry, keys); 235 dst = max(write_block(b), 236 (void *) btree_bkey_last(b, t - 1)); 237 } 238 239 if (src != dst) 240 ret = true; 241 242 if (!should_compact_bset(b, t, ret, mode)) { 243 if (src != dst) { 244 memmove(dst, src, sizeof(*src) + 245 le16_to_cpu(src->keys.u64s) * 246 sizeof(u64)); 247 i = &dst->keys; 248 set_btree_bset(b, t, i); 249 } 250 continue; 251 } 252 253 start = btree_bkey_first(b, t); 254 end = btree_bkey_last(b, t); 255 256 if (src != dst) { 257 memmove(dst, src, sizeof(*src)); 258 i = &dst->keys; 259 set_btree_bset(b, t, i); 260 } 261 262 out = i->start; 263 264 for (k = start; k != end; k = n) { 265 n = bkey_p_next(k); 266 267 if (!bkey_deleted(k)) { 268 bkey_p_copy(out, k); 269 out = bkey_p_next(out); 270 } else { 271 BUG_ON(k->needs_whiteout); 272 } 273 } 274 275 i->u64s = cpu_to_le16((u64 *) out - i->_data); 276 set_btree_bset_end(b, t); 277 bch2_bset_set_no_aux_tree(b, t); 278 ret = true; 279 } 280 281 bch2_verify_btree_nr_keys(b); 282 283 bch2_btree_build_aux_trees(b); 284 285 return ret; 286 } 287 288 bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, 289 enum compact_mode mode) 290 { 291 return bch2_drop_whiteouts(b, mode); 292 } 293 294 static void btree_node_sort(struct bch_fs *c, struct btree *b, 295 unsigned start_idx, 296 unsigned end_idx) 297 { 298 struct btree_node *out; 299 
struct sort_iter_stack sort_iter; 300 struct bset_tree *t; 301 struct bset *start_bset = bset(b, &b->set[start_idx]); 302 bool used_mempool = false; 303 u64 start_time, seq = 0; 304 unsigned i, u64s = 0, bytes, shift = end_idx - start_idx - 1; 305 bool sorting_entire_node = start_idx == 0 && 306 end_idx == b->nsets; 307 308 sort_iter_stack_init(&sort_iter, b); 309 310 for (t = b->set + start_idx; 311 t < b->set + end_idx; 312 t++) { 313 u64s += le16_to_cpu(bset(b, t)->u64s); 314 sort_iter_add(&sort_iter.iter, 315 btree_bkey_first(b, t), 316 btree_bkey_last(b, t)); 317 } 318 319 bytes = sorting_entire_node 320 ? btree_buf_bytes(b) 321 : __vstruct_bytes(struct btree_node, u64s); 322 323 out = btree_bounce_alloc(c, bytes, &used_mempool); 324 325 start_time = local_clock(); 326 327 u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter); 328 329 out->keys.u64s = cpu_to_le16(u64s); 330 331 BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes); 332 333 if (sorting_entire_node) 334 bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], 335 start_time); 336 337 /* Make sure we preserve bset journal_seq: */ 338 for (t = b->set + start_idx; t < b->set + end_idx; t++) 339 seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq)); 340 start_bset->journal_seq = cpu_to_le64(seq); 341 342 if (sorting_entire_node) { 343 u64s = le16_to_cpu(out->keys.u64s); 344 345 BUG_ON(bytes != btree_buf_bytes(b)); 346 347 /* 348 * Our temporary buffer is the same size as the btree node's 349 * buffer, we can just swap buffers instead of doing a big 350 * memcpy() 351 */ 352 *out = *b->data; 353 out->keys.u64s = cpu_to_le16(u64s); 354 swap(out, b->data); 355 set_btree_bset(b, b->set, &b->data->keys); 356 } else { 357 start_bset->u64s = out->keys.u64s; 358 memcpy_u64s(start_bset->start, 359 out->keys.start, 360 le16_to_cpu(out->keys.u64s)); 361 } 362 363 for (i = start_idx + 1; i < end_idx; i++) 364 b->nr.bset_u64s[start_idx] += 365 b->nr.bset_u64s[i]; 366 367 b->nsets -= shift; 368 369 for (i = start_idx + 1; i < b->nsets; i++) { 370 b->nr.bset_u64s[i] = b->nr.bset_u64s[i + shift]; 371 b->set[i] = b->set[i + shift]; 372 } 373 374 for (i = b->nsets; i < MAX_BSETS; i++) 375 b->nr.bset_u64s[i] = 0; 376 377 set_btree_bset_end(b, &b->set[start_idx]); 378 bch2_bset_set_no_aux_tree(b, &b->set[start_idx]); 379 380 btree_bounce_free(c, bytes, used_mempool, out); 381 382 bch2_verify_btree_nr_keys(b); 383 } 384 385 void bch2_btree_sort_into(struct bch_fs *c, 386 struct btree *dst, 387 struct btree *src) 388 { 389 struct btree_nr_keys nr; 390 struct btree_node_iter src_iter; 391 u64 start_time = local_clock(); 392 393 BUG_ON(dst->nsets != 1); 394 395 bch2_bset_set_no_aux_tree(dst, dst->set); 396 397 bch2_btree_node_iter_init_from_start(&src_iter, src); 398 399 nr = bch2_sort_repack(btree_bset_first(dst), 400 src, &src_iter, 401 &dst->format, 402 true); 403 404 bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], 405 start_time); 406 407 set_btree_bset_end(dst, dst->set); 408 409 dst->nr.live_u64s += nr.live_u64s; 410 dst->nr.bset_u64s[0] += nr.bset_u64s[0]; 411 dst->nr.packed_keys += nr.packed_keys; 412 dst->nr.unpacked_keys += nr.unpacked_keys; 413 414 bch2_verify_btree_nr_keys(dst); 415 } 416 417 /* 418 * We're about to add another bset to the btree node, so if there's currently 419 * too many bsets - sort some of them together: 420 */ 421 static bool btree_node_compact(struct bch_fs *c, struct btree *b) 422 { 423 unsigned unwritten_idx; 424 bool ret = false; 425 426 for (unwritten_idx = 0; 427 unwritten_idx < b->nsets; 
			unwritten_idx++)
		if (!bset_written(b, bset(b, &b->set[unwritten_idx])))
			break;

	if (b->nsets - unwritten_idx > 1) {
		btree_node_sort(c, b, unwritten_idx, b->nsets);
		ret = true;
	}

	if (unwritten_idx > 1) {
		btree_node_sort(c, b, 0, unwritten_idx);
		ret = true;
	}

	return ret;
}

void bch2_btree_build_aux_trees(struct btree *b)
{
	for_each_bset(b, t)
		bch2_bset_build_aux_tree(b, t,
				!bset_written(b, bset(b, t)) &&
				t == bset_tree_last(b));
}

/*
 * If we have MAX_BSETS (3) bsets, should we sort them all down to just one?
 *
 * The first bset is going to be of similar order to the size of the node, the
 * last bset is bounded by btree_write_set_buffer(), which is set to keep the
 * memmove on insert from being too expensive: the middle bset should, ideally,
 * be the geometric mean of the first and the last.
 *
 * Returns true if the middle bset is greater than that geometric mean:
 */
static inline bool should_compact_all(struct bch_fs *c, struct btree *b)
{
	unsigned mid_u64s_bits =
		(ilog2(btree_max_u64s(c)) + BTREE_WRITE_SET_U64s_BITS) / 2;

	return bset_u64s(&b->set[1]) > 1U << mid_u64s_bits;
}

/*
 * bch2_btree_init_next - initialize a new (unwritten) bset that can then be
 * inserted into
 *
 * Safe to call if there already is an unwritten bset - will only add a new
 * bset if @b doesn't already have one.
 *
 * If we sorted (i.e. invalidated iterators), outstanding iterators are
 * reinitialized via bch2_trans_node_reinit_iter().
 */
void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
{
	struct bch_fs *c = trans->c;
	struct btree_node_entry *bne;
	bool reinit_iter = false;

	EBUG_ON(!six_lock_counts(&b->c.lock).n[SIX_LOCK_write]);
	BUG_ON(bset_written(b, bset(b, &b->set[1])));
	BUG_ON(btree_node_just_written(b));

	if (b->nsets == MAX_BSETS &&
	    !btree_node_write_in_flight(b) &&
	    should_compact_all(c, b)) {
		bch2_btree_node_write_trans(trans, b, SIX_LOCK_write,
					    BTREE_WRITE_init_next_bset);
		reinit_iter = true;
	}

	if (b->nsets == MAX_BSETS &&
	    btree_node_compact(c, b))
		reinit_iter = true;

	BUG_ON(b->nsets >= MAX_BSETS);

	bne = want_new_bset(c, b);
	if (bne)
		bch2_bset_init_next(b, bne);

	bch2_btree_build_aux_trees(b);

	if (reinit_iter)
		bch2_trans_node_reinit_iter(trans, b);
}

static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
			  struct bch_dev *ca,
			  struct btree *b, struct bset *i, struct bkey_packed *k,
			  unsigned offset, int write)
{
	prt_printf(out, bch2_log_msg(c, "%s"),
		   write == READ
		   ? "error validating btree node "
		   : "corrupt btree node before write ");
	if (ca)
		prt_printf(out, "on %s ", ca->name);
	prt_printf(out, "at btree ");
	bch2_btree_pos_to_text(out, c, b);

	prt_printf(out, "\nnode offset %u/%u",
		   b->written, btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)));
	if (i)
		prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
	if (k)
		prt_printf(out, " bset byte offset %lu",
			   (unsigned long)(void *)k -
			   ((unsigned long)(void *)i & ~511UL));
	prt_str(out, ": ");
}

/*
 * Report an error found while validating a btree node, either when reading it
 * or just before writing it out: -BCH_ERR_fsck_fix means the error was fixed
 * (or can be ignored); any other return is propagated by the btree_err()
 * macro via the caller's fsck_err label.
 */
__printf(10, 11)
static int __btree_err(int ret,
		       struct bch_fs *c,
		       struct bch_dev *ca,
		       struct btree *b,
		       struct bset *i,
		       struct bkey_packed *k,
		       int write,
		       bool have_retry,
		       enum bch_sb_error_id err_type,
		       const char *fmt, ...)
550 { 551 bool silent = c->curr_recovery_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes; 552 553 if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry) 554 ret = -BCH_ERR_btree_node_read_err_fixable; 555 if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry) 556 ret = -BCH_ERR_btree_node_read_err_bad_node; 557 558 if (!silent && ret != -BCH_ERR_btree_node_read_err_fixable) 559 bch2_sb_error_count(c, err_type); 560 561 struct printbuf out = PRINTBUF; 562 if (write != WRITE && ret != -BCH_ERR_btree_node_read_err_fixable) { 563 printbuf_indent_add_nextline(&out, 2); 564 #ifdef BCACHEFS_LOG_PREFIX 565 prt_printf(&out, bch2_log_msg(c, "")); 566 #endif 567 } 568 569 btree_err_msg(&out, c, ca, b, i, k, b->written, write); 570 571 va_list args; 572 va_start(args, fmt); 573 prt_vprintf(&out, fmt, args); 574 va_end(args); 575 576 if (write == WRITE) { 577 prt_str(&out, ", "); 578 ret = __bch2_inconsistent_error(c, &out) 579 ? -BCH_ERR_fsck_errors_not_fixed 580 : 0; 581 silent = false; 582 } 583 584 switch (ret) { 585 case -BCH_ERR_btree_node_read_err_fixable: 586 ret = !silent 587 ? __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf) 588 : -BCH_ERR_fsck_fix; 589 if (ret != -BCH_ERR_fsck_fix && 590 ret != -BCH_ERR_fsck_ignore) 591 goto fsck_err; 592 ret = -BCH_ERR_fsck_fix; 593 goto out; 594 case -BCH_ERR_btree_node_read_err_bad_node: 595 prt_str(&out, ", "); 596 ret = __bch2_topology_error(c, &out); 597 if (ret) 598 silent = false; 599 break; 600 case -BCH_ERR_btree_node_read_err_incompatible: 601 ret = -BCH_ERR_fsck_errors_not_fixed; 602 silent = false; 603 break; 604 } 605 606 if (!silent) 607 bch2_print_string_as_lines(KERN_ERR, out.buf); 608 out: 609 fsck_err: 610 printbuf_exit(&out); 611 return ret; 612 } 613 614 #define btree_err(type, c, ca, b, i, k, _err_type, msg, ...) \ 615 ({ \ 616 int _ret = __btree_err(type, c, ca, b, i, k, write, have_retry, \ 617 BCH_FSCK_ERR_##_err_type, \ 618 msg, ##__VA_ARGS__); \ 619 \ 620 if (_ret != -BCH_ERR_fsck_fix) { \ 621 ret = _ret; \ 622 goto fsck_err; \ 623 } \ 624 \ 625 *saw_error = true; \ 626 }) 627 628 #define btree_err_on(cond, ...) ((cond) ? 
btree_err(__VA_ARGS__) : false) 629 630 /* 631 * When btree topology repair changes the start or end of a node, that might 632 * mean we have to drop keys that are no longer inside the node: 633 */ 634 __cold 635 void bch2_btree_node_drop_keys_outside_node(struct btree *b) 636 { 637 for_each_bset(b, t) { 638 struct bset *i = bset(b, t); 639 struct bkey_packed *k; 640 641 for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) 642 if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0) 643 break; 644 645 if (k != i->start) { 646 unsigned shift = (u64 *) k - (u64 *) i->start; 647 648 memmove_u64s_down(i->start, k, 649 (u64 *) vstruct_end(i) - (u64 *) k); 650 i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift); 651 set_btree_bset_end(b, t); 652 } 653 654 for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) 655 if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0) 656 break; 657 658 if (k != vstruct_last(i)) { 659 i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start); 660 set_btree_bset_end(b, t); 661 } 662 } 663 664 /* 665 * Always rebuild search trees: eytzinger search tree nodes directly 666 * depend on the values of min/max key: 667 */ 668 bch2_bset_set_no_aux_tree(b, b->set); 669 bch2_btree_build_aux_trees(b); 670 b->nr = bch2_btree_node_count_keys(b); 671 672 struct bkey_s_c k; 673 struct bkey unpacked; 674 struct btree_node_iter iter; 675 for_each_btree_node_key_unpack(b, k, &iter, &unpacked) { 676 BUG_ON(bpos_lt(k.k->p, b->data->min_key)); 677 BUG_ON(bpos_gt(k.k->p, b->data->max_key)); 678 } 679 } 680 681 static int validate_bset(struct bch_fs *c, struct bch_dev *ca, 682 struct btree *b, struct bset *i, 683 unsigned offset, unsigned sectors, 684 int write, bool have_retry, bool *saw_error) 685 { 686 unsigned version = le16_to_cpu(i->version); 687 unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); 688 struct printbuf buf1 = PRINTBUF; 689 struct printbuf buf2 = PRINTBUF; 690 int ret = 0; 691 692 btree_err_on(!bch2_version_compatible(version), 693 -BCH_ERR_btree_node_read_err_incompatible, 694 c, ca, b, i, NULL, 695 btree_node_unsupported_version, 696 "unsupported bset version %u.%u", 697 BCH_VERSION_MAJOR(version), 698 BCH_VERSION_MINOR(version)); 699 700 if (btree_err_on(version < c->sb.version_min, 701 -BCH_ERR_btree_node_read_err_fixable, 702 c, NULL, b, i, NULL, 703 btree_node_bset_older_than_sb_min, 704 "bset version %u older than superblock version_min %u", 705 version, c->sb.version_min)) { 706 mutex_lock(&c->sb_lock); 707 c->disk_sb.sb->version_min = cpu_to_le16(version); 708 bch2_write_super(c); 709 mutex_unlock(&c->sb_lock); 710 } 711 712 if (btree_err_on(BCH_VERSION_MAJOR(version) > 713 BCH_VERSION_MAJOR(c->sb.version), 714 -BCH_ERR_btree_node_read_err_fixable, 715 c, NULL, b, i, NULL, 716 btree_node_bset_newer_than_sb, 717 "bset version %u newer than superblock version %u", 718 version, c->sb.version)) { 719 mutex_lock(&c->sb_lock); 720 c->disk_sb.sb->version = cpu_to_le16(version); 721 bch2_write_super(c); 722 mutex_unlock(&c->sb_lock); 723 } 724 725 btree_err_on(BSET_SEPARATE_WHITEOUTS(i), 726 -BCH_ERR_btree_node_read_err_incompatible, 727 c, ca, b, i, NULL, 728 btree_node_unsupported_version, 729 "BSET_SEPARATE_WHITEOUTS no longer supported"); 730 731 if (!write && 732 btree_err_on(offset + sectors > (ptr_written ?: btree_sectors(c)), 733 -BCH_ERR_btree_node_read_err_fixable, 734 c, ca, b, i, NULL, 735 bset_past_end_of_btree_node, 736 "bset past end of btree node (offset %u len %u but written %zu)", 737 offset, sectors, ptr_written ?: 
btree_sectors(c))) 738 i->u64s = 0; 739 740 btree_err_on(offset && !i->u64s, 741 -BCH_ERR_btree_node_read_err_fixable, 742 c, ca, b, i, NULL, 743 bset_empty, 744 "empty bset"); 745 746 btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset, 747 -BCH_ERR_btree_node_read_err_want_retry, 748 c, ca, b, i, NULL, 749 bset_wrong_sector_offset, 750 "bset at wrong sector offset"); 751 752 if (!offset) { 753 struct btree_node *bn = 754 container_of(i, struct btree_node, keys); 755 /* These indicate that we read the wrong btree node: */ 756 757 if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { 758 struct bch_btree_ptr_v2 *bp = 759 &bkey_i_to_btree_ptr_v2(&b->key)->v; 760 761 /* XXX endianness */ 762 btree_err_on(bp->seq != bn->keys.seq, 763 -BCH_ERR_btree_node_read_err_must_retry, 764 c, ca, b, NULL, NULL, 765 bset_bad_seq, 766 "incorrect sequence number (wrong btree node)"); 767 } 768 769 btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, 770 -BCH_ERR_btree_node_read_err_must_retry, 771 c, ca, b, i, NULL, 772 btree_node_bad_btree, 773 "incorrect btree id"); 774 775 btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level, 776 -BCH_ERR_btree_node_read_err_must_retry, 777 c, ca, b, i, NULL, 778 btree_node_bad_level, 779 "incorrect level"); 780 781 if (!write) 782 compat_btree_node(b->c.level, b->c.btree_id, version, 783 BSET_BIG_ENDIAN(i), write, bn); 784 785 if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { 786 struct bch_btree_ptr_v2 *bp = 787 &bkey_i_to_btree_ptr_v2(&b->key)->v; 788 789 if (BTREE_PTR_RANGE_UPDATED(bp)) { 790 b->data->min_key = bp->min_key; 791 b->data->max_key = b->key.k.p; 792 } 793 794 btree_err_on(!bpos_eq(b->data->min_key, bp->min_key), 795 -BCH_ERR_btree_node_read_err_must_retry, 796 c, ca, b, NULL, NULL, 797 btree_node_bad_min_key, 798 "incorrect min_key: got %s should be %s", 799 (printbuf_reset(&buf1), 800 bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf), 801 (printbuf_reset(&buf2), 802 bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf)); 803 } 804 805 btree_err_on(!bpos_eq(bn->max_key, b->key.k.p), 806 -BCH_ERR_btree_node_read_err_must_retry, 807 c, ca, b, i, NULL, 808 btree_node_bad_max_key, 809 "incorrect max key %s", 810 (printbuf_reset(&buf1), 811 bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf)); 812 813 if (write) 814 compat_btree_node(b->c.level, b->c.btree_id, version, 815 BSET_BIG_ENDIAN(i), write, bn); 816 817 btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1), 818 -BCH_ERR_btree_node_read_err_bad_node, 819 c, ca, b, i, NULL, 820 btree_node_bad_format, 821 "invalid bkey format: %s\n%s", buf1.buf, 822 (printbuf_reset(&buf2), 823 bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf)); 824 printbuf_reset(&buf1); 825 826 compat_bformat(b->c.level, b->c.btree_id, version, 827 BSET_BIG_ENDIAN(i), write, 828 &bn->format); 829 } 830 fsck_err: 831 printbuf_exit(&buf2); 832 printbuf_exit(&buf1); 833 return ret; 834 } 835 836 static int btree_node_bkey_val_validate(struct bch_fs *c, struct btree *b, 837 struct bkey_s_c k, 838 enum bch_validate_flags flags) 839 { 840 return bch2_bkey_val_validate(c, k, (struct bkey_validate_context) { 841 .from = BKEY_VALIDATE_btree_node, 842 .level = b->c.level, 843 .btree = b->c.btree_id, 844 .flags = flags 845 }); 846 } 847 848 static int bset_key_validate(struct bch_fs *c, struct btree *b, 849 struct bkey_s_c k, 850 bool updated_range, 851 enum bch_validate_flags flags) 852 { 853 struct bkey_validate_context from = (struct bkey_validate_context) { 854 .from = BKEY_VALIDATE_btree_node, 855 .level = b->c.level, 856 .btree = b->c.btree_id, 
857 .flags = flags, 858 }; 859 return __bch2_bkey_validate(c, k, from) ?: 860 (!updated_range ? bch2_bkey_in_btree_node(c, b, k, from) : 0) ?: 861 (flags & BCH_VALIDATE_write ? btree_node_bkey_val_validate(c, b, k, flags) : 0); 862 } 863 864 static bool bkey_packed_valid(struct bch_fs *c, struct btree *b, 865 struct bset *i, struct bkey_packed *k) 866 { 867 if (bkey_p_next(k) > vstruct_last(i)) 868 return false; 869 870 if (k->format > KEY_FORMAT_CURRENT) 871 return false; 872 873 if (!bkeyp_u64s_valid(&b->format, k)) 874 return false; 875 876 struct bkey tmp; 877 struct bkey_s u = __bkey_disassemble(b, k, &tmp); 878 return !__bch2_bkey_validate(c, u.s_c, 879 (struct bkey_validate_context) { 880 .from = BKEY_VALIDATE_btree_node, 881 .level = b->c.level, 882 .btree = b->c.btree_id, 883 .flags = BCH_VALIDATE_silent 884 }); 885 } 886 887 static inline int btree_node_read_bkey_cmp(const struct btree *b, 888 const struct bkey_packed *l, 889 const struct bkey_packed *r) 890 { 891 return bch2_bkey_cmp_packed(b, l, r) 892 ?: (int) bkey_deleted(r) - (int) bkey_deleted(l); 893 } 894 895 static int validate_bset_keys(struct bch_fs *c, struct btree *b, 896 struct bset *i, int write, 897 bool have_retry, bool *saw_error) 898 { 899 unsigned version = le16_to_cpu(i->version); 900 struct bkey_packed *k, *prev = NULL; 901 struct printbuf buf = PRINTBUF; 902 bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && 903 BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); 904 int ret = 0; 905 906 for (k = i->start; 907 k != vstruct_last(i);) { 908 struct bkey_s u; 909 struct bkey tmp; 910 unsigned next_good_key; 911 912 if (btree_err_on(bkey_p_next(k) > vstruct_last(i), 913 -BCH_ERR_btree_node_read_err_fixable, 914 c, NULL, b, i, k, 915 btree_node_bkey_past_bset_end, 916 "key extends past end of bset")) { 917 i->u64s = cpu_to_le16((u64 *) k - i->_data); 918 break; 919 } 920 921 if (btree_err_on(k->format > KEY_FORMAT_CURRENT, 922 -BCH_ERR_btree_node_read_err_fixable, 923 c, NULL, b, i, k, 924 btree_node_bkey_bad_format, 925 "invalid bkey format %u", k->format)) 926 goto drop_this_key; 927 928 if (btree_err_on(!bkeyp_u64s_valid(&b->format, k), 929 -BCH_ERR_btree_node_read_err_fixable, 930 c, NULL, b, i, k, 931 btree_node_bkey_bad_u64s, 932 "bad k->u64s %u (min %u max %zu)", k->u64s, 933 bkeyp_key_u64s(&b->format, k), 934 U8_MAX - BKEY_U64s + bkeyp_key_u64s(&b->format, k))) 935 goto drop_this_key; 936 937 if (!write) 938 bch2_bkey_compat(b->c.level, b->c.btree_id, version, 939 BSET_BIG_ENDIAN(i), write, 940 &b->format, k); 941 942 u = __bkey_disassemble(b, k, &tmp); 943 944 ret = bset_key_validate(c, b, u.s_c, updated_range, write); 945 if (ret == -BCH_ERR_fsck_delete_bkey) 946 goto drop_this_key; 947 if (ret) 948 goto fsck_err; 949 950 if (write) 951 bch2_bkey_compat(b->c.level, b->c.btree_id, version, 952 BSET_BIG_ENDIAN(i), write, 953 &b->format, k); 954 955 if (prev && btree_node_read_bkey_cmp(b, prev, k) >= 0) { 956 struct bkey up = bkey_unpack_key(b, prev); 957 958 printbuf_reset(&buf); 959 prt_printf(&buf, "keys out of order: "); 960 bch2_bkey_to_text(&buf, &up); 961 prt_printf(&buf, " > "); 962 bch2_bkey_to_text(&buf, u.k); 963 964 if (btree_err(-BCH_ERR_btree_node_read_err_fixable, 965 c, NULL, b, i, k, 966 btree_node_bkey_out_of_order, 967 "%s", buf.buf)) 968 goto drop_this_key; 969 } 970 971 prev = k; 972 k = bkey_p_next(k); 973 continue; 974 drop_this_key: 975 next_good_key = k->u64s; 976 977 if (!next_good_key || 978 (BSET_BIG_ENDIAN(i) == CPU_BIG_ENDIAN && 979 version >= 
bcachefs_metadata_version_snapshot)) { 980 /* 981 * only do scanning if bch2_bkey_compat() has nothing to 982 * do 983 */ 984 985 if (!bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) { 986 for (next_good_key = 1; 987 next_good_key < (u64 *) vstruct_last(i) - (u64 *) k; 988 next_good_key++) 989 if (bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) 990 goto got_good_key; 991 } 992 993 /* 994 * didn't find a good key, have to truncate the rest of 995 * the bset 996 */ 997 next_good_key = (u64 *) vstruct_last(i) - (u64 *) k; 998 } 999 got_good_key: 1000 le16_add_cpu(&i->u64s, -next_good_key); 1001 memmove_u64s_down(k, (u64 *) k + next_good_key, (u64 *) vstruct_end(i) - (u64 *) k); 1002 set_btree_node_need_rewrite(b); 1003 } 1004 fsck_err: 1005 printbuf_exit(&buf); 1006 return ret; 1007 } 1008 1009 int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, 1010 struct btree *b, bool have_retry, bool *saw_error) 1011 { 1012 struct btree_node_entry *bne; 1013 struct sort_iter *iter; 1014 struct btree_node *sorted; 1015 struct bkey_packed *k; 1016 struct bset *i; 1017 bool used_mempool, blacklisted; 1018 bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && 1019 BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); 1020 unsigned u64s; 1021 unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); 1022 u64 max_journal_seq = 0; 1023 struct printbuf buf = PRINTBUF; 1024 int ret = 0, retry_read = 0, write = READ; 1025 u64 start_time = local_clock(); 1026 1027 b->version_ondisk = U16_MAX; 1028 /* We might get called multiple times on read retry: */ 1029 b->written = 0; 1030 1031 iter = mempool_alloc(&c->fill_iter, GFP_NOFS); 1032 sort_iter_init(iter, b, (btree_blocks(c) + 1) * 2); 1033 1034 if (bch2_meta_read_fault("btree")) 1035 btree_err(-BCH_ERR_btree_node_read_err_must_retry, 1036 c, ca, b, NULL, NULL, 1037 btree_node_fault_injected, 1038 "dynamic fault"); 1039 1040 btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), 1041 -BCH_ERR_btree_node_read_err_must_retry, 1042 c, ca, b, NULL, NULL, 1043 btree_node_bad_magic, 1044 "bad magic: want %llx, got %llx", 1045 bset_magic(c), le64_to_cpu(b->data->magic)); 1046 1047 if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { 1048 struct bch_btree_ptr_v2 *bp = 1049 &bkey_i_to_btree_ptr_v2(&b->key)->v; 1050 1051 bch2_bpos_to_text(&buf, b->data->min_key); 1052 prt_str(&buf, "-"); 1053 bch2_bpos_to_text(&buf, b->data->max_key); 1054 1055 btree_err_on(b->data->keys.seq != bp->seq, 1056 -BCH_ERR_btree_node_read_err_must_retry, 1057 c, ca, b, NULL, NULL, 1058 btree_node_bad_seq, 1059 "got wrong btree node: got\n%s", 1060 (printbuf_reset(&buf), 1061 bch2_btree_node_header_to_text(&buf, b->data), 1062 buf.buf)); 1063 } else { 1064 btree_err_on(!b->data->keys.seq, 1065 -BCH_ERR_btree_node_read_err_must_retry, 1066 c, ca, b, NULL, NULL, 1067 btree_node_bad_seq, 1068 "bad btree header: seq 0\n%s", 1069 (printbuf_reset(&buf), 1070 bch2_btree_node_header_to_text(&buf, b->data), 1071 buf.buf)); 1072 } 1073 1074 while (b->written < (ptr_written ?: btree_sectors(c))) { 1075 unsigned sectors; 1076 bool first = !b->written; 1077 1078 if (first) { 1079 bne = NULL; 1080 i = &b->data->keys; 1081 } else { 1082 bne = write_block(b); 1083 i = &bne->keys; 1084 1085 if (i->seq != b->data->keys.seq) 1086 break; 1087 } 1088 1089 struct nonce nonce = btree_nonce(i, b->written << 9); 1090 bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)); 1091 1092 btree_err_on(!good_csum_type, 1093 
bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) 1094 ? -BCH_ERR_btree_node_read_err_must_retry 1095 : -BCH_ERR_btree_node_read_err_want_retry, 1096 c, ca, b, i, NULL, 1097 bset_unknown_csum, 1098 "unknown checksum type %llu", BSET_CSUM_TYPE(i)); 1099 1100 if (first) { 1101 if (good_csum_type) { 1102 struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); 1103 bool csum_bad = bch2_crc_cmp(b->data->csum, csum); 1104 if (csum_bad) 1105 bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); 1106 1107 btree_err_on(csum_bad, 1108 -BCH_ERR_btree_node_read_err_want_retry, 1109 c, ca, b, i, NULL, 1110 bset_bad_csum, 1111 "%s", 1112 (printbuf_reset(&buf), 1113 bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum), 1114 buf.buf)); 1115 1116 ret = bset_encrypt(c, i, b->written << 9); 1117 if (bch2_fs_fatal_err_on(ret, c, 1118 "decrypting btree node: %s", bch2_err_str(ret))) 1119 goto fsck_err; 1120 } 1121 1122 btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && 1123 !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), 1124 -BCH_ERR_btree_node_read_err_incompatible, 1125 c, NULL, b, NULL, NULL, 1126 btree_node_unsupported_version, 1127 "btree node does not have NEW_EXTENT_OVERWRITE set"); 1128 1129 sectors = vstruct_sectors(b->data, c->block_bits); 1130 } else { 1131 if (good_csum_type) { 1132 struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); 1133 bool csum_bad = bch2_crc_cmp(bne->csum, csum); 1134 if (ca && csum_bad) 1135 bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); 1136 1137 btree_err_on(csum_bad, 1138 -BCH_ERR_btree_node_read_err_want_retry, 1139 c, ca, b, i, NULL, 1140 bset_bad_csum, 1141 "%s", 1142 (printbuf_reset(&buf), 1143 bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum), 1144 buf.buf)); 1145 1146 ret = bset_encrypt(c, i, b->written << 9); 1147 if (bch2_fs_fatal_err_on(ret, c, 1148 "decrypting btree node: %s", bch2_err_str(ret))) 1149 goto fsck_err; 1150 } 1151 1152 sectors = vstruct_sectors(bne, c->block_bits); 1153 } 1154 1155 b->version_ondisk = min(b->version_ondisk, 1156 le16_to_cpu(i->version)); 1157 1158 ret = validate_bset(c, ca, b, i, b->written, sectors, 1159 READ, have_retry, saw_error); 1160 if (ret) 1161 goto fsck_err; 1162 1163 if (!b->written) 1164 btree_node_set_format(b, b->data->format); 1165 1166 ret = validate_bset_keys(c, b, i, READ, have_retry, saw_error); 1167 if (ret) 1168 goto fsck_err; 1169 1170 SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); 1171 1172 blacklisted = bch2_journal_seq_is_blacklisted(c, 1173 le64_to_cpu(i->journal_seq), 1174 true); 1175 1176 btree_err_on(blacklisted && first, 1177 -BCH_ERR_btree_node_read_err_fixable, 1178 c, ca, b, i, NULL, 1179 bset_blacklisted_journal_seq, 1180 "first btree node bset has blacklisted journal seq (%llu)", 1181 le64_to_cpu(i->journal_seq)); 1182 1183 btree_err_on(blacklisted && ptr_written, 1184 -BCH_ERR_btree_node_read_err_fixable, 1185 c, ca, b, i, NULL, 1186 first_bset_blacklisted_journal_seq, 1187 "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u", 1188 le64_to_cpu(i->journal_seq), 1189 b->written, b->written + sectors, ptr_written); 1190 1191 b->written = min(b->written + sectors, btree_sectors(c)); 1192 1193 if (blacklisted && !first) 1194 continue; 1195 1196 sort_iter_add(iter, 1197 vstruct_idx(i, 0), 1198 vstruct_last(i)); 1199 1200 max_journal_seq = max(max_journal_seq, le64_to_cpu(i->journal_seq)); 1201 } 1202 1203 if (ptr_written) { 1204 btree_err_on(b->written < ptr_written, 1205 -BCH_ERR_btree_node_read_err_want_retry, 1206 c, ca, b, 
NULL, NULL, 1207 btree_node_data_missing, 1208 "btree node data missing: expected %u sectors, found %u", 1209 ptr_written, b->written); 1210 } else { 1211 for (bne = write_block(b); 1212 bset_byte_offset(b, bne) < btree_buf_bytes(b); 1213 bne = (void *) bne + block_bytes(c)) 1214 btree_err_on(bne->keys.seq == b->data->keys.seq && 1215 !bch2_journal_seq_is_blacklisted(c, 1216 le64_to_cpu(bne->keys.journal_seq), 1217 true), 1218 -BCH_ERR_btree_node_read_err_want_retry, 1219 c, ca, b, NULL, NULL, 1220 btree_node_bset_after_end, 1221 "found bset signature after last bset"); 1222 } 1223 1224 sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool); 1225 sorted->keys.u64s = 0; 1226 1227 set_btree_bset(b, b->set, &b->data->keys); 1228 1229 b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter); 1230 memset((uint8_t *)(sorted + 1) + b->nr.live_u64s * sizeof(u64), 0, 1231 btree_buf_bytes(b) - 1232 sizeof(struct btree_node) - 1233 b->nr.live_u64s * sizeof(u64)); 1234 1235 u64s = le16_to_cpu(sorted->keys.u64s); 1236 *sorted = *b->data; 1237 sorted->keys.u64s = cpu_to_le16(u64s); 1238 swap(sorted, b->data); 1239 set_btree_bset(b, b->set, &b->data->keys); 1240 b->nsets = 1; 1241 b->data->keys.journal_seq = cpu_to_le64(max_journal_seq); 1242 1243 BUG_ON(b->nr.live_u64s != u64s); 1244 1245 btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted); 1246 1247 if (updated_range) 1248 bch2_btree_node_drop_keys_outside_node(b); 1249 1250 i = &b->data->keys; 1251 for (k = i->start; k != vstruct_last(i);) { 1252 struct bkey tmp; 1253 struct bkey_s u = __bkey_disassemble(b, k, &tmp); 1254 1255 ret = btree_node_bkey_val_validate(c, b, u.s_c, READ); 1256 if (ret == -BCH_ERR_fsck_delete_bkey || 1257 (bch2_inject_invalid_keys && 1258 !bversion_cmp(u.k->bversion, MAX_VERSION))) { 1259 btree_keys_account_key_drop(&b->nr, 0, k); 1260 1261 i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); 1262 memmove_u64s_down(k, bkey_p_next(k), 1263 (u64 *) vstruct_end(i) - (u64 *) k); 1264 set_btree_bset_end(b, b->set); 1265 set_btree_node_need_rewrite(b); 1266 continue; 1267 } 1268 if (ret) 1269 goto fsck_err; 1270 1271 if (u.k->type == KEY_TYPE_btree_ptr_v2) { 1272 struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u); 1273 1274 bp.v->mem_ptr = 0; 1275 } 1276 1277 k = bkey_p_next(k); 1278 } 1279 1280 bch2_bset_build_aux_tree(b, b->set, false); 1281 1282 set_needs_whiteout(btree_bset_first(b), true); 1283 1284 btree_node_reset_sib_u64s(b); 1285 1286 rcu_read_lock(); 1287 bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { 1288 struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev); 1289 1290 if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) 1291 set_btree_node_need_rewrite(b); 1292 } 1293 rcu_read_unlock(); 1294 1295 if (!ptr_written) 1296 set_btree_node_need_rewrite(b); 1297 out: 1298 mempool_free(iter, &c->fill_iter); 1299 printbuf_exit(&buf); 1300 bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time); 1301 return retry_read; 1302 fsck_err: 1303 if (ret == -BCH_ERR_btree_node_read_err_want_retry || 1304 ret == -BCH_ERR_btree_node_read_err_must_retry) { 1305 retry_read = 1; 1306 } else { 1307 set_btree_node_read_error(b); 1308 bch2_btree_lost_data(c, b->c.btree_id); 1309 } 1310 goto out; 1311 } 1312 1313 static void btree_node_read_work(struct work_struct *work) 1314 { 1315 struct btree_read_bio *rb = 1316 container_of(work, struct btree_read_bio, work); 1317 struct bch_fs *c = rb->c; 1318 struct bch_dev *ca = rb->have_ioref ? 
bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL; 1319 struct btree *b = rb->b; 1320 struct bio *bio = &rb->bio; 1321 struct bch_io_failures failed = { .nr = 0 }; 1322 struct printbuf buf = PRINTBUF; 1323 bool saw_error = false; 1324 bool retry = false; 1325 bool can_retry; 1326 1327 goto start; 1328 while (1) { 1329 retry = true; 1330 bch_info(c, "retrying read"); 1331 ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ); 1332 rb->have_ioref = ca != NULL; 1333 rb->start_time = local_clock(); 1334 bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META); 1335 bio->bi_iter.bi_sector = rb->pick.ptr.offset; 1336 bio->bi_iter.bi_size = btree_buf_bytes(b); 1337 1338 if (rb->have_ioref) { 1339 bio_set_dev(bio, ca->disk_sb.bdev); 1340 submit_bio_wait(bio); 1341 } else { 1342 bio->bi_status = BLK_STS_REMOVED; 1343 } 1344 1345 bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, 1346 rb->start_time, !bio->bi_status); 1347 start: 1348 printbuf_reset(&buf); 1349 bch2_btree_pos_to_text(&buf, c, b); 1350 1351 if (ca && bio->bi_status) 1352 bch_err_dev_ratelimited(ca, 1353 "btree read error %s for %s", 1354 bch2_blk_status_to_str(bio->bi_status), buf.buf); 1355 if (rb->have_ioref) 1356 percpu_ref_put(&ca->io_ref[READ]); 1357 rb->have_ioref = false; 1358 1359 bch2_mark_io_failure(&failed, &rb->pick, false); 1360 1361 can_retry = bch2_bkey_pick_read_device(c, 1362 bkey_i_to_s_c(&b->key), 1363 &failed, &rb->pick, -1) > 0; 1364 1365 if (!bio->bi_status && 1366 !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) { 1367 if (retry) 1368 bch_info(c, "retry success"); 1369 break; 1370 } 1371 1372 saw_error = true; 1373 1374 if (!can_retry) { 1375 set_btree_node_read_error(b); 1376 bch2_btree_lost_data(c, b->c.btree_id); 1377 break; 1378 } 1379 } 1380 1381 bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], 1382 rb->start_time); 1383 bio_put(&rb->bio); 1384 1385 if ((saw_error || 1386 btree_node_need_rewrite(b)) && 1387 !btree_node_read_error(b) && 1388 c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) { 1389 if (saw_error) { 1390 printbuf_reset(&buf); 1391 bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); 1392 prt_str(&buf, " "); 1393 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); 1394 bch_err_ratelimited(c, "%s: rewriting btree node at due to error\n %s", 1395 __func__, buf.buf); 1396 } 1397 1398 bch2_btree_node_rewrite_async(c, b); 1399 } 1400 1401 printbuf_exit(&buf); 1402 clear_btree_node_read_in_flight(b); 1403 wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); 1404 } 1405 1406 static void btree_node_read_endio(struct bio *bio) 1407 { 1408 struct btree_read_bio *rb = 1409 container_of(bio, struct btree_read_bio, bio); 1410 struct bch_fs *c = rb->c; 1411 struct bch_dev *ca = rb->have_ioref 1412 ? 
		bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL;

	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
				   rb->start_time, !bio->bi_status);

	queue_work(c->btree_read_complete_wq, &rb->work);
}

struct btree_node_read_all {
	struct closure		cl;
	struct bch_fs		*c;
	struct btree		*b;
	unsigned		nr;
	void			*buf[BCH_REPLICAS_MAX];
	struct bio		*bio[BCH_REPLICAS_MAX];
	blk_status_t		err[BCH_REPLICAS_MAX];
};

static unsigned btree_node_sectors_written(struct bch_fs *c, void *data)
{
	struct btree_node *bn = data;
	struct btree_node_entry *bne;
	unsigned offset = 0;

	if (le64_to_cpu(bn->magic) != bset_magic(c))
		return 0;

	while (offset < btree_sectors(c)) {
		if (!offset) {
			offset += vstruct_sectors(bn, c->block_bits);
		} else {
			bne = data + (offset << 9);
			if (bne->keys.seq != bn->keys.seq)
				break;
			offset += vstruct_sectors(bne, c->block_bits);
		}
	}

	return offset;
}

static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *data)
{
	struct btree_node *bn = data;
	struct btree_node_entry *bne;

	if (!offset)
		return false;

	while (offset < btree_sectors(c)) {
		bne = data + (offset << 9);
		if (bne->keys.seq == bn->keys.seq)
			return true;
		offset++;
	}

	return false;
}

static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
{
	closure_type(ra, struct btree_node_read_all, cl);
	struct bch_fs *c = ra->c;
	struct btree *b = ra->b;
	struct printbuf buf = PRINTBUF;
	bool dump_bset_maps = false;
	bool have_retry = false;
	int ret = 0, best = -1, write = READ;
	unsigned i, written = 0, written2 = 0;
	__le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
		?
bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0; 1484 bool _saw_error = false, *saw_error = &_saw_error; 1485 1486 for (i = 0; i < ra->nr; i++) { 1487 struct btree_node *bn = ra->buf[i]; 1488 1489 if (ra->err[i]) 1490 continue; 1491 1492 if (le64_to_cpu(bn->magic) != bset_magic(c) || 1493 (seq && seq != bn->keys.seq)) 1494 continue; 1495 1496 if (best < 0) { 1497 best = i; 1498 written = btree_node_sectors_written(c, bn); 1499 continue; 1500 } 1501 1502 written2 = btree_node_sectors_written(c, ra->buf[i]); 1503 if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable, 1504 c, NULL, b, NULL, NULL, 1505 btree_node_replicas_sectors_written_mismatch, 1506 "btree node sectors written mismatch: %u != %u", 1507 written, written2) || 1508 btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]), 1509 -BCH_ERR_btree_node_read_err_fixable, 1510 c, NULL, b, NULL, NULL, 1511 btree_node_bset_after_end, 1512 "found bset signature after last bset") || 1513 btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9), 1514 -BCH_ERR_btree_node_read_err_fixable, 1515 c, NULL, b, NULL, NULL, 1516 btree_node_replicas_data_mismatch, 1517 "btree node replicas content mismatch")) 1518 dump_bset_maps = true; 1519 1520 if (written2 > written) { 1521 written = written2; 1522 best = i; 1523 } 1524 } 1525 fsck_err: 1526 if (dump_bset_maps) { 1527 for (i = 0; i < ra->nr; i++) { 1528 struct btree_node *bn = ra->buf[i]; 1529 struct btree_node_entry *bne = NULL; 1530 unsigned offset = 0, sectors; 1531 bool gap = false; 1532 1533 if (ra->err[i]) 1534 continue; 1535 1536 printbuf_reset(&buf); 1537 1538 while (offset < btree_sectors(c)) { 1539 if (!offset) { 1540 sectors = vstruct_sectors(bn, c->block_bits); 1541 } else { 1542 bne = ra->buf[i] + (offset << 9); 1543 if (bne->keys.seq != bn->keys.seq) 1544 break; 1545 sectors = vstruct_sectors(bne, c->block_bits); 1546 } 1547 1548 prt_printf(&buf, " %u-%u", offset, offset + sectors); 1549 if (bne && bch2_journal_seq_is_blacklisted(c, 1550 le64_to_cpu(bne->keys.journal_seq), false)) 1551 prt_printf(&buf, "*"); 1552 offset += sectors; 1553 } 1554 1555 while (offset < btree_sectors(c)) { 1556 bne = ra->buf[i] + (offset << 9); 1557 if (bne->keys.seq == bn->keys.seq) { 1558 if (!gap) 1559 prt_printf(&buf, " GAP"); 1560 gap = true; 1561 1562 sectors = vstruct_sectors(bne, c->block_bits); 1563 prt_printf(&buf, " %u-%u", offset, offset + sectors); 1564 if (bch2_journal_seq_is_blacklisted(c, 1565 le64_to_cpu(bne->keys.journal_seq), false)) 1566 prt_printf(&buf, "*"); 1567 } 1568 offset++; 1569 } 1570 1571 bch_err(c, "replica %u:%s", i, buf.buf); 1572 } 1573 } 1574 1575 if (best >= 0) { 1576 memcpy(b->data, ra->buf[best], btree_buf_bytes(b)); 1577 ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error); 1578 } else { 1579 ret = -1; 1580 } 1581 1582 if (ret) { 1583 set_btree_node_read_error(b); 1584 bch2_btree_lost_data(c, b->c.btree_id); 1585 } else if (*saw_error) 1586 bch2_btree_node_rewrite_async(c, b); 1587 1588 for (i = 0; i < ra->nr; i++) { 1589 mempool_free(ra->buf[i], &c->btree_bounce_pool); 1590 bio_put(ra->bio[i]); 1591 } 1592 1593 closure_debug_destroy(&ra->cl); 1594 kfree(ra); 1595 printbuf_exit(&buf); 1596 1597 clear_btree_node_read_in_flight(b); 1598 wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); 1599 } 1600 1601 static void btree_node_read_all_replicas_endio(struct bio *bio) 1602 { 1603 struct btree_read_bio *rb = 1604 container_of(bio, struct btree_read_bio, bio); 1605 struct bch_fs *c = rb->c; 1606 struct btree_node_read_all *ra = rb->ra; 
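	/*
	 * Record this replica's completion status and drop our refs on the
	 * device and on the parent closure; once every replica's bio has
	 * completed, the closure's done callback compares the copies.
	 */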
1607 1608 if (rb->have_ioref) { 1609 struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev); 1610 1611 bch2_latency_acct(ca, rb->start_time, READ); 1612 percpu_ref_put(&ca->io_ref[READ]); 1613 } 1614 1615 ra->err[rb->idx] = bio->bi_status; 1616 closure_put(&ra->cl); 1617 } 1618 1619 /* 1620 * XXX This allocates multiple times from the same mempools, and can deadlock 1621 * under sufficient memory pressure (but is only a debug path) 1622 */ 1623 static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool sync) 1624 { 1625 struct bkey_s_c k = bkey_i_to_s_c(&b->key); 1626 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 1627 const union bch_extent_entry *entry; 1628 struct extent_ptr_decoded pick; 1629 struct btree_node_read_all *ra; 1630 unsigned i; 1631 1632 ra = kzalloc(sizeof(*ra), GFP_NOFS); 1633 if (!ra) 1634 return -BCH_ERR_ENOMEM_btree_node_read_all_replicas; 1635 1636 closure_init(&ra->cl, NULL); 1637 ra->c = c; 1638 ra->b = b; 1639 ra->nr = bch2_bkey_nr_ptrs(k); 1640 1641 for (i = 0; i < ra->nr; i++) { 1642 ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); 1643 ra->bio[i] = bio_alloc_bioset(NULL, 1644 buf_pages(ra->buf[i], btree_buf_bytes(b)), 1645 REQ_OP_READ|REQ_SYNC|REQ_META, 1646 GFP_NOFS, 1647 &c->btree_bio); 1648 } 1649 1650 i = 0; 1651 bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) { 1652 struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); 1653 struct btree_read_bio *rb = 1654 container_of(ra->bio[i], struct btree_read_bio, bio); 1655 rb->c = c; 1656 rb->b = b; 1657 rb->ra = ra; 1658 rb->start_time = local_clock(); 1659 rb->have_ioref = ca != NULL; 1660 rb->idx = i; 1661 rb->pick = pick; 1662 rb->bio.bi_iter.bi_sector = pick.ptr.offset; 1663 rb->bio.bi_end_io = btree_node_read_all_replicas_endio; 1664 bch2_bio_map(&rb->bio, ra->buf[i], btree_buf_bytes(b)); 1665 1666 if (rb->have_ioref) { 1667 this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], 1668 bio_sectors(&rb->bio)); 1669 bio_set_dev(&rb->bio, ca->disk_sb.bdev); 1670 1671 closure_get(&ra->cl); 1672 submit_bio(&rb->bio); 1673 } else { 1674 ra->err[i] = BLK_STS_REMOVED; 1675 } 1676 1677 i++; 1678 } 1679 1680 if (sync) { 1681 closure_sync(&ra->cl); 1682 btree_node_read_all_replicas_done(&ra->cl.work); 1683 } else { 1684 continue_at(&ra->cl, btree_node_read_all_replicas_done, 1685 c->btree_read_complete_wq); 1686 } 1687 1688 return 0; 1689 } 1690 1691 void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, 1692 bool sync) 1693 { 1694 struct bch_fs *c = trans->c; 1695 struct extent_ptr_decoded pick; 1696 struct btree_read_bio *rb; 1697 struct bch_dev *ca; 1698 struct bio *bio; 1699 int ret; 1700 1701 trace_and_count(c, btree_node_read, trans, b); 1702 1703 if (bch2_verify_all_btree_replicas && 1704 !btree_node_read_all_replicas(c, b, sync)) 1705 return; 1706 1707 ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), 1708 NULL, &pick, -1); 1709 1710 if (ret <= 0) { 1711 struct printbuf buf = PRINTBUF; 1712 1713 prt_str(&buf, "btree node read error: no device to read from\n at "); 1714 bch2_btree_pos_to_text(&buf, c, b); 1715 bch_err_ratelimited(c, "%s", buf.buf); 1716 1717 if (c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && 1718 c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) 1719 bch2_fatal_error(c); 1720 1721 set_btree_node_read_error(b); 1722 bch2_btree_lost_data(c, b->c.btree_id); 1723 clear_btree_node_read_in_flight(b); 1724 wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); 1725 printbuf_exit(&buf); 1726 return; 1727 
} 1728 1729 ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); 1730 1731 bio = bio_alloc_bioset(NULL, 1732 buf_pages(b->data, btree_buf_bytes(b)), 1733 REQ_OP_READ|REQ_SYNC|REQ_META, 1734 GFP_NOFS, 1735 &c->btree_bio); 1736 rb = container_of(bio, struct btree_read_bio, bio); 1737 rb->c = c; 1738 rb->b = b; 1739 rb->ra = NULL; 1740 rb->start_time = local_clock(); 1741 rb->have_ioref = ca != NULL; 1742 rb->pick = pick; 1743 INIT_WORK(&rb->work, btree_node_read_work); 1744 bio->bi_iter.bi_sector = pick.ptr.offset; 1745 bio->bi_end_io = btree_node_read_endio; 1746 bch2_bio_map(bio, b->data, btree_buf_bytes(b)); 1747 1748 if (rb->have_ioref) { 1749 this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], 1750 bio_sectors(bio)); 1751 bio_set_dev(bio, ca->disk_sb.bdev); 1752 1753 if (sync) { 1754 submit_bio_wait(bio); 1755 bch2_latency_acct(ca, rb->start_time, READ); 1756 btree_node_read_work(&rb->work); 1757 } else { 1758 submit_bio(bio); 1759 } 1760 } else { 1761 bio->bi_status = BLK_STS_REMOVED; 1762 1763 if (sync) 1764 btree_node_read_work(&rb->work); 1765 else 1766 queue_work(c->btree_read_complete_wq, &rb->work); 1767 } 1768 } 1769 1770 static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, 1771 const struct bkey_i *k, unsigned level) 1772 { 1773 struct bch_fs *c = trans->c; 1774 struct closure cl; 1775 struct btree *b; 1776 int ret; 1777 1778 closure_init_stack(&cl); 1779 1780 do { 1781 ret = bch2_btree_cache_cannibalize_lock(trans, &cl); 1782 closure_sync(&cl); 1783 } while (ret); 1784 1785 b = bch2_btree_node_mem_alloc(trans, level != 0); 1786 bch2_btree_cache_cannibalize_unlock(trans); 1787 1788 BUG_ON(IS_ERR(b)); 1789 1790 bkey_copy(&b->key, k); 1791 BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id)); 1792 1793 set_btree_node_read_in_flight(b); 1794 1795 /* we can't pass the trans to read_done() for fsck errors, so it must be unlocked */ 1796 bch2_trans_unlock(trans); 1797 bch2_btree_node_read(trans, b, true); 1798 1799 if (btree_node_read_error(b)) { 1800 mutex_lock(&c->btree_cache.lock); 1801 bch2_btree_node_hash_remove(&c->btree_cache, b); 1802 mutex_unlock(&c->btree_cache.lock); 1803 1804 ret = -BCH_ERR_btree_node_read_error; 1805 goto err; 1806 } 1807 1808 bch2_btree_set_root_for_read(c, b); 1809 err: 1810 six_unlock_write(&b->c.lock); 1811 six_unlock_intent(&b->c.lock); 1812 1813 return ret; 1814 } 1815 1816 int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, 1817 const struct bkey_i *k, unsigned level) 1818 { 1819 return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level)); 1820 } 1821 1822 struct btree_node_scrub { 1823 struct bch_fs *c; 1824 struct bch_dev *ca; 1825 void *buf; 1826 bool used_mempool; 1827 unsigned written; 1828 1829 enum btree_id btree; 1830 unsigned level; 1831 struct bkey_buf key; 1832 __le64 seq; 1833 1834 struct work_struct work; 1835 struct bio bio; 1836 }; 1837 1838 static bool btree_node_scrub_check(struct bch_fs *c, struct btree_node *data, unsigned ptr_written, 1839 struct printbuf *err) 1840 { 1841 unsigned written = 0; 1842 1843 if (le64_to_cpu(data->magic) != bset_magic(c)) { 1844 prt_printf(err, "bad magic: want %llx, got %llx", 1845 bset_magic(c), le64_to_cpu(data->magic)); 1846 return false; 1847 } 1848 1849 while (written < (ptr_written ?: btree_sectors(c))) { 1850 struct btree_node_entry *bne; 1851 struct bset *i; 1852 bool first = !written; 1853 1854 if (first) { 1855 bne = NULL; 1856 i = &data->keys; 1857 } else { 1858 bne = (void *) data + (written << 9); 1859 i = &bne->keys; 1860 1861 
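			/*
			 * If the pointer doesn't record how many sectors were
			 * written, the end of the node is detected by the
			 * first bset whose sequence number doesn't match the
			 * node header; otherwise we check everything up to
			 * ptr_written.
			 */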
if (!ptr_written && i->seq != data->keys.seq) 1862 break; 1863 } 1864 1865 struct nonce nonce = btree_nonce(i, written << 9); 1866 bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)); 1867 1868 if (first) { 1869 if (good_csum_type) { 1870 struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, data); 1871 if (bch2_crc_cmp(data->csum, csum)) { 1872 bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), data->csum, csum); 1873 return false; 1874 } 1875 } 1876 1877 written += vstruct_sectors(data, c->block_bits); 1878 } else { 1879 if (good_csum_type) { 1880 struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); 1881 if (bch2_crc_cmp(bne->csum, csum)) { 1882 bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), bne->csum, csum); 1883 return false; 1884 } 1885 } 1886 1887 written += vstruct_sectors(bne, c->block_bits); 1888 } 1889 } 1890 1891 return true; 1892 } 1893 1894 static void btree_node_scrub_work(struct work_struct *work) 1895 { 1896 struct btree_node_scrub *scrub = container_of(work, struct btree_node_scrub, work); 1897 struct bch_fs *c = scrub->c; 1898 struct printbuf err = PRINTBUF; 1899 1900 __bch2_btree_pos_to_text(&err, c, scrub->btree, scrub->level, 1901 bkey_i_to_s_c(scrub->key.k)); 1902 prt_newline(&err); 1903 1904 if (!btree_node_scrub_check(c, scrub->buf, scrub->written, &err)) { 1905 struct btree_trans *trans = bch2_trans_get(c); 1906 1907 struct btree_iter iter; 1908 bch2_trans_node_iter_init(trans, &iter, scrub->btree, 1909 scrub->key.k->k.p, 0, scrub->level - 1, 0); 1910 1911 struct btree *b; 1912 int ret = lockrestart_do(trans, 1913 PTR_ERR_OR_ZERO(b = bch2_btree_iter_peek_node(trans, &iter))); 1914 if (ret) 1915 goto err; 1916 1917 if (bkey_i_to_btree_ptr_v2(&b->key)->v.seq == scrub->seq) { 1918 bch_err(c, "error validating btree node during scrub on %s at btree %s", 1919 scrub->ca->name, err.buf); 1920 1921 ret = bch2_btree_node_rewrite(trans, &iter, b, 0); 1922 } 1923 err: 1924 bch2_trans_iter_exit(trans, &iter); 1925 bch2_trans_begin(trans); 1926 bch2_trans_put(trans); 1927 } 1928 1929 printbuf_exit(&err); 1930 bch2_bkey_buf_exit(&scrub->key, c);; 1931 btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf); 1932 percpu_ref_put(&scrub->ca->io_ref[READ]); 1933 kfree(scrub); 1934 bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub); 1935 } 1936 1937 static void btree_node_scrub_endio(struct bio *bio) 1938 { 1939 struct btree_node_scrub *scrub = container_of(bio, struct btree_node_scrub, bio); 1940 1941 queue_work(scrub->c->btree_read_complete_wq, &scrub->work); 1942 } 1943 1944 int bch2_btree_node_scrub(struct btree_trans *trans, 1945 enum btree_id btree, unsigned level, 1946 struct bkey_s_c k, unsigned dev) 1947 { 1948 if (k.k->type != KEY_TYPE_btree_ptr_v2) 1949 return 0; 1950 1951 struct bch_fs *c = trans->c; 1952 1953 if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_node_scrub)) 1954 return -BCH_ERR_erofs_no_writes; 1955 1956 struct extent_ptr_decoded pick; 1957 int ret = bch2_bkey_pick_read_device(c, k, NULL, &pick, dev); 1958 if (ret <= 0) 1959 goto err; 1960 1961 struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); 1962 if (!ca) { 1963 ret = -BCH_ERR_device_offline; 1964 goto err; 1965 } 1966 1967 bool used_mempool = false; 1968 void *buf = btree_bounce_alloc(c, c->opts.btree_node_size, &used_mempool); 1969 1970 unsigned vecs = buf_pages(buf, c->opts.btree_node_size); 1971 1972 struct btree_node_scrub *scrub = 1973 kzalloc(sizeof(*scrub) + sizeof(struct bio_vec) * vecs, GFP_KERNEL); 1974 if (!scrub) { 1975 
ret = -ENOMEM; 1976 goto err_free; 1977 } 1978 1979 scrub->c = c; 1980 scrub->ca = ca; 1981 scrub->buf = buf; 1982 scrub->used_mempool = used_mempool; 1983 scrub->written = btree_ptr_sectors_written(k); 1984 1985 scrub->btree = btree; 1986 scrub->level = level; 1987 bch2_bkey_buf_init(&scrub->key); 1988 bch2_bkey_buf_reassemble(&scrub->key, c, k); 1989 scrub->seq = bkey_s_c_to_btree_ptr_v2(k).v->seq; 1990 1991 INIT_WORK(&scrub->work, btree_node_scrub_work); 1992 1993 bio_init(&scrub->bio, ca->disk_sb.bdev, scrub->bio.bi_inline_vecs, vecs, REQ_OP_READ); 1994 bch2_bio_map(&scrub->bio, scrub->buf, c->opts.btree_node_size); 1995 scrub->bio.bi_iter.bi_sector = pick.ptr.offset; 1996 scrub->bio.bi_end_io = btree_node_scrub_endio; 1997 submit_bio(&scrub->bio); 1998 return 0; 1999 err_free: 2000 btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf); 2001 percpu_ref_put(&ca->io_ref[READ]); 2002 err: 2003 bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub); 2004 return ret; 2005 } 2006 2007 static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, 2008 struct btree_write *w) 2009 { 2010 unsigned long old, new; 2011 2012 old = READ_ONCE(b->will_make_reachable); 2013 do { 2014 new = old; 2015 if (!(old & 1)) 2016 break; 2017 2018 new &= ~1UL; 2019 } while (!try_cmpxchg(&b->will_make_reachable, &old, new)); 2020 2021 if (old & 1) 2022 closure_put(&((struct btree_update *) new)->cl); 2023 2024 bch2_journal_pin_drop(&c->journal, &w->journal); 2025 } 2026 2027 static void __btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time) 2028 { 2029 struct btree_write *w = btree_prev_write(b); 2030 unsigned long old, new; 2031 unsigned type = 0; 2032 2033 bch2_btree_complete_write(c, b, w); 2034 2035 if (start_time) 2036 bch2_time_stats_update(&c->times[BCH_TIME_btree_node_write], start_time); 2037 2038 old = READ_ONCE(b->flags); 2039 do { 2040 new = old; 2041 2042 if ((old & (1U << BTREE_NODE_dirty)) && 2043 (old & (1U << BTREE_NODE_need_write)) && 2044 !(old & (1U << BTREE_NODE_never_write)) && 2045 !(old & (1U << BTREE_NODE_write_blocked)) && 2046 !(old & (1U << BTREE_NODE_will_make_reachable))) { 2047 new &= ~(1U << BTREE_NODE_dirty); 2048 new &= ~(1U << BTREE_NODE_need_write); 2049 new |= (1U << BTREE_NODE_write_in_flight); 2050 new |= (1U << BTREE_NODE_write_in_flight_inner); 2051 new |= (1U << BTREE_NODE_just_written); 2052 new ^= (1U << BTREE_NODE_write_idx); 2053 2054 type = new & BTREE_WRITE_TYPE_MASK; 2055 new &= ~BTREE_WRITE_TYPE_MASK; 2056 } else { 2057 new &= ~(1U << BTREE_NODE_write_in_flight); 2058 new &= ~(1U << BTREE_NODE_write_in_flight_inner); 2059 } 2060 } while (!try_cmpxchg(&b->flags, &old, new)); 2061 2062 if (new & (1U << BTREE_NODE_write_in_flight)) 2063 __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type); 2064 else 2065 wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); 2066 } 2067 2068 static void btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time) 2069 { 2070 struct btree_trans *trans = bch2_trans_get(c); 2071 2072 btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); 2073 2074 /* we don't need transaction context anymore after we got the lock. 
static void btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time)
{
	struct btree_trans *trans = bch2_trans_get(c);

	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);

	/* we don't need transaction context anymore after we got the lock. */
	bch2_trans_put(trans);
	__btree_node_write_done(c, b, start_time);
	six_unlock_read(&b->c.lock);
}

static void btree_node_write_work(struct work_struct *work)
{
	struct btree_write_bio *wbio =
		container_of(work, struct btree_write_bio, work);
	struct bch_fs *c = wbio->wbio.c;
	struct btree *b = wbio->wbio.bio.bi_private;
	u64 start_time = wbio->start_time;
	int ret = 0;

	btree_bounce_free(c,
		wbio->data_bytes,
		wbio->wbio.used_mempool,
		wbio->data);

	bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr,
		bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));

	if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
		ret = -BCH_ERR_btree_node_write_all_failed;
		goto err;
	}

	if (wbio->wbio.first_btree_write) {
		if (wbio->wbio.failed.nr) {

		}
	} else {
		ret = bch2_trans_do(c,
			bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
					BCH_WATERMARK_interior_updates|
					BCH_TRANS_COMMIT_journal_reclaim|
					BCH_TRANS_COMMIT_no_enospc|
					BCH_TRANS_COMMIT_no_check_rw,
					!wbio->wbio.failed.nr));
		if (ret)
			goto err;
	}
out:
	bio_put(&wbio->wbio.bio);
	btree_node_write_done(c, b, start_time);
	return;
err:
	set_btree_node_noevict(b);

	if (!bch2_err_matches(ret, EROFS)) {
		struct printbuf buf = PRINTBUF;
		prt_printf(&buf, "writing btree node: %s\n ", bch2_err_str(ret));
		bch2_btree_pos_to_text(&buf, c, b);
		bch2_fs_fatal_error(c, "%s", buf.buf);
		printbuf_exit(&buf);
	}
	goto out;
}
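/*
 * Per-replica write completion: I/O errors are recorded against the device and
 * added to the failed list; when the original (non-split) bio completes, the
 * node is handed off to btree_node_write_work().
 */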
static void btree_node_write_endio(struct bio *bio)
{
	struct bch_write_bio *wbio = to_wbio(bio);
	struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
	struct bch_write_bio *orig = parent ?: wbio;
	struct btree_write_bio *wb = container_of(orig, struct btree_write_bio, wbio);
	struct bch_fs *c = wbio->c;
	struct btree *b = wbio->bio.bi_private;
	struct bch_dev *ca = wbio->have_ioref ? bch2_dev_have_ref(c, wbio->dev) : NULL;

	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
				   wbio->submit_time, !bio->bi_status);

	if (ca && bio->bi_status) {
		struct printbuf buf = PRINTBUF;
		buf.atomic++;
		prt_printf(&buf, "btree write error: %s\n ",
			   bch2_blk_status_to_str(bio->bi_status));
		bch2_btree_pos_to_text(&buf, c, b);
		bch_err_dev_ratelimited(ca, "%s", buf.buf);
		printbuf_exit(&buf);
	}

	if (bio->bi_status) {
		unsigned long flags;
		spin_lock_irqsave(&c->btree_write_error_lock, flags);
		bch2_dev_list_add_dev(&orig->failed, wbio->dev);
		spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
	}

	/*
	 * XXX: we should be using io_ref[WRITE], but we aren't retrying failed
	 * btree writes yet (due to device removal/ro):
	 */
	if (wbio->have_ioref)
		percpu_ref_put(&ca->io_ref[READ]);

	if (parent) {
		bio_put(bio);
		bio_endio(&parent->bio);
		return;
	}

	clear_btree_node_write_in_flight_inner(b);
	wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner);
	INIT_WORK(&wb->work, btree_node_write_work);
	queue_work(c->btree_io_complete_wq, &wb->work);
}

static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
				   struct bset *i, unsigned sectors)
{
	bool saw_error;

	int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key),
				     (struct bkey_validate_context) {
					.from	= BKEY_VALIDATE_btree_node,
					.level	= b->c.level + 1,
					.btree	= b->c.btree_id,
					.flags	= BCH_VALIDATE_write,
				     });
	if (ret) {
		bch2_fs_inconsistent(c, "invalid btree node key before write");
		return ret;
	}

	ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?:
		validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error);
	if (ret) {
		bch2_inconsistent_error(c);
		dump_stack();
	}

	return ret;
}

static void btree_write_submit(struct work_struct *work)
{
	struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work);
	BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;

	bkey_copy(&tmp.k, &wbio->key);

	bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&tmp.k)), ptr)
		ptr->offset += wbio->sector_offset;

	bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree,
				  &tmp.k, false);
}
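/*
 * Write out the dirty portion of a btree node: the unwritten bsets are sorted
 * into a bounce buffer, checksummed (and encrypted if enabled), then submitted
 * asynchronously via btree_write_submit(); completion is handled by
 * btree_node_write_endio()/btree_node_write_work().
 */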
void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
{
	struct btree_write_bio *wbio;
	struct bset *i;
	struct btree_node *bn = NULL;
	struct btree_node_entry *bne = NULL;
	struct sort_iter_stack sort_iter;
	struct nonce nonce;
	unsigned bytes_to_write, sectors_to_write, bytes, u64s;
	u64 seq = 0;
	bool used_mempool;
	unsigned long old, new;
	bool validate_before_checksum = false;
	enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK;
	void *data;
	u64 start_time = local_clock();
	int ret;

	if (flags & BTREE_WRITE_ALREADY_STARTED)
		goto do_write;

	/*
	 * We may only have a read lock on the btree node - the dirty bit is our
	 * "lock" against racing with other threads that may be trying to start
	 * a write, we do a write iff we clear the dirty bit. Since setting the
	 * dirty bit requires a write lock, we can't race with other threads
	 * redirtying it:
	 */
	old = READ_ONCE(b->flags);
	do {
		new = old;

		if (!(old & (1 << BTREE_NODE_dirty)))
			return;

		if ((flags & BTREE_WRITE_ONLY_IF_NEED) &&
		    !(old & (1 << BTREE_NODE_need_write)))
			return;

		if (old &
		    ((1 << BTREE_NODE_never_write)|
		     (1 << BTREE_NODE_write_blocked)))
			return;

		if (b->written &&
		    (old & (1 << BTREE_NODE_will_make_reachable)))
			return;

		if (old & (1 << BTREE_NODE_write_in_flight))
			return;

		if (flags & BTREE_WRITE_ONLY_IF_NEED)
			type = new & BTREE_WRITE_TYPE_MASK;
		new &= ~BTREE_WRITE_TYPE_MASK;

		new &= ~(1 << BTREE_NODE_dirty);
		new &= ~(1 << BTREE_NODE_need_write);
		new |= (1 << BTREE_NODE_write_in_flight);
		new |= (1 << BTREE_NODE_write_in_flight_inner);
		new |= (1 << BTREE_NODE_just_written);
		new ^= (1 << BTREE_NODE_write_idx);
	} while (!try_cmpxchg_acquire(&b->flags, &old, new));

	if (new & (1U << BTREE_NODE_need_write))
		return;
do_write:
	BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0));

	atomic_long_dec(&c->btree_cache.nr_dirty);

	BUG_ON(btree_node_fake(b));
	BUG_ON((b->will_make_reachable != 0) != !b->written);

	BUG_ON(b->written >= btree_sectors(c));
	BUG_ON(b->written & (block_sectors(c) - 1));
	BUG_ON(bset_written(b, btree_bset_last(b)));
	BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
	BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));

	bch2_sort_whiteouts(c, b);

	sort_iter_stack_init(&sort_iter, b);

	bytes = !b->written
		? sizeof(struct btree_node)
		: sizeof(struct btree_node_entry);

	bytes += b->whiteout_u64s * sizeof(u64);

	for_each_bset(b, t) {
		i = bset(b, t);

		if (bset_written(b, i))
			continue;

		bytes += le16_to_cpu(i->u64s) * sizeof(u64);
		sort_iter_add(&sort_iter.iter,
			      btree_bkey_first(b, t),
			      btree_bkey_last(b, t));
		seq = max(seq, le64_to_cpu(i->journal_seq));
	}

	BUG_ON(b->written && !seq);

	/* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */
	bytes += 8;

	/* buffer must be a multiple of the block size */
	bytes = round_up(bytes, block_bytes(c));

	data = btree_bounce_alloc(c, bytes, &used_mempool);

	if (!b->written) {
		bn = data;
		*bn = *b->data;
		i = &bn->keys;
	} else {
		bne = data;
		bne->keys = b->data->keys;
		i = &bne->keys;
	}

	i->journal_seq = cpu_to_le64(seq);
	i->u64s = 0;

	sort_iter_add(&sort_iter.iter,
		      unwritten_whiteouts_start(b),
		      unwritten_whiteouts_end(b));
	SET_BSET_SEPARATE_WHITEOUTS(i, false);

	u64s = bch2_sort_keys_keep_unwritten_whiteouts(i->start, &sort_iter.iter);
	le16_add_cpu(&i->u64s, u64s);

	b->whiteout_u64s = 0;

	BUG_ON(!b->written && i->u64s != b->data->keys.u64s);

	set_needs_whiteout(i, false);

	/* do we have data to write? */
	if (b->written && !i->u64s)
		goto nowrite;

	bytes_to_write = vstruct_end(i) - data;
	sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;

	if (!b->written &&
	    b->key.k.type == KEY_TYPE_btree_ptr_v2)
		BUG_ON(btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)) != sectors_to_write);

	memset(data + bytes_to_write, 0,
	       (sectors_to_write << 9) - bytes_to_write);

	BUG_ON(b->written + sectors_to_write > btree_sectors(c));
	BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
	BUG_ON(i->seq != b->data->keys.seq);

	i->version = cpu_to_le16(c->sb.version);
	SET_BSET_OFFSET(i, b->written);
	SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c));

	if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)))
		validate_before_checksum = true;

	/* validate_bset will be modifying: */
	if (le16_to_cpu(i->version) < bcachefs_metadata_version_current)
		validate_before_checksum = true;

	/* if we're going to be encrypting, check metadata validity first: */
	if (validate_before_checksum &&
	    validate_bset_for_write(c, b, i, sectors_to_write))
		goto err;

	ret = bset_encrypt(c, i, b->written << 9);
	if (bch2_fs_fatal_err_on(ret, c,
			"encrypting btree node: %s", bch2_err_str(ret)))
		goto err;

	nonce = btree_nonce(i, b->written << 9);

	if (bn)
		bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn);
	else
		bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);

	/* if we're not encrypting, check metadata after checksumming: */
	if (!validate_before_checksum &&
	    validate_bset_for_write(c, b, i, sectors_to_write))
		goto err;

	/*
	 * We handle btree write errors by immediately halting the journal -
	 * after we've done that, we can't issue any subsequent btree writes
	 * because they might have pointers to new nodes that failed to write.
	 *
	 * Furthermore, there's no point in doing any more btree writes because
	 * with the journal stopped, we're never going to update the journal to
	 * reflect that those writes were done and the data flushed from the
	 * journal:
	 *
	 * Also on journal error, the pending write may have updates that were
	 * never journalled (interior nodes, see btree_update_nodes_written()) -
	 * it's critical that we don't do the write in that case otherwise we
	 * will have updates visible that weren't in the journal:
	 *
	 * Make sure to update b->written so bch2_btree_init_next() doesn't
	 * break:
	 */
	if (bch2_journal_error(&c->journal) ||
	    c->opts.nochanges)
		goto err;

	trace_and_count(c, btree_node_write, b, bytes_to_write, sectors_to_write);

	wbio = container_of(bio_alloc_bioset(NULL,
				buf_pages(data, sectors_to_write << 9),
				REQ_OP_WRITE|REQ_META,
				GFP_NOFS,
				&c->btree_bio),
			    struct btree_write_bio, wbio.bio);
	wbio_init(&wbio->wbio.bio);
	wbio->data = data;
	wbio->data_bytes = bytes;
	wbio->sector_offset = b->written;
	wbio->start_time = start_time;
	wbio->wbio.c = c;
	wbio->wbio.used_mempool = used_mempool;
	wbio->wbio.first_btree_write = !b->written;
	wbio->wbio.bio.bi_end_io = btree_node_write_endio;
	wbio->wbio.bio.bi_private = b;

	bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);

	bkey_copy(&wbio->key, &b->key);

	b->written += sectors_to_write;

	if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2)
		bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written =
			cpu_to_le16(b->written);

	atomic64_inc(&c->btree_write_stats[type].nr);
	atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes);

	INIT_WORK(&wbio->work, btree_write_submit);
	queue_work(c->btree_write_submit_wq, &wbio->work);
	return;
err:
	set_btree_node_noevict(b);
	b->written += sectors_to_write;
nowrite:
	btree_bounce_free(c, bytes, used_mempool, data);
	__btree_node_write_done(c, b, 0);
}

/*
 * Work that must be done with write lock held:
 */
bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
{
	bool invalidated_iter = false;
	struct btree_node_entry *bne;

	if (!btree_node_just_written(b))
		return false;

	BUG_ON(b->whiteout_u64s);

	clear_btree_node_just_written(b);

	/*
	 * Note: immediately after write, bset_written() doesn't work - the
	 * amount of data we had to write after compaction might have been
	 * smaller than the offset of the last bset.
	 *
	 * However, we know that all bsets have been written here, as long as
	 * we're still holding the write lock:
	 */

	/*
	 * XXX: decide if we really want to unconditionally sort down to a
	 * single bset:
	 */
	if (b->nsets > 1) {
		btree_node_sort(c, b, 0, b->nsets);
		invalidated_iter = true;
	} else {
		invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL);
	}

	for_each_bset(b, t)
		set_needs_whiteout(bset(b, t), true);

	bch2_btree_verify(c, b);

	/*
	 * If later we don't unconditionally sort down to a single bset, we have
	 * to ensure this is still true:
	 */
	BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b));

	bne = want_new_bset(c, b);
	if (bne)
		bch2_bset_init_next(b, bne);

	bch2_btree_build_aux_trees(b);

	return invalidated_iter;
}

/*
 * Use this one if the node is intent locked:
 */
void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
			   enum six_lock_type lock_type_held,
			   unsigned flags)
{
	if (lock_type_held == SIX_LOCK_intent ||
	    (lock_type_held == SIX_LOCK_read &&
	     six_lock_tryupgrade(&b->c.lock))) {
		__bch2_btree_node_write(c, b, flags);

		/* don't cycle lock unnecessarily: */
		if (btree_node_just_written(b) &&
		    six_trylock_write(&b->c.lock)) {
			bch2_btree_post_write_cleanup(c, b);
			six_unlock_write(&b->c.lock);
		}

		if (lock_type_held == SIX_LOCK_read)
			six_lock_downgrade(&b->c.lock);
	} else {
		__bch2_btree_node_write(c, b, flags);
		if (lock_type_held == SIX_LOCK_write &&
		    btree_node_just_written(b))
			bch2_btree_post_write_cleanup(c, b);
	}
}

void bch2_btree_node_write_trans(struct btree_trans *trans, struct btree *b,
				 enum six_lock_type lock_type_held,
				 unsigned flags)
{
	struct bch_fs *c = trans->c;

	if (lock_type_held == SIX_LOCK_intent ||
	    (lock_type_held == SIX_LOCK_read &&
	     six_lock_tryupgrade(&b->c.lock))) {
		__bch2_btree_node_write(c, b, flags);

		/* don't cycle lock unnecessarily: */
		if (btree_node_just_written(b) &&
		    six_trylock_write(&b->c.lock)) {
			bch2_btree_post_write_cleanup(c, b);
			__bch2_btree_node_unlock_write(trans, b);
		}

		if (lock_type_held == SIX_LOCK_read)
			six_lock_downgrade(&b->c.lock);
	} else {
		__bch2_btree_node_write(c, b, flags);
		if (lock_type_held == SIX_LOCK_write &&
		    btree_node_just_written(b))
			bch2_btree_post_write_cleanup(c, b);
	}
}

static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
{
	struct bucket_table *tbl;
	struct rhash_head *pos;
	struct btree *b;
	unsigned i;
	bool ret = false;
restart:
	rcu_read_lock();
	for_each_cached_btree(b, c, tbl, i, pos)
		if (test_bit(flag, &b->flags)) {
			rcu_read_unlock();
			wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE);
			ret = true;
			goto restart;
		}
	rcu_read_unlock();

	return ret;
}

bool bch2_btree_flush_all_reads(struct bch_fs *c)
{
	return __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight);
}

bool bch2_btree_flush_all_writes(struct bch_fs *c)
{
	return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
}
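/*
 * Names for each btree write type, indexed by type, for the stats printed by
 * bch2_btree_write_stats_to_text():
 */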
static const char * const bch2_btree_write_types[] = {
#define x(t, n) [n] = #t,
	BCH_BTREE_WRITE_TYPES()
	NULL
};

void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c)
{
	printbuf_tabstop_push(out, 20);
	printbuf_tabstop_push(out, 10);

	prt_printf(out, "\tnr\tsize\n");

	for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) {
		u64 nr = atomic64_read(&c->btree_write_stats[i].nr);
		u64 bytes = atomic64_read(&c->btree_write_stats[i].bytes);

		prt_printf(out, "%s:\t%llu\t", bch2_btree_write_types[i], nr);
		prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0);
		prt_newline(out);
	}
}