// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bbpos.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
#include "trace.h"

#include <linux/prefetch.h>
#include <linux/sched/mm.h>

const char * const bch2_btree_node_flags[] = {
#define x(f)	#f,
	BTREE_FLAGS()
#undef x
	NULL
};

void bch2_recalc_btree_reserve(struct bch_fs *c)
{
	unsigned i, reserve = 16;

	if (!c->btree_roots_known[0].b)
		reserve += 8;

	for (i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->b)
			reserve += min_t(unsigned, 1, r->b->c.level) * 8;
	}

	c->btree_cache.reserve = reserve;
}

static inline unsigned btree_cache_can_free(struct btree_cache *bc)
{
	return max_t(int, 0, bc->used - bc->reserve);
}

static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
{
	if (b->c.lock.readers)
		list_move(&b->list, &bc->freed_pcpu);
	else
		list_move(&b->list, &bc->freed_nonpcpu);
}

static void btree_node_data_free(struct bch_fs *c, struct btree *b)
{
	struct btree_cache *bc = &c->btree_cache;

	EBUG_ON(btree_node_write_in_flight(b));

	clear_btree_node_just_written(b);

	kvfree(b->data);
	b->data = NULL;
#ifdef __KERNEL__
	kvfree(b->aux_data);
#else
	munmap(b->aux_data, btree_aux_data_bytes(b));
#endif
	b->aux_data = NULL;

	bc->used--;

	btree_node_to_freedlist(bc, b);
}

static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
				   const void *obj)
{
	const struct btree *b = obj;
	const u64 *v = arg->key;

	return b->hash_val == *v ? 0 : 1;
}
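
/*
 * Btree nodes are kept in an rhashtable, keyed on the 64-bit hash of the
 * node's key (b->hash_val); clearing hash_val makes a node invisible to
 * lookups (see bch2_btree_node_hash_remove()).
 */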
static const struct rhashtable_params bch_btree_cache_params = {
	.head_offset		= offsetof(struct btree, hash),
	.key_offset		= offsetof(struct btree, hash_val),
	.key_len		= sizeof(u64),
	.obj_cmpfn		= bch2_btree_cache_cmp_fn,
};

static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
	BUG_ON(b->data || b->aux_data);

	b->data = kvmalloc(btree_buf_bytes(b), gfp);
	if (!b->data)
		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
#ifdef __KERNEL__
	b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
#else
	b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
			   PROT_READ|PROT_WRITE|PROT_EXEC,
			   MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
	if (b->aux_data == MAP_FAILED)
		b->aux_data = NULL;
#endif
	if (!b->aux_data) {
		kvfree(b->data);
		b->data = NULL;
		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
	}

	return 0;
}

static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
{
	struct btree *b;

	b = kzalloc(sizeof(struct btree), gfp);
	if (!b)
		return NULL;

	bkey_btree_ptr_init(&b->key);
	INIT_LIST_HEAD(&b->list);
	INIT_LIST_HEAD(&b->write_blocked);
	b->byte_order = ilog2(c->opts.btree_node_size);
	return b;
}

struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	b = __btree_node_mem_alloc(c, GFP_KERNEL);
	if (!b)
		return NULL;

	if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
		kfree(b);
		return NULL;
	}

	bch2_btree_lock_init(&b->c, 0);

	bc->used++;
	list_add(&b->list, &bc->freeable);
	return b;
}

/* Btree in memory cache - hash table */

void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
	int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);

	BUG_ON(ret);

	/* Cause future lookups for this node to fail: */
	b->hash_val = 0;
}

int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(b->hash_val);
	b->hash_val = btree_ptr_hash_val(&b->key);

	return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
					     bch_btree_cache_params);
}

int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
				unsigned level, enum btree_id id)
{
	int ret;

	b->c.level	= level;
	b->c.btree_id	= id;

	mutex_lock(&bc->lock);
	ret = __bch2_btree_node_hash_insert(bc, b);
	if (!ret)
		list_add_tail(&b->list, &bc->live);
	mutex_unlock(&bc->lock);

	return ret;
}
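
/*
 * Look up a btree node in the in-memory cache by the hash of its key; this is
 * only a hash table lookup and takes no locks on the node itself.
 */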
__flatten
static inline struct btree *btree_cache_find(struct btree_cache *bc,
					     const struct bkey_i *k)
{
	u64 v = btree_ptr_hash_val(k);

	return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
}

/*
 * this version is for btree nodes that have already been freed (we're not
 * reaping a real btree node)
 */
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
{
	struct btree_cache *bc = &c->btree_cache;
	int ret = 0;

	lockdep_assert_held(&bc->lock);

	struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);

	u64 mask = b->c.level
		? bc->pinned_nodes_interior_mask
		: bc->pinned_nodes_leaf_mask;

	if ((mask & BIT_ULL(b->c.btree_id)) &&
	    bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
	    bbpos_cmp(bc->pinned_nodes_end, pos) >= 0)
		return -BCH_ERR_ENOMEM_btree_node_reclaim;

wait_on_io:
	if (b->flags & ((1U << BTREE_NODE_dirty)|
			(1U << BTREE_NODE_read_in_flight)|
			(1U << BTREE_NODE_write_in_flight))) {
		if (!flush)
			return -BCH_ERR_ENOMEM_btree_node_reclaim;

		/* XXX: waiting on IO with btree cache lock held */
		bch2_btree_node_wait_on_read(b);
		bch2_btree_node_wait_on_write(b);
	}

	if (!six_trylock_intent(&b->c.lock))
		return -BCH_ERR_ENOMEM_btree_node_reclaim;

	if (!six_trylock_write(&b->c.lock))
		goto out_unlock_intent;

	/* recheck under lock */
	if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
			(1U << BTREE_NODE_write_in_flight))) {
		if (!flush)
			goto out_unlock;
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}

	if (btree_node_noevict(b) ||
	    btree_node_write_blocked(b) ||
	    btree_node_will_make_reachable(b))
		goto out_unlock;

	if (btree_node_dirty(b)) {
		if (!flush)
			goto out_unlock;
		/*
		 * Using the underscore version because we don't want to compact
		 * bsets after the write, since this node is about to be evicted
		 * - unless btree verify mode is enabled, since it runs out of
		 * the post write cleanup:
		 */
		if (bch2_verify_btree_ondisk)
			bch2_btree_node_write(c, b, SIX_LOCK_intent,
					      BTREE_WRITE_cache_reclaim);
		else
			__bch2_btree_node_write(c, b,
						BTREE_WRITE_cache_reclaim);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}
out:
	if (b->hash_val && !ret)
		trace_and_count(c, btree_cache_reap, c, b);
	return ret;
out_unlock:
	six_unlock_write(&b->c.lock);
out_unlock_intent:
	six_unlock_intent(&b->c.lock);
	ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
	goto out;
}

static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, false);
}

static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, true);
}
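
/*
 * Shrinker scan callback: walk the freeable list and then the live list,
 * freeing the buffers of nodes that can be reclaimed without blocking. When a
 * large fraction of the cache is dirty, also kick off writes of dirty nodes so
 * that a later scan can free them.
 */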
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
					   struct shrink_control *sc)
{
	struct bch_fs *c = shrink->private_data;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b, *t;
	unsigned long nr = sc->nr_to_scan;
	unsigned long can_free = 0;
	unsigned long freed = 0;
	unsigned long touched = 0;
	unsigned i, flags;
	unsigned long ret = SHRINK_STOP;
	bool trigger_writes = atomic_read(&bc->dirty) + nr >=
		bc->used * 3 / 4;

	if (bch2_btree_shrinker_disabled)
		return SHRINK_STOP;

	mutex_lock(&bc->lock);
	flags = memalloc_nofs_save();

	/*
	 * It's _really_ critical that we don't free too many btree nodes - we
	 * have to always leave ourselves a reserve. The reserve is how we
	 * guarantee that allocating memory for a new btree node can always
	 * succeed, so that inserting keys into the btree can always succeed and
	 * IO can always make forward progress:
	 */
	can_free = btree_cache_can_free(bc);
	nr = min_t(unsigned long, nr, can_free);

	i = 0;
	list_for_each_entry_safe(b, t, &bc->freeable, list) {
		/*
		 * Leave a few nodes on the freeable list, so that a btree split
		 * won't have to hit the system allocator:
		 */
		if (++i <= 3)
			continue;

		touched++;

		if (touched >= nr)
			goto out;

		if (!btree_node_reclaim(c, b)) {
			btree_node_data_free(c, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);
			freed++;
		}
	}
restart:
	list_for_each_entry_safe(b, t, &bc->live, list) {
		touched++;

		if (btree_node_accessed(b)) {
			clear_btree_node_accessed(b);
		} else if (!btree_node_reclaim(c, b)) {
			freed++;
			btree_node_data_free(c, b);

			bch2_btree_node_hash_remove(bc, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);

			if (freed == nr)
				goto out_rotate;
		} else if (trigger_writes &&
			   btree_node_dirty(b) &&
			   !btree_node_will_make_reachable(b) &&
			   !btree_node_write_blocked(b) &&
			   six_trylock_read(&b->c.lock)) {
			list_move(&bc->live, &b->list);
			mutex_unlock(&bc->lock);
			__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
			six_unlock_read(&b->c.lock);
			if (touched >= nr)
				goto out_nounlock;
			mutex_lock(&bc->lock);
			goto restart;
		}

		if (touched >= nr)
			break;
	}
out_rotate:
	if (&t->list != &bc->live)
		list_move_tail(&bc->live, &t->list);
out:
	mutex_unlock(&bc->lock);
out_nounlock:
	ret = freed;
	memalloc_nofs_restore(flags);
	trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
	return ret;
}

static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
					    struct shrink_control *sc)
{
	struct bch_fs *c = shrink->private_data;
	struct btree_cache *bc = &c->btree_cache;

	if (bch2_btree_shrinker_disabled)
		return 0;

	return btree_cache_can_free(bc);
}
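
/*
 * Filesystem shutdown: move every remaining node (btree roots, verify_data and
 * the freeable list) onto the live list, free their buffers, then free the
 * struct btree allocations themselves from the freed lists.
 */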
void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	unsigned i, flags;

	shrinker_free(bc->shrink);

	/* vfree() can allocate memory: */
	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

	if (c->verify_data)
		list_move(&c->verify_data->list, &bc->live);

	kvfree(c->verify_ondisk);

	for (i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->b)
			list_add(&r->b->list, &bc->live);
	}

	list_splice(&bc->freeable, &bc->live);

	while (!list_empty(&bc->live)) {
		b = list_first_entry(&bc->live, struct btree, list);

		BUG_ON(btree_node_read_in_flight(b) ||
		       btree_node_write_in_flight(b));

		btree_node_data_free(c, b);
	}

	BUG_ON(!bch2_journal_error(&c->journal) &&
	       atomic_read(&c->btree_cache.dirty));

	list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);

	while (!list_empty(&bc->freed_nonpcpu)) {
		b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
		list_del(&b->list);
		six_lock_exit(&b->c.lock);
		kfree(b);
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);

	if (bc->table_init_done)
		rhashtable_destroy(&bc->table);
}

int bch2_fs_btree_cache_init(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct shrinker *shrink;
	unsigned i;
	int ret = 0;

	ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
	if (ret)
		goto err;

	bc->table_init_done = true;

	bch2_recalc_btree_reserve(c);

	for (i = 0; i < bc->reserve; i++)
		if (!__bch2_btree_node_mem_alloc(c))
			goto err;

	list_splice_init(&bc->live, &bc->freeable);

	mutex_init(&c->verify_lock);

	shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
	if (!shrink)
		goto err;
	bc->shrink = shrink;
	shrink->count_objects	= bch2_btree_cache_count;
	shrink->scan_objects	= bch2_btree_cache_scan;
	shrink->seeks		= 4;
	shrink->private_data	= c;
	shrinker_register(shrink);

	return 0;
err:
	return -BCH_ERR_ENOMEM_fs_btree_cache_init;
}

void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
	mutex_init(&bc->lock);
	INIT_LIST_HEAD(&bc->live);
	INIT_LIST_HEAD(&bc->freeable);
	INIT_LIST_HEAD(&bc->freed_pcpu);
	INIT_LIST_HEAD(&bc->freed_nonpcpu);
}

/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which a
 * cannibalize_bucket() will take. This means every time we unlock the root of
 * the btree, we need to release this lock if we have it held.
 */
void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;

	if (bc->alloc_lock == current) {
		trace_and_count(c, btree_cache_cannibalize_unlock, trans);
		bc->alloc_lock = NULL;
		closure_wake_up(&bc->alloc_wait);
	}
}

int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct task_struct *old;

	old = cmpxchg(&bc->alloc_lock, NULL, current);
	if (old == NULL || old == current)
		goto success;

	if (!cl) {
		trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
		return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
	}

	closure_wait(&bc->alloc_wait, cl);

	/* Try again, after adding ourselves to waitlist */
	old = cmpxchg(&bc->alloc_lock, NULL, current);
	if (old == NULL || old == current) {
		/* We raced */
		closure_wake_up(&bc->alloc_wait);
		goto success;
	}

	trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
	return -BCH_ERR_btree_cache_cannibalize_lock_blocked;

success:
	trace_and_count(c, btree_cache_cannibalize_lock, trans);
	return 0;
}

static struct btree *btree_node_cannibalize(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	list_for_each_entry_reverse(b, &bc->live, list)
		if (!btree_node_reclaim(c, b))
			return b;

	while (1) {
		list_for_each_entry_reverse(b, &bc->live, list)
			if (!btree_node_write_and_reclaim(c, b))
				return b;

		/*
		 * Rare case: all nodes were intent-locked.
		 * Just busy-wait.
		 */
		WARN_ONCE(1, "btree cache cannibalize failed\n");
		cond_resched();
	}
}
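
/*
 * Allocate a btree node, in order of preference: reuse a struct btree from the
 * freed lists, steal buffers from a node on the freeable list, allocate fresh
 * memory, and as a last resort (if this thread holds the cannibalize lock)
 * evict and cannibalize another cached node.
 */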
struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct list_head *freed = pcpu_read_locks
		? &bc->freed_pcpu
		: &bc->freed_nonpcpu;
	struct btree *b, *b2;
	u64 start_time = local_clock();
	unsigned flags;

	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

	/*
	 * We never free struct btree itself, just the memory that holds the on
	 * disk node. Check the freed list before allocating a new one:
	 */
	list_for_each_entry(b, freed, list)
		if (!btree_node_reclaim(c, b)) {
			list_del_init(&b->list);
			goto got_node;
		}

	b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
	if (!b) {
		mutex_unlock(&bc->lock);
		bch2_trans_unlock(trans);
		b = __btree_node_mem_alloc(c, GFP_KERNEL);
		if (!b)
			goto err;
		mutex_lock(&bc->lock);
	}

	bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);

	BUG_ON(!six_trylock_intent(&b->c.lock));
	BUG_ON(!six_trylock_write(&b->c.lock));
got_node:

	/*
	 * btree_free() doesn't free memory; it sticks the node on the end of
	 * the list. Check if there's any freed nodes there:
	 */
	list_for_each_entry(b2, &bc->freeable, list)
		if (!btree_node_reclaim(c, b2)) {
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);
			btree_node_to_freedlist(bc, b2);
			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
			goto got_mem;
		}

	mutex_unlock(&bc->lock);

	if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
		bch2_trans_unlock(trans);
		if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
			goto err;
	}

	mutex_lock(&bc->lock);
	bc->used++;
got_mem:
	mutex_unlock(&bc->lock);

	BUG_ON(btree_node_hashed(b));
	BUG_ON(btree_node_dirty(b));
	BUG_ON(btree_node_write_in_flight(b));
out:
	b->flags		= 0;
	b->written		= 0;
	b->nsets		= 0;
	b->sib_u64s[0]		= 0;
	b->sib_u64s[1]		= 0;
	b->whiteout_u64s	= 0;
	bch2_btree_keys_init(b);
	set_btree_node_accessed(b);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
			       start_time);

	memalloc_nofs_restore(flags);
	return b;
err:
	mutex_lock(&bc->lock);

	/* Try to cannibalize another cached btree node: */
	if (bc->alloc_lock == current) {
		b2 = btree_node_cannibalize(c);
		clear_btree_node_just_written(b2);
		bch2_btree_node_hash_remove(bc, b2);

		if (b) {
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);
			btree_node_to_freedlist(bc, b2);
			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
		} else {
			b = b2;
			list_del_init(&b->list);
		}

		mutex_unlock(&bc->lock);

		trace_and_count(c, btree_cache_cannibalize, trans);
		goto out;
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);
	return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
}
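
/*
 * Read a btree node into the cache: allocate a node, insert it into the hash
 * table keyed on @k, then issue the read (synchronously if @sync). Returns
 * NULL if we raced with another thread filling in the same node.
 */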
/* Slowpath, don't want it inlined into btree_iter_traverse() */
static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
						   struct btree_path *path,
						   const struct bkey_i *k,
						   enum btree_id btree_id,
						   unsigned level,
						   enum six_lock_type lock_type,
						   bool sync)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	if (unlikely(level >= BTREE_MAX_DEPTH)) {
		int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u",
						 level, BTREE_MAX_DEPTH);
		return ERR_PTR(ret);
	}

	if (unlikely(!bkey_is_btree_ptr(&k->k))) {
		struct printbuf buf = PRINTBUF;
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));

		int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf);
		printbuf_exit(&buf);
		return ERR_PTR(ret);
	}

	if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) {
		struct printbuf buf = PRINTBUF;
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));

		int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf);
		printbuf_exit(&buf);
		return ERR_PTR(ret);
	}

	/*
	 * Parent node must be locked, else we could read in a btree node that's
	 * been freed:
	 */
	if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
		trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
	}

	b = bch2_btree_node_mem_alloc(trans, level != 0);

	if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
		if (!path)
			return b;

		trans->memory_allocation_failure = true;
		trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
	}

	if (IS_ERR(b))
		return b;

	bkey_copy(&b->key, k);
	if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
		/* raced with another fill: */

		/* mark as unhashed... */
		b->hash_val = 0;

		mutex_lock(&bc->lock);
		list_add(&b->list, &bc->freeable);
		mutex_unlock(&bc->lock);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		return NULL;
	}
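
	/*
	 * Mark the read as in flight before dropping the write lock, so that
	 * concurrent lookups see the flag and wait for the read to complete:
	 */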
	set_btree_node_read_in_flight(b);
	six_unlock_write(&b->c.lock);

	if (path) {
		u32 seq = six_lock_seq(&b->c.lock);

		/* Unlock before doing IO: */
		six_unlock_intent(&b->c.lock);
		bch2_trans_unlock_noassert(trans);

		bch2_btree_node_read(trans, b, sync);

		if (!sync)
			return NULL;

		if (!six_relock_type(&b->c.lock, lock_type, seq))
			b = NULL;
	} else {
		bch2_btree_node_read(trans, b, sync);
		if (lock_type == SIX_LOCK_read)
			six_lock_downgrade(&b->c.lock);
	}

	return b;
}

static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
{
	struct printbuf buf = PRINTBUF;

	if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
		return;

	prt_printf(&buf,
		   "btree node header doesn't match ptr\n"
		   "btree %s level %u\n"
		   "ptr: ",
		   bch2_btree_id_str(b->c.btree_id), b->c.level);
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	prt_printf(&buf, "\nheader: btree %s level %llu\n"
		   "min ",
		   bch2_btree_id_str(BTREE_NODE_ID(b->data)),
		   BTREE_NODE_LEVEL(b->data));
	bch2_bpos_to_text(&buf, b->data->min_key);

	prt_printf(&buf, "\nmax ");
	bch2_bpos_to_text(&buf, b->data->max_key);

	bch2_fs_topology_error(c, "%s", buf.buf);

	printbuf_exit(&buf);
}

static inline void btree_check_header(struct bch_fs *c, struct btree *b)
{
	if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
	    b->c.level != BTREE_NODE_LEVEL(b->data) ||
	    !bpos_eq(b->data->max_key, b->key.k.p) ||
	    (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
	     !bpos_eq(b->data->min_key,
		      bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
		btree_bad_header(c, b);
}
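
/*
 * Slow path of bch2_btree_node_get(): look the node up in the hash table
 * (filling it in from disk if not present), lock it, and handle races with
 * eviction and with reads still in flight.
 */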
static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
					   const struct bkey_i *k, unsigned level,
					   enum six_lock_type lock_type,
					   unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	struct bset_tree *t;
	bool need_relock = false;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		/*
		 * We must have the parent locked to call bch2_btree_node_fill(),
		 * else we could read in a btree node from disk that's been
		 * freed:
		 */
		b = bch2_btree_node_fill(trans, path, k, path->btree_id,
					 level, lock_type, true);
		need_relock = true;

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b))
			return b;
	} else {
		if (btree_node_read_locked(path, level + 1))
			btree_node_unlock(trans, path, level + 1);

		ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			return ERR_PTR(ret);

		BUG_ON(ret);

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.level != level ||
			     race_fault())) {
			six_unlock_type(&b->c.lock, lock_type);
			if (bch2_btree_node_relock(trans, path, level + 1))
				goto retry;

			trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
			return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
		}

		/* avoid atomic set bit if it's not needed: */
		if (!btree_node_accessed(b))
			set_btree_node_accessed(b);
	}

	if (unlikely(btree_node_read_in_flight(b))) {
		u32 seq = six_lock_seq(&b->c.lock);

		six_unlock_type(&b->c.lock, lock_type);
		bch2_trans_unlock(trans);
		need_relock = true;

		bch2_btree_node_wait_on_read(b);

		/*
		 * should_be_locked is not set on this path yet, so we need to
		 * relock it specifically:
		 */
		if (!six_relock_type(&b->c.lock, lock_type, seq))
			goto retry;
	}

	if (unlikely(need_relock)) {
		ret = bch2_trans_relock(trans) ?:
			bch2_btree_path_relock_intent(trans, path);
		if (ret) {
			six_unlock_type(&b->c.lock, lock_type);
			return ERR_PTR(ret);
		}
	}

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-BCH_ERR_btree_node_read_error);
	}

	EBUG_ON(b->c.btree_id != path->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}

/**
 * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * @trans:	btree transaction object
 * @path:	btree_path being traversed
 * @k:		pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
 * @level:	level of btree node being looked up (0 == leaf node)
 * @lock_type:	SIX_LOCK_read or SIX_LOCK_intent
 * @trace_ip:	ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
 *
 * The btree node will have either a read or an intent lock held, depending on
 * @lock_type.
 *
 * Returns: btree node or ERR_PTR()
 */
struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
				  const struct bkey_i *k, unsigned level,
				  enum six_lock_type lock_type,
				  unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree *b;
	struct bset_tree *t;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_node_mem_ptr(k);

	/*
	 * Check b->hash_val _before_ calling btree_node_lock() - this might not
	 * be the node we want anymore, and trying to lock the wrong node could
	 * cause an unnecessary transaction restart:
	 */
	if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
		     !b ||
		     b->hash_val != btree_ptr_hash_val(k)))
		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

	if (btree_node_read_locked(path, level + 1))
		btree_node_unlock(trans, path, level + 1);

	ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		return ERR_PTR(ret);

	BUG_ON(ret);

	if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
		     b->c.level != level ||
		     race_fault())) {
		six_unlock_type(&b->c.lock, lock_type);
		if (bch2_btree_node_relock(trans, path, level + 1))
			return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

		trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
	}

	if (unlikely(btree_node_read_in_flight(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
	}

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-BCH_ERR_btree_node_read_error);
	}

	EBUG_ON(b->c.btree_id != path->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}
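
/*
 * Variant of bch2_btree_node_get() for when the caller has no btree_path:
 * takes a read lock via the nopath locking helpers, and with @nofill skips
 * reading the node in if it isn't already cached.
 */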
struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
					 const struct bkey_i *k,
					 enum btree_id btree_id,
					 unsigned level,
					 bool nofill)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	struct bset_tree *t;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	if (c->opts.btree_node_mem_ptr_optimization) {
		b = btree_node_mem_ptr(k);
		if (b)
			goto lock_node;
	}
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		if (nofill)
			goto out;

		b = bch2_btree_node_fill(trans, NULL, k, btree_id,
					 level, SIX_LOCK_read, true);

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b) &&
		    !bch2_btree_cache_cannibalize_lock(trans, NULL))
			goto retry;

		if (IS_ERR(b))
			goto out;
	} else {
lock_node:
		ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			return ERR_PTR(ret);

		BUG_ON(ret);

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.btree_id != btree_id ||
			     b->c.level != level)) {
			six_unlock_read(&b->c.lock);
			goto retry;
		}
	}

	/* XXX: waiting on IO with btree locks held: */
	__bch2_btree_node_wait_on_read(b);

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_read(&b->c.lock);
		b = ERR_PTR(-BCH_ERR_btree_node_read_error);
		goto out;
	}

	EBUG_ON(b->c.btree_id != btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);
out:
	bch2_btree_cache_cannibalize_unlock(trans);
	return b;
}

int bch2_btree_node_prefetch(struct btree_trans *trans,
			     struct btree_path *path,
			     const struct bkey_i *k,
			     enum btree_id btree_id, unsigned level)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;

	BUG_ON(path && !btree_node_locked(path, level + 1));
	BUG_ON(level >= BTREE_MAX_DEPTH);

	struct btree *b = btree_cache_find(bc, k);
	if (b)
		return 0;

	b = bch2_btree_node_fill(trans, path, k, btree_id,
				 level, SIX_LOCK_read, false);
	if (!IS_ERR_OR_NULL(b))
		six_unlock_read(&b->c.lock);
	return bch2_trans_relock(trans) ?: PTR_ERR_OR_ZERO(b);
}
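
/*
 * Drop a btree node from the cache: wait for in-flight IO, write it out if
 * dirty, then remove it from the hash table and free its buffers.
 */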
void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	b = btree_cache_find(bc, k);
	if (!b)
		return;

	BUG_ON(b == btree_node_root(trans->c, b));
wait_on_io:
	/* not allowed to wait on io with btree locks held: */

	/* XXX we're called from btree_gc which will be holding other btree
	 * nodes locked
	 */
	__bch2_btree_node_wait_on_read(b);
	__bch2_btree_node_wait_on_write(b);

	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
	if (unlikely(b->hash_val != btree_ptr_hash_val(k)))
		goto out;

	if (btree_node_dirty(b)) {
		__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}

	BUG_ON(btree_node_dirty(b));

	mutex_lock(&bc->lock);
	btree_node_data_free(c, b);
	bch2_btree_node_hash_remove(bc, b);
	mutex_unlock(&bc->lock);
out:
	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
}

const char *bch2_btree_id_str(enum btree_id btree)
{
	return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)";
}

void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
	prt_printf(out, "%s level %u/%u\n  ",
		   bch2_btree_id_str(b->c.btree_id),
		   b->c.level,
		   bch2_btree_id_root(c, b->c.btree_id)->level);
	bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
}

void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
	struct bset_stats stats;

	memset(&stats, 0, sizeof(stats));

	bch2_btree_keys_stats(b, &stats);

	prt_printf(out, "l %u ", b->c.level);
	bch2_bpos_to_text(out, b->data->min_key);
	prt_printf(out, " - ");
	bch2_bpos_to_text(out, b->data->max_key);
	prt_printf(out, ":\n"
		   "    ptrs: ");
	bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
	prt_newline(out);

	prt_printf(out,
		   "    format: ");
	bch2_bkey_format_to_text(out, &b->format);

	prt_printf(out,
		   "    unpack fn len: %u\n"
		   "    bytes used %zu/%zu (%zu%% full)\n"
		   "    sib u64s: %u, %u (merge threshold %u)\n"
		   "    nr packed keys %u\n"
		   "    nr unpacked keys %u\n"
		   "    floats %zu\n"
		   "    failed unpacked %zu\n",
		   b->unpack_fn_len,
		   b->nr.live_u64s * sizeof(u64),
		   btree_buf_bytes(b) - sizeof(struct btree_node),
		   b->nr.live_u64s * 100 / btree_max_u64s(c),
		   b->sib_u64s[0],
		   b->sib_u64s[1],
		   c->btree_foreground_merge_threshold,
		   b->nr.packed_keys,
		   b->nr.unpacked_keys,
		   stats.floats,
		   stats.failed);
}

void bch2_btree_cache_to_text(struct printbuf *out, const struct bch_fs *c)
{
	prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used);
	prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty));
	prt_printf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock);
}