// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bbpos.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
#include "trace.h"

#include <linux/prefetch.h>
#include <linux/sched/mm.h>

const char * const bch2_btree_node_flags[] = {
#define x(f) #f,
	BTREE_FLAGS()
#undef x
	NULL
};

void bch2_recalc_btree_reserve(struct bch_fs *c)
{
	unsigned i, reserve = 16;

	if (!c->btree_roots_known[0].b)
		reserve += 8;

	for (i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->b)
			reserve += min_t(unsigned, 1, r->b->c.level) * 8;
	}

	c->btree_cache.reserve = reserve;
}

static inline unsigned btree_cache_can_free(struct btree_cache *bc)
{
	return max_t(int, 0, bc->used - bc->reserve);
}

static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
{
	if (b->c.lock.readers)
		list_move(&b->list, &bc->freed_pcpu);
	else
		list_move(&b->list, &bc->freed_nonpcpu);
}

static void btree_node_data_free(struct bch_fs *c, struct btree *b)
{
	struct btree_cache *bc = &c->btree_cache;

	EBUG_ON(btree_node_write_in_flight(b));

	clear_btree_node_just_written(b);

	kvfree(b->data);
	b->data = NULL;
#ifdef __KERNEL__
	kvfree(b->aux_data);
#else
	munmap(b->aux_data, btree_aux_data_bytes(b));
#endif
	b->aux_data = NULL;

	bc->used--;

	btree_node_to_freedlist(bc, b);
}

static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
				   const void *obj)
{
	const struct btree *b = obj;
	const u64 *v = arg->key;

	return b->hash_val == *v ? 0 : 1;
}

static const struct rhashtable_params bch_btree_cache_params = {
	.head_offset	= offsetof(struct btree, hash),
	.key_offset	= offsetof(struct btree, hash_val),
	.key_len	= sizeof(u64),
	.obj_cmpfn	= bch2_btree_cache_cmp_fn,
};
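/*
 * Allocate the buffers backing a cache node: b->data holds the on-disk node,
 * b->aux_data the auxiliary search trees. In-kernel builds use kvmalloc();
 * the !__KERNEL__ path (presumably the userspace tools build) maps aux_data
 * with mmap() instead.
 */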
static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
	BUG_ON(b->data || b->aux_data);

	b->data = kvmalloc(btree_buf_bytes(b), gfp);
	if (!b->data)
		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
#ifdef __KERNEL__
	b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
#else
	b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
			   PROT_READ|PROT_WRITE|PROT_EXEC,
			   MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
	if (b->aux_data == MAP_FAILED)
		b->aux_data = NULL;
#endif
	if (!b->aux_data) {
		kvfree(b->data);
		b->data = NULL;
		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
	}

	return 0;
}

static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
{
	struct btree *b;

	b = kzalloc(sizeof(struct btree), gfp);
	if (!b)
		return NULL;

	bkey_btree_ptr_init(&b->key);
	INIT_LIST_HEAD(&b->list);
	INIT_LIST_HEAD(&b->write_blocked);
	b->byte_order = ilog2(c->opts.btree_node_size);
	return b;
}

struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	b = __btree_node_mem_alloc(c, GFP_KERNEL);
	if (!b)
		return NULL;

	if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
		kfree(b);
		return NULL;
	}

	bch2_btree_lock_init(&b->c, 0);

	bc->used++;
	list_add(&b->list, &bc->freeable);
	return b;
}

/* Btree in memory cache - hash table */

void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
	int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);

	BUG_ON(ret);

	/* Cause future lookups for this node to fail: */
	b->hash_val = 0;
}

int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(b->hash_val);
	b->hash_val = btree_ptr_hash_val(&b->key);

	return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
					     bch_btree_cache_params);
}

int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
				unsigned level, enum btree_id id)
{
	int ret;

	b->c.level = level;
	b->c.btree_id = id;

	mutex_lock(&bc->lock);
	ret = __bch2_btree_node_hash_insert(bc, b);
	if (!ret)
		list_add_tail(&b->list, &bc->live);
	mutex_unlock(&bc->lock);

	return ret;
}
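/*
 * Hash table lookup, keyed by the hash of the node's pointer key; takes no
 * btree node locks - callers lock whatever node they get back.
 */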
__flatten
static inline struct btree *btree_cache_find(struct btree_cache *bc,
					     const struct bkey_i *k)
{
	u64 v = btree_ptr_hash_val(k);

	return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
}

/*
 * this version is for btree nodes that have already been freed (we're not
 * reaping a real btree node)
 */
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
{
	struct btree_cache *bc = &c->btree_cache;
	int ret = 0;

	lockdep_assert_held(&bc->lock);

	struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);

	u64 mask = b->c.level
		? bc->pinned_nodes_interior_mask
		: bc->pinned_nodes_leaf_mask;

	if ((mask & BIT_ULL(b->c.btree_id)) &&
	    bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
	    bbpos_cmp(bc->pinned_nodes_end, pos) >= 0)
		return -BCH_ERR_ENOMEM_btree_node_reclaim;

wait_on_io:
	if (b->flags & ((1U << BTREE_NODE_dirty)|
			(1U << BTREE_NODE_read_in_flight)|
			(1U << BTREE_NODE_write_in_flight))) {
		if (!flush)
			return -BCH_ERR_ENOMEM_btree_node_reclaim;

		/* XXX: waiting on IO with btree cache lock held */
		bch2_btree_node_wait_on_read(b);
		bch2_btree_node_wait_on_write(b);
	}

	if (!six_trylock_intent(&b->c.lock))
		return -BCH_ERR_ENOMEM_btree_node_reclaim;

	if (!six_trylock_write(&b->c.lock))
		goto out_unlock_intent;

	/* recheck under lock */
	if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
			(1U << BTREE_NODE_write_in_flight))) {
		if (!flush)
			goto out_unlock;
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}

	if (btree_node_noevict(b) ||
	    btree_node_write_blocked(b) ||
	    btree_node_will_make_reachable(b))
		goto out_unlock;

	if (btree_node_dirty(b)) {
		if (!flush)
			goto out_unlock;
		/*
		 * Using the underscore version because we don't want to compact
		 * bsets after the write, since this node is about to be evicted
		 * - unless btree verify mode is enabled, since it runs out of
		 * the post write cleanup:
		 */
		if (bch2_verify_btree_ondisk)
			bch2_btree_node_write(c, b, SIX_LOCK_intent,
					      BTREE_WRITE_cache_reclaim);
		else
			__bch2_btree_node_write(c, b,
						BTREE_WRITE_cache_reclaim);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}
out:
	if (b->hash_val && !ret)
		trace_and_count(c, btree_cache_reap, c, b);
	return ret;
out_unlock:
	six_unlock_write(&b->c.lock);
out_unlock_intent:
	six_unlock_intent(&b->c.lock);
	ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
	goto out;
}

static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, false);
}

static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, true);
}
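/*
 * Shrinker scan callback: walk the freeable and live lists, freeing the memory
 * of nodes that can be reclaimed without blocking. Recently accessed nodes get
 * their accessed bit cleared instead of being freed, and when most of the
 * cache is dirty we opportunistically kick off writes so that future scans can
 * make progress.
 */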
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
					   struct shrink_control *sc)
{
	struct bch_fs *c = shrink->private_data;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b, *t;
	unsigned long nr = sc->nr_to_scan;
	unsigned long can_free = 0;
	unsigned long freed = 0;
	unsigned long touched = 0;
	unsigned i, flags;
	unsigned long ret = SHRINK_STOP;
	bool trigger_writes = atomic_read(&bc->dirty) + nr >=
		bc->used * 3 / 4;

	if (bch2_btree_shrinker_disabled)
		return SHRINK_STOP;

	mutex_lock(&bc->lock);
	flags = memalloc_nofs_save();

	/*
	 * It's _really_ critical that we don't free too many btree nodes - we
	 * have to always leave ourselves a reserve. The reserve is how we
	 * guarantee that allocating memory for a new btree node can always
	 * succeed, so that inserting keys into the btree can always succeed and
	 * IO can always make forward progress:
	 */
	can_free = btree_cache_can_free(bc);
	nr = min_t(unsigned long, nr, can_free);

	i = 0;
	list_for_each_entry_safe(b, t, &bc->freeable, list) {
		/*
		 * Leave a few nodes on the freeable list, so that a btree split
		 * won't have to hit the system allocator:
		 */
		if (++i <= 3)
			continue;

		touched++;

		if (touched >= nr)
			goto out;

		if (!btree_node_reclaim(c, b)) {
			btree_node_data_free(c, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);
			freed++;
		}
	}
restart:
	list_for_each_entry_safe(b, t, &bc->live, list) {
		touched++;

		if (btree_node_accessed(b)) {
			clear_btree_node_accessed(b);
		} else if (!btree_node_reclaim(c, b)) {
			freed++;
			btree_node_data_free(c, b);

			bch2_btree_node_hash_remove(bc, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);

			if (freed == nr)
				goto out_rotate;
		} else if (trigger_writes &&
			   btree_node_dirty(b) &&
			   !btree_node_will_make_reachable(b) &&
			   !btree_node_write_blocked(b) &&
			   six_trylock_read(&b->c.lock)) {
			list_move(&bc->live, &b->list);
			mutex_unlock(&bc->lock);
			__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
			six_unlock_read(&b->c.lock);
			if (touched >= nr)
				goto out_nounlock;
			mutex_lock(&bc->lock);
			goto restart;
		}

		if (touched >= nr)
			break;
	}
out_rotate:
	if (&t->list != &bc->live)
		list_move_tail(&bc->live, &t->list);
out:
	mutex_unlock(&bc->lock);
out_nounlock:
	ret = freed;
	memalloc_nofs_restore(flags);
	trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
	return ret;
}

static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
					    struct shrink_control *sc)
{
	struct bch_fs *c = shrink->private_data;
	struct btree_cache *bc = &c->btree_cache;

	if (bch2_btree_shrinker_disabled)
		return 0;

	return btree_cache_can_free(bc);
}

void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	unsigned i, flags;

	shrinker_free(bc->shrink);

	/* vfree() can allocate memory: */
	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

	if (c->verify_data)
		list_move(&c->verify_data->list, &bc->live);

	kvfree(c->verify_ondisk);

	for (i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->b)
			list_add(&r->b->list, &bc->live);
	}

	list_splice(&bc->freeable, &bc->live);

	while (!list_empty(&bc->live)) {
		b = list_first_entry(&bc->live, struct btree, list);

		BUG_ON(btree_node_read_in_flight(b) ||
		       btree_node_write_in_flight(b));

		btree_node_data_free(c, b);
	}

	BUG_ON(!bch2_journal_error(&c->journal) &&
	       atomic_read(&c->btree_cache.dirty));

	list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);

	while (!list_empty(&bc->freed_nonpcpu)) {
		b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
		list_del(&b->list);
		six_lock_exit(&b->c.lock);
		kfree(b);
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);

	if (bc->table_init_done)
		rhashtable_destroy(&bc->table);
}
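/*
 * Set up the btree node cache: initialize the hash table, preallocate
 * bc->reserve nodes onto the freeable list, and register the shrinker that
 * gives cache memory back to the system under pressure.
 */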
int bch2_fs_btree_cache_init(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct shrinker *shrink;
	unsigned i;
	int ret = 0;

	ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
	if (ret)
		goto err;

	bc->table_init_done = true;

	bch2_recalc_btree_reserve(c);

	for (i = 0; i < bc->reserve; i++)
		if (!__bch2_btree_node_mem_alloc(c))
			goto err;

	list_splice_init(&bc->live, &bc->freeable);

	mutex_init(&c->verify_lock);

	shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
	if (!shrink)
		goto err;
	bc->shrink = shrink;
	shrink->count_objects = bch2_btree_cache_count;
	shrink->scan_objects = bch2_btree_cache_scan;
	shrink->seeks = 4;
	shrink->private_data = c;
	shrinker_register(shrink);

	return 0;
err:
	return -BCH_ERR_ENOMEM_fs_btree_cache_init;
}

void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
	mutex_init(&bc->lock);
	INIT_LIST_HEAD(&bc->live);
	INIT_LIST_HEAD(&bc->freeable);
	INIT_LIST_HEAD(&bc->freed_pcpu);
	INIT_LIST_HEAD(&bc->freed_nonpcpu);
}

/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which a
 * cannibalize_bucket() will take. This means every time we unlock the root of
 * the btree, we need to release this lock if we have it held.
 */
void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;

	if (bc->alloc_lock == current) {
		trace_and_count(c, btree_cache_cannibalize_unlock, trans);
		bc->alloc_lock = NULL;
		closure_wake_up(&bc->alloc_wait);
	}
}

int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct task_struct *old;

	old = cmpxchg(&bc->alloc_lock, NULL, current);
	if (old == NULL || old == current)
		goto success;

	if (!cl) {
		trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
		return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
	}

	closure_wait(&bc->alloc_wait, cl);

	/* Try again, after adding ourselves to waitlist */
	old = cmpxchg(&bc->alloc_lock, NULL, current);
	if (old == NULL || old == current) {
		/* We raced */
		closure_wake_up(&bc->alloc_wait);
		goto success;
	}

	trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
	return -BCH_ERR_btree_cache_cannibalize_lock_blocked;

success:
	trace_and_count(c, btree_cache_cannibalize_lock, trans);
	return 0;
}
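/*
 * Steal memory from another cached node: scan the live list from the cold end
 * for a node we can reclaim outright, then retry while also flushing dirty
 * nodes; if everything is locked, just busy-wait.
 */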
static struct btree *btree_node_cannibalize(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	list_for_each_entry_reverse(b, &bc->live, list)
		if (!btree_node_reclaim(c, b))
			return b;

	while (1) {
		list_for_each_entry_reverse(b, &bc->live, list)
			if (!btree_node_write_and_reclaim(c, b))
				return b;

		/*
		 * Rare case: all nodes were intent-locked.
		 * Just busy-wait.
		 */
		WARN_ONCE(1, "btree cache cannibalize failed\n");
		cond_resched();
	}
}

struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct list_head *freed = pcpu_read_locks
		? &bc->freed_pcpu
		: &bc->freed_nonpcpu;
	struct btree *b, *b2;
	u64 start_time = local_clock();
	unsigned flags;

	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

	/*
	 * We never free struct btree itself, just the memory that holds the on
	 * disk node. Check the freed list before allocating a new one:
	 */
	list_for_each_entry(b, freed, list)
		if (!btree_node_reclaim(c, b)) {
			list_del_init(&b->list);
			goto got_node;
		}

	b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
	if (!b) {
		mutex_unlock(&bc->lock);
		bch2_trans_unlock(trans);
		b = __btree_node_mem_alloc(c, GFP_KERNEL);
		if (!b)
			goto err;
		mutex_lock(&bc->lock);
	}

	bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);

	BUG_ON(!six_trylock_intent(&b->c.lock));
	BUG_ON(!six_trylock_write(&b->c.lock));
got_node:

	/*
	 * btree_free() doesn't free memory; it sticks the node on the end of
	 * the list. Check if there's any freed nodes there:
	 */
	list_for_each_entry(b2, &bc->freeable, list)
		if (!btree_node_reclaim(c, b2)) {
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);
			btree_node_to_freedlist(bc, b2);
			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
			goto got_mem;
		}

	mutex_unlock(&bc->lock);

	if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
		bch2_trans_unlock(trans);
		if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
			goto err;
	}

	mutex_lock(&bc->lock);
	bc->used++;
got_mem:
	mutex_unlock(&bc->lock);

	BUG_ON(btree_node_hashed(b));
	BUG_ON(btree_node_dirty(b));
	BUG_ON(btree_node_write_in_flight(b));
out:
	b->flags = 0;
	b->written = 0;
	b->nsets = 0;
	b->sib_u64s[0] = 0;
	b->sib_u64s[1] = 0;
	b->whiteout_u64s = 0;
	bch2_btree_keys_init(b);
	set_btree_node_accessed(b);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
			       start_time);

	memalloc_nofs_restore(flags);
	return b;
err:
	mutex_lock(&bc->lock);

	/* Try to cannibalize another cached btree node: */
	if (bc->alloc_lock == current) {
		b2 = btree_node_cannibalize(c);
		clear_btree_node_just_written(b2);
		bch2_btree_node_hash_remove(bc, b2);

		if (b) {
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);
			btree_node_to_freedlist(bc, b2);
			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
		} else {
			b = b2;
			list_del_init(&b->list);
		}

		mutex_unlock(&bc->lock);

		trace_and_count(c, btree_cache_cannibalize, trans);
		goto out;
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);
	return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
}
/* Slowpath, don't want it inlined into btree_iter_traverse() */
static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
						   struct btree_path *path,
						   const struct bkey_i *k,
						   enum btree_id btree_id,
						   unsigned level,
						   enum six_lock_type lock_type,
						   bool sync)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	u32 seq;

	BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
	/*
	 * Parent node must be locked, else we could read in a btree node that's
	 * been freed:
	 */
	if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
		trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
	}

	b = bch2_btree_node_mem_alloc(trans, level != 0);

	if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
		if (!path)
			return b;

		trans->memory_allocation_failure = true;
		trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
	}

	if (IS_ERR(b))
		return b;

	bkey_copy(&b->key, k);
	if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
		/* raced with another fill: */

		/* mark as unhashed... */
		b->hash_val = 0;

		mutex_lock(&bc->lock);
		list_add(&b->list, &bc->freeable);
		mutex_unlock(&bc->lock);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		return NULL;
	}

	set_btree_node_read_in_flight(b);

	six_unlock_write(&b->c.lock);
	seq = six_lock_seq(&b->c.lock);
	six_unlock_intent(&b->c.lock);

	/* Unlock before doing IO: */
	if (path && sync)
		bch2_trans_unlock_noassert(trans);

	bch2_btree_node_read(trans, b, sync);

	if (!sync)
		return NULL;

	if (path) {
		int ret = bch2_trans_relock(trans) ?:
			bch2_btree_path_relock_intent(trans, path);
		if (ret) {
			BUG_ON(!trans->restarted);
			return ERR_PTR(ret);
		}
	}

	if (!six_relock_type(&b->c.lock, lock_type, seq)) {
		BUG_ON(!path);

		trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
	}

	return b;
}

static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
{
	struct printbuf buf = PRINTBUF;

	if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
		return;

	prt_printf(&buf,
		   "btree node header doesn't match ptr\n"
		   "btree %s level %u\n"
		   "ptr: ",
		   bch2_btree_id_str(b->c.btree_id), b->c.level);
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	prt_printf(&buf, "\nheader: btree %s level %llu\n"
		   "min ",
		   bch2_btree_id_str(BTREE_NODE_ID(b->data)),
		   BTREE_NODE_LEVEL(b->data));
	bch2_bpos_to_text(&buf, b->data->min_key);

	prt_printf(&buf, "\nmax ");
	bch2_bpos_to_text(&buf, b->data->max_key);

	bch2_fs_topology_error(c, "%s", buf.buf);

	printbuf_exit(&buf);
}

static inline void btree_check_header(struct bch_fs *c, struct btree *b)
{
	if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
	    b->c.level != BTREE_NODE_LEVEL(b->data) ||
	    !bpos_eq(b->data->max_key, b->key.k.p) ||
	    (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
	     !bpos_eq(b->data->min_key,
		      bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
		btree_bad_header(c, b);
}
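/*
 * Slow path of bch2_btree_node_get(): look the node up in the hash table,
 * filling it in from disk on a miss, then take the requested lock and recheck
 * that the node wasn't evicted or reused while we were waiting.
 */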
static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
					   const struct bkey_i *k, unsigned level,
					   enum six_lock_type lock_type,
					   unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	struct bset_tree *t;
	bool need_relock = false;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		/*
		 * We must have the parent locked to call bch2_btree_node_fill(),
		 * else we could read in a btree node from disk that's been
		 * freed:
		 */
		b = bch2_btree_node_fill(trans, path, k, path->btree_id,
					 level, lock_type, true);
		need_relock = true;

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b))
			return b;
	} else {
		if (btree_node_read_locked(path, level + 1))
			btree_node_unlock(trans, path, level + 1);

		ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			return ERR_PTR(ret);

		BUG_ON(ret);

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.level != level ||
			     race_fault())) {
			six_unlock_type(&b->c.lock, lock_type);
			if (bch2_btree_node_relock(trans, path, level + 1))
				goto retry;

			trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
			return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
		}

		/* avoid atomic set bit if it's not needed: */
		if (!btree_node_accessed(b))
			set_btree_node_accessed(b);
	}

	if (unlikely(btree_node_read_in_flight(b))) {
		u32 seq = six_lock_seq(&b->c.lock);

		six_unlock_type(&b->c.lock, lock_type);
		bch2_trans_unlock(trans);
		need_relock = true;

		bch2_btree_node_wait_on_read(b);

		/*
		 * should_be_locked is not set on this path yet, so we need to
		 * relock it specifically:
		 */
		if (!six_relock_type(&b->c.lock, lock_type, seq))
			goto retry;
	}

	if (unlikely(need_relock)) {
		ret = bch2_trans_relock(trans) ?:
			bch2_btree_path_relock_intent(trans, path);
		if (ret) {
			six_unlock_type(&b->c.lock, lock_type);
			return ERR_PTR(ret);
		}
	}

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-BCH_ERR_btree_node_read_error);
	}

	EBUG_ON(b->c.btree_id != path->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}

/**
 * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * @trans: btree transaction object
 * @path: btree_path being traversed
 * @k: pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
 * @level: level of btree node being looked up (0 == leaf node)
 * @lock_type: SIX_LOCK_read or SIX_LOCK_intent
 * @trace_ip: ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
 *
 * The btree node will have either a read or an intent lock held, depending on
 * @lock_type.
 *
 * Returns: btree node or ERR_PTR()
 */
struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
				  const struct bkey_i *k, unsigned level,
				  enum six_lock_type lock_type,
				  unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree *b;
	struct bset_tree *t;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_node_mem_ptr(k);

	/*
	 * Check b->hash_val _before_ calling btree_node_lock() - this might not
	 * be the node we want anymore, and trying to lock the wrong node could
	 * cause an unnecessary transaction restart:
	 */
	if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
		     !b ||
		     b->hash_val != btree_ptr_hash_val(k)))
		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

	if (btree_node_read_locked(path, level + 1))
		btree_node_unlock(trans, path, level + 1);

	ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		return ERR_PTR(ret);

	BUG_ON(ret);

	if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
		     b->c.level != level ||
		     race_fault())) {
		six_unlock_type(&b->c.lock, lock_type);
		if (bch2_btree_node_relock(trans, path, level + 1))
			return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

		trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
	}

	if (unlikely(btree_node_read_in_flight(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
	}

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-BCH_ERR_btree_node_read_error);
	}

	EBUG_ON(b->c.btree_id != path->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}
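/*
 * Variant of bch2_btree_node_get() for callers that don't have a btree_path:
 * the node is read-locked via the nopath helpers, and with @nofill a cache
 * miss returns NULL instead of reading the node in from disk.
 */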
struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
					 const struct bkey_i *k,
					 enum btree_id btree_id,
					 unsigned level,
					 bool nofill)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	struct bset_tree *t;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	if (c->opts.btree_node_mem_ptr_optimization) {
		b = btree_node_mem_ptr(k);
		if (b)
			goto lock_node;
	}
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		if (nofill)
			goto out;

		b = bch2_btree_node_fill(trans, NULL, k, btree_id,
					 level, SIX_LOCK_read, true);

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b) &&
		    !bch2_btree_cache_cannibalize_lock(trans, NULL))
			goto retry;

		if (IS_ERR(b))
			goto out;
	} else {
lock_node:
		ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			return ERR_PTR(ret);

		BUG_ON(ret);

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.btree_id != btree_id ||
			     b->c.level != level)) {
			six_unlock_read(&b->c.lock);
			goto retry;
		}
	}

	/* XXX: waiting on IO with btree locks held: */
	__bch2_btree_node_wait_on_read(b);

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_read(&b->c.lock);
		b = ERR_PTR(-BCH_ERR_btree_node_read_error);
		goto out;
	}

	EBUG_ON(b->c.btree_id != btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);
out:
	bch2_btree_cache_cannibalize_unlock(trans);
	return b;
}

int bch2_btree_node_prefetch(struct btree_trans *trans,
			     struct btree_path *path,
			     const struct bkey_i *k,
			     enum btree_id btree_id, unsigned level)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	BUG_ON(path && !btree_node_locked(path, level + 1));
	BUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_cache_find(bc, k);
	if (b)
		return 0;

	b = bch2_btree_node_fill(trans, path, k, btree_id,
				 level, SIX_LOCK_read, false);
	return PTR_ERR_OR_ZERO(b);
}
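/*
 * Drop a node from the in-memory cache entirely: wait for in-flight IO, flush
 * it if dirty, then free its buffers and remove it from the hash table.
 */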
void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	b = btree_cache_find(bc, k);
	if (!b)
		return;

	BUG_ON(b == btree_node_root(trans->c, b));
wait_on_io:
	/* not allowed to wait on io with btree locks held: */

	/* XXX we're called from btree_gc which will be holding other btree
	 * nodes locked
	 */
	__bch2_btree_node_wait_on_read(b);
	__bch2_btree_node_wait_on_write(b);

	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);

	if (btree_node_dirty(b)) {
		__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}

	BUG_ON(btree_node_dirty(b));

	mutex_lock(&bc->lock);
	btree_node_data_free(c, b);
	bch2_btree_node_hash_remove(bc, b);
	mutex_unlock(&bc->lock);

	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
}

const char *bch2_btree_id_str(enum btree_id btree)
{
	return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)";
}

void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
	prt_printf(out, "%s level %u/%u\n ",
		   bch2_btree_id_str(b->c.btree_id),
		   b->c.level,
		   bch2_btree_id_root(c, b->c.btree_id)->level);
	bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
}

void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
	struct bset_stats stats;

	memset(&stats, 0, sizeof(stats));

	bch2_btree_keys_stats(b, &stats);

	prt_printf(out, "l %u ", b->c.level);
	bch2_bpos_to_text(out, b->data->min_key);
	prt_printf(out, " - ");
	bch2_bpos_to_text(out, b->data->max_key);
	prt_printf(out, ":\n"
		   " ptrs: ");
	bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
	prt_newline(out);

	prt_printf(out,
		   " format: ");
	bch2_bkey_format_to_text(out, &b->format);

	prt_printf(out,
		   " unpack fn len: %u\n"
		   " bytes used %zu/%zu (%zu%% full)\n"
		   " sib u64s: %u, %u (merge threshold %u)\n"
		   " nr packed keys %u\n"
		   " nr unpacked keys %u\n"
		   " floats %zu\n"
		   " failed unpacked %zu\n",
		   b->unpack_fn_len,
		   b->nr.live_u64s * sizeof(u64),
		   btree_buf_bytes(b) - sizeof(struct btree_node),
		   b->nr.live_u64s * 100 / btree_max_u64s(c),
		   b->sib_u64s[0],
		   b->sib_u64s[1],
		   c->btree_foreground_merge_threshold,
		   b->nr.packed_keys,
		   b->nr.unpacked_keys,
		   stats.floats,
		   stats.failed);
}

void bch2_btree_cache_to_text(struct printbuf *out, const struct bch_fs *c)
{
	prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used);
	prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty));
	prt_printf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock);
}