// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bbpos.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
#include "trace.h"

#include <linux/prefetch.h>
#include <linux/sched/mm.h>

const char * const bch2_btree_node_flags[] = {
#define x(f)	#f,
	BTREE_FLAGS()
#undef x
	NULL
};

void bch2_recalc_btree_reserve(struct bch_fs *c)
{
	unsigned i, reserve = 16;

	if (!c->btree_roots_known[0].b)
		reserve += 8;

	for (i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->b)
			reserve += min_t(unsigned, 1, r->b->c.level) * 8;
	}

	c->btree_cache.reserve = reserve;
}

static inline unsigned btree_cache_can_free(struct btree_cache *bc)
{
	return max_t(int, 0, bc->used - bc->reserve);
}

static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
{
	if (b->c.lock.readers)
		list_move(&b->list, &bc->freed_pcpu);
	else
		list_move(&b->list, &bc->freed_nonpcpu);
}

static void btree_node_data_free(struct bch_fs *c, struct btree *b)
{
	struct btree_cache *bc = &c->btree_cache;

	EBUG_ON(btree_node_write_in_flight(b));

	clear_btree_node_just_written(b);

	kvfree(b->data);
	b->data = NULL;
#ifdef __KERNEL__
	kvfree(b->aux_data);
#else
	munmap(b->aux_data, btree_aux_data_bytes(b));
#endif
	b->aux_data = NULL;

	bc->used--;

	btree_node_to_freedlist(bc, b);
}

static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
				   const void *obj)
{
	const struct btree *b = obj;
	const u64 *v = arg->key;

	return b->hash_val == *v ? 0 : 1;
}

static const struct rhashtable_params bch_btree_cache_params = {
	.head_offset	= offsetof(struct btree, hash),
	.key_offset	= offsetof(struct btree, hash_val),
	.key_len	= sizeof(u64),
	.obj_cmpfn	= bch2_btree_cache_cmp_fn,
};

static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
	BUG_ON(b->data || b->aux_data);

	b->data = kvmalloc(btree_buf_bytes(b), gfp);
	if (!b->data)
		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
#ifdef __KERNEL__
	b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
#else
	b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
			   PROT_READ|PROT_WRITE|PROT_EXEC,
			   MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
	if (b->aux_data == MAP_FAILED)
		b->aux_data = NULL;
#endif
	if (!b->aux_data) {
		kvfree(b->data);
		b->data = NULL;
		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
	}

	return 0;
}

static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
{
	struct btree *b;

	b = kzalloc(sizeof(struct btree), gfp);
	if (!b)
		return NULL;

	bkey_btree_ptr_init(&b->key);
	INIT_LIST_HEAD(&b->list);
	INIT_LIST_HEAD(&b->write_blocked);
	b->byte_order = ilog2(c->opts.btree_node_size);
	return b;
}

struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	b = __btree_node_mem_alloc(c, GFP_KERNEL);
	if (!b)
		return NULL;

	if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
		kfree(b);
		return NULL;
	}

	bch2_btree_lock_init(&b->c, 0);

	bc->used++;
	list_add(&b->list, &bc->freeable);
	return b;
}

/* Btree in memory cache - hash table */

void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
	int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);

	BUG_ON(ret);

	/* Cause future lookups for this node to fail: */
	b->hash_val = 0;
}

int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(b->hash_val);
	b->hash_val = btree_ptr_hash_val(&b->key);

	return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
					     bch_btree_cache_params);
}

int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
				unsigned level, enum btree_id id)
{
	int ret;

	b->c.level	= level;
	b->c.btree_id	= id;

	mutex_lock(&bc->lock);
	ret = __bch2_btree_node_hash_insert(bc, b);
	if (!ret)
		list_add_tail(&b->list, &bc->live);
	mutex_unlock(&bc->lock);

	return ret;
}

__flatten
static inline struct btree *btree_cache_find(struct btree_cache *bc,
					     const struct bkey_i *k)
{
	u64 v = btree_ptr_hash_val(k);

	return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
}

/*
 * this version is for btree nodes that have already been freed (we're not
 * reaping a real btree node)
 */
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
{
	struct btree_cache *bc = &c->btree_cache;
	int ret = 0;

	lockdep_assert_held(&bc->lock);

	struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);

	u64 mask = b->c.level
		? bc->pinned_nodes_interior_mask
		: bc->pinned_nodes_leaf_mask;

	if ((mask & BIT_ULL(b->c.btree_id)) &&
	    bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
	    bbpos_cmp(bc->pinned_nodes_end, pos) >= 0)
		return -BCH_ERR_ENOMEM_btree_node_reclaim;

wait_on_io:
	if (b->flags & ((1U << BTREE_NODE_dirty)|
			(1U << BTREE_NODE_read_in_flight)|
			(1U << BTREE_NODE_write_in_flight))) {
		if (!flush)
			return -BCH_ERR_ENOMEM_btree_node_reclaim;

		/* XXX: waiting on IO with btree cache lock held */
		bch2_btree_node_wait_on_read(b);
		bch2_btree_node_wait_on_write(b);
	}

	if (!six_trylock_intent(&b->c.lock))
		return -BCH_ERR_ENOMEM_btree_node_reclaim;

	if (!six_trylock_write(&b->c.lock))
		goto out_unlock_intent;

	/* recheck under lock */
	if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
			(1U << BTREE_NODE_write_in_flight))) {
		if (!flush)
			goto out_unlock;
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}

	if (btree_node_noevict(b) ||
	    btree_node_write_blocked(b) ||
	    btree_node_will_make_reachable(b))
		goto out_unlock;

	if (btree_node_dirty(b)) {
		if (!flush)
			goto out_unlock;
		/*
		 * Using the underscore version because we don't want to compact
		 * bsets after the write, since this node is about to be evicted
		 * - unless btree verify mode is enabled, since it runs out of
		 * the post write cleanup:
		 */
		if (bch2_verify_btree_ondisk)
			bch2_btree_node_write(c, b, SIX_LOCK_intent,
					      BTREE_WRITE_cache_reclaim);
		else
			__bch2_btree_node_write(c, b,
						BTREE_WRITE_cache_reclaim);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}
out:
	if (b->hash_val && !ret)
		trace_and_count(c, btree_cache_reap, c, b);
	return ret;
out_unlock:
	six_unlock_write(&b->c.lock);
out_unlock_intent:
	six_unlock_intent(&b->c.lock);
	ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
	goto out;
}

static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, false);
}

static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, true);
}

static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
					   struct shrink_control *sc)
{
	struct bch_fs *c = shrink->private_data;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b, *t;
	unsigned long nr = sc->nr_to_scan;
	unsigned long can_free = 0;
	unsigned long freed = 0;
	unsigned long touched = 0;
	unsigned i, flags;
	unsigned long ret = SHRINK_STOP;
	bool trigger_writes = atomic_read(&bc->dirty) + nr >=
		bc->used * 3 / 4;

	if (bch2_btree_shrinker_disabled)
		return SHRINK_STOP;

	mutex_lock(&bc->lock);
	flags = memalloc_nofs_save();

	/*
	 * It's _really_ critical that we don't free too many btree nodes - we
	 * have to always leave ourselves a reserve. The reserve is how we
	 * guarantee that allocating memory for a new btree node can always
	 * succeed, so that inserting keys into the btree can always succeed and
	 * IO can always make forward progress:
	 */
	can_free = btree_cache_can_free(bc);
	nr = min_t(unsigned long, nr, can_free);

	i = 0;
	list_for_each_entry_safe(b, t, &bc->freeable, list) {
		/*
		 * Leave a few nodes on the freeable list, so that a btree split
		 * won't have to hit the system allocator:
		 */
		if (++i <= 3)
			continue;

		touched++;

		if (touched >= nr)
			goto out;

		if (!btree_node_reclaim(c, b)) {
			btree_node_data_free(c, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);
			freed++;
		}
	}
restart:
	list_for_each_entry_safe(b, t, &bc->live, list) {
		touched++;

		if (btree_node_accessed(b)) {
			clear_btree_node_accessed(b);
		} else if (!btree_node_reclaim(c, b)) {
			freed++;
			btree_node_data_free(c, b);

			bch2_btree_node_hash_remove(bc, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);

			if (freed == nr)
				goto out_rotate;
		} else if (trigger_writes &&
			   btree_node_dirty(b) &&
			   !btree_node_will_make_reachable(b) &&
			   !btree_node_write_blocked(b) &&
			   six_trylock_read(&b->c.lock)) {
			list_move(&bc->live, &b->list);
			mutex_unlock(&bc->lock);
			__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
			six_unlock_read(&b->c.lock);
			if (touched >= nr)
				goto out_nounlock;
			mutex_lock(&bc->lock);
			goto restart;
		}

		if (touched >= nr)
			break;
	}
out_rotate:
	if (&t->list != &bc->live)
		list_move_tail(&bc->live, &t->list);
out:
	mutex_unlock(&bc->lock);
out_nounlock:
	ret = freed;
	memalloc_nofs_restore(flags);
	trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
	return ret;
}

static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
					    struct shrink_control *sc)
{
	struct bch_fs *c = shrink->private_data;
	struct btree_cache *bc = &c->btree_cache;

	if (bch2_btree_shrinker_disabled)
		return 0;

	return btree_cache_can_free(bc);
}

void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	unsigned i, flags;

	shrinker_free(bc->shrink);

	/* vfree() can allocate memory: */
	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

	if (c->verify_data)
		list_move(&c->verify_data->list, &bc->live);

	kvfree(c->verify_ondisk);

	for (i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->b)
			list_add(&r->b->list, &bc->live);
	}

	list_splice(&bc->freeable, &bc->live);

	while (!list_empty(&bc->live)) {
		b = list_first_entry(&bc->live, struct btree, list);

		BUG_ON(btree_node_read_in_flight(b) ||
		       btree_node_write_in_flight(b));

		btree_node_data_free(c, b);
	}

	BUG_ON(!bch2_journal_error(&c->journal) &&
	       atomic_read(&c->btree_cache.dirty));

	list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);

	while (!list_empty(&bc->freed_nonpcpu)) {
		b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
		list_del(&b->list);
		six_lock_exit(&b->c.lock);
		kfree(b);
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);

	if (bc->table_init_done)
		rhashtable_destroy(&bc->table);
}

int bch2_fs_btree_cache_init(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct shrinker *shrink;
	unsigned i;
	int ret = 0;

	ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
	if (ret)
		goto err;

	bc->table_init_done = true;

	bch2_recalc_btree_reserve(c);

	for (i = 0; i < bc->reserve; i++)
		if (!__bch2_btree_node_mem_alloc(c))
			goto err;

	list_splice_init(&bc->live, &bc->freeable);

	mutex_init(&c->verify_lock);

	shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
	if (!shrink)
		goto err;
	bc->shrink = shrink;
	shrink->count_objects	= bch2_btree_cache_count;
	shrink->scan_objects	= bch2_btree_cache_scan;
	shrink->seeks		= 4;
	shrink->private_data	= c;
	shrinker_register(shrink);

	return 0;
err:
	return -BCH_ERR_ENOMEM_fs_btree_cache_init;
}

void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
	mutex_init(&bc->lock);
	INIT_LIST_HEAD(&bc->live);
	INIT_LIST_HEAD(&bc->freeable);
	INIT_LIST_HEAD(&bc->freed_pcpu);
	INIT_LIST_HEAD(&bc->freed_nonpcpu);
}

/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which a
 * cannibalize_bucket() will take. This means every time we unlock the root of
 * the btree, we need to release this lock if we have it held.
 */
void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;

	if (bc->alloc_lock == current) {
		trace_and_count(c, btree_cache_cannibalize_unlock, trans);
		bc->alloc_lock = NULL;
		closure_wake_up(&bc->alloc_wait);
	}
}

int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct task_struct *old;

	old = cmpxchg(&bc->alloc_lock, NULL, current);
	if (old == NULL || old == current)
		goto success;

	if (!cl) {
		trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
		return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
	}

	closure_wait(&bc->alloc_wait, cl);

	/* Try again, after adding ourselves to waitlist */
	old = cmpxchg(&bc->alloc_lock, NULL, current);
	if (old == NULL || old == current) {
		/* We raced */
		closure_wake_up(&bc->alloc_wait);
		goto success;
	}

	trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
	return -BCH_ERR_btree_cache_cannibalize_lock_blocked;

success:
	trace_and_count(c, btree_cache_cannibalize_lock, trans);
	return 0;
}

static struct btree *btree_node_cannibalize(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	list_for_each_entry_reverse(b, &bc->live, list)
		if (!btree_node_reclaim(c, b))
			return b;

	while (1) {
		list_for_each_entry_reverse(b, &bc->live, list)
			if (!btree_node_write_and_reclaim(c, b))
				return b;

		/*
		 * Rare case: all nodes were intent-locked.
		 * Just busy-wait.
		 */
		WARN_ONCE(1, "btree cache cannibalize failed\n");
		cond_resched();
	}
}

struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct list_head *freed = pcpu_read_locks
		? &bc->freed_pcpu
		: &bc->freed_nonpcpu;
	struct btree *b, *b2;
	u64 start_time = local_clock();
	unsigned flags;

	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

	/*
	 * We never free struct btree itself, just the memory that holds the on
	 * disk node. Check the freed list before allocating a new one:
	 */
	list_for_each_entry(b, freed, list)
		if (!btree_node_reclaim(c, b)) {
			list_del_init(&b->list);
			goto got_node;
		}

	b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
	if (!b) {
		mutex_unlock(&bc->lock);
		bch2_trans_unlock(trans);
		b = __btree_node_mem_alloc(c, GFP_KERNEL);
		if (!b)
			goto err;
		mutex_lock(&bc->lock);
	}

	bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);

	BUG_ON(!six_trylock_intent(&b->c.lock));
	BUG_ON(!six_trylock_write(&b->c.lock));
got_node:

	/*
	 * btree_free() doesn't free memory; it sticks the node on the end of
	 * the list. Check if there's any freed nodes there:
	 */
	list_for_each_entry(b2, &bc->freeable, list)
		if (!btree_node_reclaim(c, b2)) {
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);
			btree_node_to_freedlist(bc, b2);
			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
			goto got_mem;
		}

	mutex_unlock(&bc->lock);

	if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
		bch2_trans_unlock(trans);
		if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
			goto err;
	}

	mutex_lock(&bc->lock);
	bc->used++;
got_mem:
	mutex_unlock(&bc->lock);

	BUG_ON(btree_node_hashed(b));
	BUG_ON(btree_node_dirty(b));
	BUG_ON(btree_node_write_in_flight(b));
out:
	b->flags		= 0;
	b->written		= 0;
	b->nsets		= 0;
	b->sib_u64s[0]		= 0;
	b->sib_u64s[1]		= 0;
	b->whiteout_u64s	= 0;
	bch2_btree_keys_init(b);
	set_btree_node_accessed(b);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
			       start_time);

	memalloc_nofs_restore(flags);
	return b;
err:
	mutex_lock(&bc->lock);

	/* Try to cannibalize another cached btree node: */
	if (bc->alloc_lock == current) {
		b2 = btree_node_cannibalize(c);
		clear_btree_node_just_written(b2);
		bch2_btree_node_hash_remove(bc, b2);

		if (b) {
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);
			btree_node_to_freedlist(bc, b2);
			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
		} else {
			b = b2;
			list_del_init(&b->list);
		}

		mutex_unlock(&bc->lock);

		trace_and_count(c, btree_cache_cannibalize, trans);
		goto out;
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);
	return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
}

/* Slowpath, don't want it inlined into btree_iter_traverse() */
static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
						   struct btree_path *path,
						   const struct bkey_i *k,
						   enum btree_id btree_id,
						   unsigned level,
						   enum six_lock_type lock_type,
						   bool sync)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	u32 seq;

	BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
	/*
	 * Parent node must be locked, else we could read in a btree node that's
	 * been freed:
	 */
	if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
		trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
	}

	b = bch2_btree_node_mem_alloc(trans, level != 0);

	if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
		if (!path)
			return b;

		trans->memory_allocation_failure = true;
		trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
	}

	if (IS_ERR(b))
		return b;

	bkey_copy(&b->key, k);
	if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
		/* raced with another fill: */

		/* mark as unhashed... */
		b->hash_val = 0;

		mutex_lock(&bc->lock);
		list_add(&b->list, &bc->freeable);
		mutex_unlock(&bc->lock);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		return NULL;
	}

	set_btree_node_read_in_flight(b);

	six_unlock_write(&b->c.lock);
	seq = six_lock_seq(&b->c.lock);
	six_unlock_intent(&b->c.lock);

	/* Unlock before doing IO: */
	if (path && sync)
		bch2_trans_unlock_noassert(trans);

	bch2_btree_node_read(trans, b, sync);

	if (!sync)
		return NULL;

	if (path) {
		int ret = bch2_trans_relock(trans) ?:
			bch2_btree_path_relock_intent(trans, path);
		if (ret) {
			BUG_ON(!trans->restarted);
			return ERR_PTR(ret);
		}
	}

	if (!six_relock_type(&b->c.lock, lock_type, seq)) {
		BUG_ON(!path);

		trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
	}

	return b;
}

static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
{
	struct printbuf buf = PRINTBUF;

	if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
		return;

	prt_printf(&buf,
		   "btree node header doesn't match ptr\n"
		   "btree %s level %u\n"
		   "ptr: ",
		   bch2_btree_id_str(b->c.btree_id), b->c.level);
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	prt_printf(&buf, "\nheader: btree %s level %llu\n"
		   "min ",
		   bch2_btree_id_str(BTREE_NODE_ID(b->data)),
		   BTREE_NODE_LEVEL(b->data));
	bch2_bpos_to_text(&buf, b->data->min_key);

	prt_printf(&buf, "\nmax ");
	bch2_bpos_to_text(&buf, b->data->max_key);

	bch2_fs_inconsistent(c, "%s", buf.buf);
	printbuf_exit(&buf);
}

static inline void btree_check_header(struct bch_fs *c, struct btree *b)
{
	if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
	    b->c.level != BTREE_NODE_LEVEL(b->data) ||
	    !bpos_eq(b->data->max_key, b->key.k.p) ||
	    (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
	     !bpos_eq(b->data->min_key,
		      bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
		btree_bad_header(c, b);
}

static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
					   const struct bkey_i *k, unsigned level,
					   enum six_lock_type lock_type,
					   unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	struct bset_tree *t;
	bool need_relock = false;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		/*
		 * We must have the parent locked to call bch2_btree_node_fill(),
		 * else we could read in a btree node from disk that's been
		 * freed:
		 */
		b = bch2_btree_node_fill(trans, path, k, path->btree_id,
					 level, lock_type, true);
		need_relock = true;

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b))
			return b;
	} else {
		if (btree_node_read_locked(path, level + 1))
			btree_node_unlock(trans, path, level + 1);

		ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			return ERR_PTR(ret);

		BUG_ON(ret);

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.level != level ||
			     race_fault())) {
			six_unlock_type(&b->c.lock, lock_type);
			if (bch2_btree_node_relock(trans, path, level + 1))
				goto retry;

			trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
			return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
		}

		/* avoid atomic set bit if it's not needed: */
		if (!btree_node_accessed(b))
			set_btree_node_accessed(b);
	}

	if (unlikely(btree_node_read_in_flight(b))) {
		u32 seq = six_lock_seq(&b->c.lock);

		six_unlock_type(&b->c.lock, lock_type);
		bch2_trans_unlock(trans);
		need_relock = true;

		bch2_btree_node_wait_on_read(b);

		/*
		 * should_be_locked is not set on this path yet, so we need to
		 * relock it specifically:
		 */
		if (!six_relock_type(&b->c.lock, lock_type, seq))
			goto retry;
	}

	if (unlikely(need_relock)) {
		ret = bch2_trans_relock(trans) ?:
			bch2_btree_path_relock_intent(trans, path);
		if (ret) {
			six_unlock_type(&b->c.lock, lock_type);
			return ERR_PTR(ret);
		}
	}

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-BCH_ERR_btree_node_read_error);
	}

	EBUG_ON(b->c.btree_id != path->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}

/**
 * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * @trans:	btree transaction object
 * @path:	btree_path being traversed
 * @k:		pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
 * @level:	level of btree node being looked up (0 == leaf node)
 * @lock_type:	SIX_LOCK_read or SIX_LOCK_intent
 * @trace_ip:	ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
 *
 * The btree node will have either a read or a write lock held, depending on
 * the @write parameter.
 *
 * Returns: btree node or ERR_PTR()
 */
struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
				  const struct bkey_i *k, unsigned level,
				  enum six_lock_type lock_type,
				  unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree *b;
	struct bset_tree *t;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_node_mem_ptr(k);

	/*
	 * Check b->hash_val _before_ calling btree_node_lock() - this might not
	 * be the node we want anymore, and trying to lock the wrong node could
	 * cause an unneccessary transaction restart:
	 */
	if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
		     !b ||
		     b->hash_val != btree_ptr_hash_val(k)))
		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

	if (btree_node_read_locked(path, level + 1))
		btree_node_unlock(trans, path, level + 1);

	ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		return ERR_PTR(ret);

	BUG_ON(ret);

	if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
		     b->c.level != level ||
		     race_fault())) {
		six_unlock_type(&b->c.lock, lock_type);
		if (bch2_btree_node_relock(trans, path, level + 1))
			return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

		trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
	}

	if (unlikely(btree_node_read_in_flight(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
	}

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-BCH_ERR_btree_node_read_error);
	}

	EBUG_ON(b->c.btree_id != path->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}

struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
					 const struct bkey_i *k,
					 enum btree_id btree_id,
					 unsigned level,
					 bool nofill)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	struct bset_tree *t;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	if (c->opts.btree_node_mem_ptr_optimization) {
		b = btree_node_mem_ptr(k);
		if (b)
			goto lock_node;
	}
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		if (nofill)
			goto out;

		b = bch2_btree_node_fill(trans, NULL, k, btree_id,
					 level, SIX_LOCK_read, true);

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b) &&
		    !bch2_btree_cache_cannibalize_lock(trans, NULL))
			goto retry;

		if (IS_ERR(b))
			goto out;
	} else {
lock_node:
		ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			return ERR_PTR(ret);

		BUG_ON(ret);

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.btree_id != btree_id ||
			     b->c.level != level)) {
			six_unlock_read(&b->c.lock);
			goto retry;
		}
	}

	/* XXX: waiting on IO with btree locks held: */
	__bch2_btree_node_wait_on_read(b);

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_read(&b->c.lock);
		b = ERR_PTR(-BCH_ERR_btree_node_read_error);
		goto out;
	}

	EBUG_ON(b->c.btree_id != btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);
out:
	bch2_btree_cache_cannibalize_unlock(trans);
	return b;
}

int bch2_btree_node_prefetch(struct btree_trans *trans,
			     struct btree_path *path,
			     const struct bkey_i *k,
			     enum btree_id btree_id, unsigned level)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	BUG_ON(path && !btree_node_locked(path, level + 1));
	BUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_cache_find(bc, k);
	if (b)
		return 0;

	b = bch2_btree_node_fill(trans, path, k, btree_id,
				 level, SIX_LOCK_read, false);
	return PTR_ERR_OR_ZERO(b);
}

void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	b = btree_cache_find(bc, k);
	if (!b)
		return;
wait_on_io:
	/* not allowed to wait on io with btree locks held: */

	/* XXX we're called from btree_gc which will be holding other btree
	 * nodes locked
	 */
	__bch2_btree_node_wait_on_read(b);
	__bch2_btree_node_wait_on_write(b);

	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);

	if (btree_node_dirty(b)) {
		__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}

	BUG_ON(btree_node_dirty(b));

	mutex_lock(&bc->lock);
	btree_node_data_free(c, b);
	bch2_btree_node_hash_remove(bc, b);
	mutex_unlock(&bc->lock);

	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
}

const char *bch2_btree_id_str(enum btree_id btree)
{
	return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)";
}

void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
	prt_printf(out, "%s level %u/%u\n  ",
		   bch2_btree_id_str(b->c.btree_id),
		   b->c.level,
		   bch2_btree_id_root(c, b->c.btree_id)->level);
	bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
}

void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
	struct bset_stats stats;

	memset(&stats, 0, sizeof(stats));

	bch2_btree_keys_stats(b, &stats);

	prt_printf(out, "l %u ", b->c.level);
	bch2_bpos_to_text(out, b->data->min_key);
	prt_printf(out, " - ");
	bch2_bpos_to_text(out, b->data->max_key);
	prt_printf(out, ":\n"
		   "    ptrs: ");
	bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
	prt_newline(out);

	prt_printf(out,
		   "    format: ");
	bch2_bkey_format_to_text(out, &b->format);

	prt_printf(out,
		   "    unpack fn len: %u\n"
		   "    bytes used %zu/%zu (%zu%% full)\n"
		   "    sib u64s: %u, %u (merge threshold %u)\n"
		   "    nr packed keys %u\n"
		   "    nr unpacked keys %u\n"
		   "    floats %zu\n"
		   "    failed unpacked %zu\n",
		   b->unpack_fn_len,
		   b->nr.live_u64s * sizeof(u64),
		   btree_buf_bytes(b) - sizeof(struct btree_node),
		   b->nr.live_u64s * 100 / btree_max_u64s(c),
		   b->sib_u64s[0],
		   b->sib_u64s[1],
		   c->btree_foreground_merge_threshold,
		   b->nr.packed_keys,
		   b->nr.unpacked_keys,
		   stats.floats,
		   stats.failed);
}

void bch2_btree_cache_to_text(struct printbuf *out, const struct bch_fs *c)
{
	prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used);
	prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty));
	prt_printf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock);
}