// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
#include "errcode.h"
#include "error.h"
#include "trace.h"

#include <linux/prefetch.h>
#include <linux/sched/mm.h>

const char * const bch2_btree_node_flags[] = {
#define x(f) #f,
	BTREE_FLAGS()
#undef x
	NULL
};

void bch2_recalc_btree_reserve(struct bch_fs *c)
{
	unsigned i, reserve = 16;

	if (!c->btree_roots_known[0].b)
		reserve += 8;

	for (i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->b)
			reserve += min_t(unsigned, 1, r->b->c.level) * 8;
	}

	c->btree_cache.reserve = reserve;
}

static inline unsigned btree_cache_can_free(struct btree_cache *bc)
{
	return max_t(int, 0, bc->used - bc->reserve);
}

static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
{
	if (b->c.lock.readers)
		list_move(&b->list, &bc->freed_pcpu);
	else
		list_move(&b->list, &bc->freed_nonpcpu);
}

static void btree_node_data_free(struct bch_fs *c, struct btree *b)
{
	struct btree_cache *bc = &c->btree_cache;

	EBUG_ON(btree_node_write_in_flight(b));

	clear_btree_node_just_written(b);

	kvpfree(b->data, btree_bytes(c));
	b->data = NULL;
#ifdef __KERNEL__
	kvfree(b->aux_data);
#else
	munmap(b->aux_data, btree_aux_data_bytes(b));
#endif
	b->aux_data = NULL;

	bc->used--;

	btree_node_to_freedlist(bc, b);
}

static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
				   const void *obj)
{
	const struct btree *b = obj;
	const u64 *v = arg->key;

	return b->hash_val == *v ? 0 : 1;
}
}

static const struct rhashtable_params bch_btree_cache_params = {
	.head_offset	= offsetof(struct btree, hash),
	.key_offset	= offsetof(struct btree, hash_val),
	.key_len	= sizeof(u64),
	.obj_cmpfn	= bch2_btree_cache_cmp_fn,
};

static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
	BUG_ON(b->data || b->aux_data);

	b->data = kvpmalloc(btree_bytes(c), gfp);
	if (!b->data)
		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
#ifdef __KERNEL__
	b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
#else
	b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
			   PROT_READ|PROT_WRITE|PROT_EXEC,
			   MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
	if (b->aux_data == MAP_FAILED)
		b->aux_data = NULL;
#endif
	if (!b->aux_data) {
		kvpfree(b->data, btree_bytes(c));
		b->data = NULL;
		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
	}

	return 0;
}

static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
{
	struct btree *b;

	b = kzalloc(sizeof(struct btree), gfp);
	if (!b)
		return NULL;

	bkey_btree_ptr_init(&b->key);
	INIT_LIST_HEAD(&b->list);
	INIT_LIST_HEAD(&b->write_blocked);
	b->byte_order = ilog2(btree_bytes(c));
	return b;
}

struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	b = __btree_node_mem_alloc(c, GFP_KERNEL);
	if (!b)
		return NULL;

	if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
		kfree(b);
		return NULL;
	}

	bch2_btree_lock_init(&b->c, 0);

	bc->used++;
	list_add(&b->list, &bc->freeable);
	return b;
}

/* Btree in memory cache - hash table */

void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
	int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);

	BUG_ON(ret);

	/* Cause future lookups for this node to fail: */
	b->hash_val = 0;
}

int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(b->hash_val);
	b->hash_val = btree_ptr_hash_val(&b->key);

	return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
					     bch_btree_cache_params);
}

int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
				unsigned level, enum btree_id id)
{
	int ret;

	b->c.level = level;
	b->c.btree_id = id;

	mutex_lock(&bc->lock);
	ret = __bch2_btree_node_hash_insert(bc, b);
	if (!ret)
		list_add_tail(&b->list, &bc->live);
	mutex_unlock(&bc->lock);

	return ret;
}

__flatten
static inline struct btree *btree_cache_find(struct btree_cache *bc,
					     const struct bkey_i *k)
{
	u64 v = btree_ptr_hash_val(k);

	return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
}

/*
 * this version is for btree nodes that have already been freed (we're not
 * reaping a real btree node)
 */
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
{
	struct btree_cache *bc = &c->btree_cache;
	int ret = 0;

	lockdep_assert_held(&bc->lock);
wait_on_io:
	if (b->flags & ((1U << BTREE_NODE_dirty)|
			(1U << BTREE_NODE_read_in_flight)|
			(1U << BTREE_NODE_write_in_flight))) {
		if (!flush)
			return -BCH_ERR_ENOMEM_btree_node_reclaim;

		/* XXX: waiting on IO with btree cache lock held */
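		/*
		 * Only in-flight reads and writes are waited on here; a dirty
		 * node is written out further down, once the node lock is held:
		 */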
		bch2_btree_node_wait_on_read(b);
		bch2_btree_node_wait_on_write(b);
	}

	if (!six_trylock_intent(&b->c.lock))
		return -BCH_ERR_ENOMEM_btree_node_reclaim;

	if (!six_trylock_write(&b->c.lock))
		goto out_unlock_intent;

	/* recheck under lock */
	if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
			(1U << BTREE_NODE_write_in_flight))) {
		if (!flush)
			goto out_unlock;
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}

	if (btree_node_noevict(b) ||
	    btree_node_write_blocked(b) ||
	    btree_node_will_make_reachable(b))
		goto out_unlock;

	if (btree_node_dirty(b)) {
		if (!flush)
			goto out_unlock;
		/*
		 * Using the underscore version because we don't want to compact
		 * bsets after the write, since this node is about to be evicted
		 * - unless btree verify mode is enabled, since it runs out of
		 * the post write cleanup:
		 */
		if (bch2_verify_btree_ondisk)
			bch2_btree_node_write(c, b, SIX_LOCK_intent,
					      BTREE_WRITE_cache_reclaim);
		else
			__bch2_btree_node_write(c, b,
						BTREE_WRITE_cache_reclaim);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}
out:
	if (b->hash_val && !ret)
		trace_and_count(c, btree_cache_reap, c, b);
	return ret;
out_unlock:
	six_unlock_write(&b->c.lock);
out_unlock_intent:
	six_unlock_intent(&b->c.lock);
	ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
	goto out;
}

static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, false);
}

static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, true);
}

static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
					   struct shrink_control *sc)
{
	struct bch_fs *c = container_of(shrink, struct bch_fs,
					btree_cache.shrink);
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b, *t;
	unsigned long nr = sc->nr_to_scan;
	unsigned long can_free = 0;
	unsigned long freed = 0;
	unsigned long touched = 0;
	unsigned i, flags;
	unsigned long ret = SHRINK_STOP;
	bool trigger_writes = atomic_read(&bc->dirty) + nr >=
		bc->used * 3 / 4;

	if (bch2_btree_shrinker_disabled)
		return SHRINK_STOP;

	mutex_lock(&bc->lock);
	flags = memalloc_nofs_save();

	/*
	 * It's _really_ critical that we don't free too many btree nodes - we
	 * have to always leave ourselves a reserve. The reserve is how we
	 * guarantee that allocating memory for a new btree node can always
	 * succeed, so that inserting keys into the btree can always succeed and
	 * IO can always make forward progress:
	 */
	can_free = btree_cache_can_free(bc);
	nr = min_t(unsigned long, nr, can_free);

	i = 0;
	list_for_each_entry_safe(b, t, &bc->freeable, list) {
		/*
		 * Leave a few nodes on the freeable list, so that a btree split
		 * won't have to hit the system allocator:
		 */
		if (++i <= 3)
			continue;

		touched++;

		if (touched >= nr)
			goto out;

		if (!btree_node_reclaim(c, b)) {
			btree_node_data_free(c, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);
			freed++;
		}
	}
restart:
	list_for_each_entry_safe(b, t, &bc->live, list) {
		touched++;

		if (btree_node_accessed(b)) {
			clear_btree_node_accessed(b);
		} else if (!btree_node_reclaim(c, b)) {
			freed++;
			btree_node_data_free(c, b);

			bch2_btree_node_hash_remove(bc, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);

			if (freed == nr)
				goto out_rotate;
		} else if (trigger_writes &&
			   btree_node_dirty(b) &&
			   !btree_node_will_make_reachable(b) &&
			   !btree_node_write_blocked(b) &&
			   six_trylock_read(&b->c.lock)) {
			list_move(&bc->live, &b->list);
			mutex_unlock(&bc->lock);
			__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
			six_unlock_read(&b->c.lock);
			if (touched >= nr)
				goto out_nounlock;
			mutex_lock(&bc->lock);
			goto restart;
		}

		if (touched >= nr)
			break;
	}
out_rotate:
	if (&t->list != &bc->live)
		list_move_tail(&bc->live, &t->list);
out:
	mutex_unlock(&bc->lock);
out_nounlock:
	ret = freed;
	memalloc_nofs_restore(flags);
	trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
	return ret;
}

static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
					    struct shrink_control *sc)
{
	struct bch_fs *c = container_of(shrink, struct bch_fs,
					btree_cache.shrink);
	struct btree_cache *bc = &c->btree_cache;

	if (bch2_btree_shrinker_disabled)
		return 0;

	return btree_cache_can_free(bc);
}

void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	unsigned i, flags;

	unregister_shrinker(&bc->shrink);

	/* vfree() can allocate memory: */
	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

	if (c->verify_data)
		list_move(&c->verify_data->list, &bc->live);

	kvpfree(c->verify_ondisk, btree_bytes(c));

	for (i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->b)
			list_add(&r->b->list, &bc->live);
	}

	list_splice(&bc->freeable, &bc->live);

	while (!list_empty(&bc->live)) {
		b = list_first_entry(&bc->live, struct btree, list);

		BUG_ON(btree_node_read_in_flight(b) ||
		       btree_node_write_in_flight(b));

		if (btree_node_dirty(b))
			bch2_btree_complete_write(c, b, btree_current_write(b));
		clear_btree_node_dirty_acct(c, b);

		btree_node_data_free(c, b);
	}

	BUG_ON(atomic_read(&c->btree_cache.dirty));

	list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);

	while (!list_empty(&bc->freed_nonpcpu)) {
		b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
		list_del(&b->list);
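		/*
		 * struct btree itself is never freed at runtime (nodes go back
		 * on a freed list instead), so tear down the lock and free it
		 * here, at shutdown:
		 */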
		six_lock_exit(&b->c.lock);
		kfree(b);
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);

	if (bc->table_init_done)
		rhashtable_destroy(&bc->table);
}

int bch2_fs_btree_cache_init(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	unsigned i;
	int ret = 0;

	ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
	if (ret)
		goto err;

	bc->table_init_done = true;

	bch2_recalc_btree_reserve(c);

	for (i = 0; i < bc->reserve; i++)
		if (!__bch2_btree_node_mem_alloc(c))
			goto err;

	list_splice_init(&bc->live, &bc->freeable);

	mutex_init(&c->verify_lock);

	bc->shrink.count_objects = bch2_btree_cache_count;
	bc->shrink.scan_objects = bch2_btree_cache_scan;
	bc->shrink.seeks = 4;
	ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name);
	if (ret)
		goto err;

	return 0;
err:
	return -BCH_ERR_ENOMEM_fs_btree_cache_init;
}

void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
	mutex_init(&bc->lock);
	INIT_LIST_HEAD(&bc->live);
	INIT_LIST_HEAD(&bc->freeable);
	INIT_LIST_HEAD(&bc->freed_pcpu);
	INIT_LIST_HEAD(&bc->freed_nonpcpu);
}

/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which a
 * cannibalize_bucket() will take. This means every time we unlock the root of
 * the btree, we need to release this lock if we have it held.
 */
void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;

	if (bc->alloc_lock == current) {
		trace_and_count(c, btree_cache_cannibalize_unlock, c);
		bc->alloc_lock = NULL;
		closure_wake_up(&bc->alloc_wait);
	}
}

int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
{
	struct btree_cache *bc = &c->btree_cache;
	struct task_struct *old;

	old = cmpxchg(&bc->alloc_lock, NULL, current);
	if (old == NULL || old == current)
		goto success;

	if (!cl) {
		trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
		return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
	}

	closure_wait(&bc->alloc_wait, cl);

	/* Try again, after adding ourselves to waitlist */
	old = cmpxchg(&bc->alloc_lock, NULL, current);
	if (old == NULL || old == current) {
		/* We raced */
		closure_wake_up(&bc->alloc_wait);
		goto success;
	}

	trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
	return -BCH_ERR_btree_cache_cannibalize_lock_blocked;

success:
	trace_and_count(c, btree_cache_cannibalize_lock, c);
	return 0;
}

static struct btree *btree_node_cannibalize(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	list_for_each_entry_reverse(b, &bc->live, list)
		if (!btree_node_reclaim(c, b))
			return b;

	while (1) {
		list_for_each_entry_reverse(b, &bc->live, list)
			if (!btree_node_write_and_reclaim(c, b))
				return b;

		/*
		 * Rare case: all nodes were intent-locked.
		 * Just busy-wait.
		 */
		WARN_ONCE(1, "btree cache cannibalize failed\n");
		cond_resched();
	}
}

struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct list_head *freed = pcpu_read_locks
		? &bc->freed_pcpu
		: &bc->freed_nonpcpu;
	struct btree *b, *b2;
	u64 start_time = local_clock();
	unsigned flags;

	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

	/*
	 * We never free struct btree itself, just the memory that holds the on
	 * disk node. Check the freed list before allocating a new one:
	 */
	list_for_each_entry(b, freed, list)
		if (!btree_node_reclaim(c, b)) {
			list_del_init(&b->list);
			goto got_node;
		}

	b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
	if (!b) {
		mutex_unlock(&bc->lock);
		bch2_trans_unlock(trans);
		b = __btree_node_mem_alloc(c, GFP_KERNEL);
		if (!b)
			goto err;
		mutex_lock(&bc->lock);
	}

	bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);

	BUG_ON(!six_trylock_intent(&b->c.lock));
	BUG_ON(!six_trylock_write(&b->c.lock));
got_node:

	/*
	 * btree_free() doesn't free memory; it sticks the node on the end of
	 * the list. Check if there's any freed nodes there:
	 */
	list_for_each_entry(b2, &bc->freeable, list)
		if (!btree_node_reclaim(c, b2)) {
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);
			btree_node_to_freedlist(bc, b2);
			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
			goto got_mem;
		}

	mutex_unlock(&bc->lock);

	if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
		bch2_trans_unlock(trans);
		if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
			goto err;
	}

	mutex_lock(&bc->lock);
	bc->used++;
got_mem:
	mutex_unlock(&bc->lock);

	BUG_ON(btree_node_hashed(b));
	BUG_ON(btree_node_dirty(b));
	BUG_ON(btree_node_write_in_flight(b));
out:
	b->flags = 0;
	b->written = 0;
	b->nsets = 0;
	b->sib_u64s[0] = 0;
	b->sib_u64s[1] = 0;
	b->whiteout_u64s = 0;
	bch2_btree_keys_init(b);
	set_btree_node_accessed(b);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
			       start_time);

	memalloc_nofs_restore(flags);
	return b;
err:
	mutex_lock(&bc->lock);

	/* Try to cannibalize another cached btree node: */
	if (bc->alloc_lock == current) {
		b2 = btree_node_cannibalize(c);
		clear_btree_node_just_written(b2);
		bch2_btree_node_hash_remove(bc, b2);

		if (b) {
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);
			btree_node_to_freedlist(bc, b2);
			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
		} else {
			b = b2;
			list_del_init(&b->list);
		}

		mutex_unlock(&bc->lock);

		trace_and_count(c, btree_cache_cannibalize, c);
		goto out;
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);
	return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
}

/* Slowpath, don't want it inlined into btree_iter_traverse() */
static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
						   struct btree_path *path,
						   const struct bkey_i *k,
						   enum btree_id btree_id,
						   unsigned level,
						   enum six_lock_type lock_type,
						   bool sync)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	u32 seq;

	BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
	/*
	 * Parent node must be locked, else we could read in a btree node that's
	 * been freed:
	 */
	if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
		trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
	}

	b = bch2_btree_node_mem_alloc(trans, level != 0);

	if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
		trans->memory_allocation_failure = true;
		trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
	}

	if (IS_ERR(b))
		return b;

	/*
	 * Btree nodes read in from disk should not have the accessed bit set
	 * initially, so that linear scans don't thrash the cache:
	 */
	clear_btree_node_accessed(b);

	bkey_copy(&b->key, k);
	if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
		/* raced with another fill: */

		/* mark as unhashed... */
		b->hash_val = 0;

		mutex_lock(&bc->lock);
		list_add(&b->list, &bc->freeable);
		mutex_unlock(&bc->lock);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		return NULL;
	}

	set_btree_node_read_in_flight(b);

	six_unlock_write(&b->c.lock);
	seq = six_lock_seq(&b->c.lock);
	six_unlock_intent(&b->c.lock);

	/* Unlock before doing IO: */
	if (path && sync)
		bch2_trans_unlock_noassert(trans);

	bch2_btree_node_read(c, b, sync);

	if (!sync)
		return NULL;

	if (path) {
		int ret = bch2_trans_relock(trans) ?:
			bch2_btree_path_relock_intent(trans, path);
		if (ret) {
			BUG_ON(!trans->restarted);
			return ERR_PTR(ret);
		}
	}

	if (!six_relock_type(&b->c.lock, lock_type, seq)) {
		if (path)
			trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
	}

	return b;
}

static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
{
	struct printbuf buf = PRINTBUF;

	if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
		return;

	prt_printf(&buf,
		   "btree node header doesn't match ptr\n"
		   "btree %s level %u\n"
		   "ptr: ",
		   bch2_btree_ids[b->c.btree_id], b->c.level);
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	prt_printf(&buf, "\nheader: btree %s level %llu\n"
		   "min ",
		   bch2_btree_ids[BTREE_NODE_ID(b->data)],
		   BTREE_NODE_LEVEL(b->data));
	bch2_bpos_to_text(&buf, b->data->min_key);

	prt_printf(&buf, "\nmax ");
	bch2_bpos_to_text(&buf, b->data->max_key);

	bch2_fs_inconsistent(c, "%s", buf.buf);
	printbuf_exit(&buf);
}

static inline void btree_check_header(struct bch_fs *c, struct btree *b)
{
	if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
	    b->c.level != BTREE_NODE_LEVEL(b->data) ||
	    !bpos_eq(b->data->max_key, b->key.k.p) ||
	    (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
	     !bpos_eq(b->data->min_key,
		      bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
		btree_bad_header(c, b);
}

static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
					   const struct bkey_i *k, unsigned level,
					   enum six_lock_type lock_type,
					   unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	struct bset_tree *t;
	bool need_relock = false;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		/*
		 * We must have the parent locked to call bch2_btree_node_fill(),
		 * else we could read in a btree node from disk that's been
		 * freed:
		 */
		b = bch2_btree_node_fill(trans, path, k, path->btree_id,
					 level, lock_type, true);
		need_relock = true;

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b))
			return b;
	} else {
		if (btree_node_read_locked(path, level + 1))
			btree_node_unlock(trans, path, level + 1);

		ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			return ERR_PTR(ret);

		BUG_ON(ret);

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.level != level ||
			     race_fault())) {
			six_unlock_type(&b->c.lock, lock_type);
			if (bch2_btree_node_relock(trans, path, level + 1))
				goto retry;

			trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
			return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
		}

		/* avoid atomic set bit if it's not needed: */
		if (!btree_node_accessed(b))
			set_btree_node_accessed(b);
	}

	if (unlikely(btree_node_read_in_flight(b))) {
		u32 seq = six_lock_seq(&b->c.lock);

		six_unlock_type(&b->c.lock, lock_type);
		bch2_trans_unlock(trans);
		need_relock = true;

		bch2_btree_node_wait_on_read(b);

		/*
		 * should_be_locked is not set on this path yet, so we need to
		 * relock it specifically:
		 */
		if (!six_relock_type(&b->c.lock, lock_type, seq))
			goto retry;
	}

	if (unlikely(need_relock)) {
		ret = bch2_trans_relock(trans) ?:
			bch2_btree_path_relock_intent(trans, path);
		if (ret) {
			six_unlock_type(&b->c.lock, lock_type);
			return ERR_PTR(ret);
		}
	}

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-EIO);
	}

	EBUG_ON(b->c.btree_id != path->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}

/**
 * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * @trans: btree transaction object
 * @path: btree_path being traversed
 * @k: pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
 * @level: level of btree node being looked up (0 == leaf node)
 * @lock_type: SIX_LOCK_read or SIX_LOCK_intent
 * @trace_ip: ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
 *
 * The btree node will have either a read or an intent lock held, depending
 * on @lock_type.
 *
 * Returns: btree node or ERR_PTR()
 */
struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
				  const struct bkey_i *k, unsigned level,
				  enum six_lock_type lock_type,
				  unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree *b;
	struct bset_tree *t;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_node_mem_ptr(k);

	/*
	 * Check b->hash_val _before_ calling btree_node_lock() - this might not
	 * be the node we want anymore, and trying to lock the wrong node could
	 * cause an unnecessary transaction restart:
	 */
	if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
		     !b ||
		     b->hash_val != btree_ptr_hash_val(k)))
		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

	if (btree_node_read_locked(path, level + 1))
		btree_node_unlock(trans, path, level + 1);

	ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		return ERR_PTR(ret);

	BUG_ON(ret);

	if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
		     b->c.level != level ||
		     race_fault())) {
		six_unlock_type(&b->c.lock, lock_type);
		if (bch2_btree_node_relock(trans, path, level + 1))
			return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

		trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
	}

	if (unlikely(btree_node_read_in_flight(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
	}

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-EIO);
	}

	EBUG_ON(b->c.btree_id != path->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}

struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
					 const struct bkey_i *k,
					 enum btree_id btree_id,
					 unsigned level,
					 bool nofill)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	struct bset_tree *t;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	if (c->opts.btree_node_mem_ptr_optimization) {
		b = btree_node_mem_ptr(k);
		if (b)
			goto lock_node;
	}
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		if (nofill)
			goto out;

		b = bch2_btree_node_fill(trans, NULL, k, btree_id,
					 level, SIX_LOCK_read, true);

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b) &&
		    !bch2_btree_cache_cannibalize_lock(c, NULL))
			goto retry;

		if (IS_ERR(b))
			goto out;
	} else {
lock_node:
		ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			return ERR_PTR(ret);

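		/*
		 * btree_node_lock_nopath() should only fail with a transaction
		 * restart, which was handled above; anything else is a bug:
		 */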
		BUG_ON(ret);

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.btree_id != btree_id ||
			     b->c.level != level)) {
			six_unlock_read(&b->c.lock);
			goto retry;
		}
	}

	/* XXX: waiting on IO with btree locks held: */
	__bch2_btree_node_wait_on_read(b);

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_read(&b->c.lock);
		b = ERR_PTR(-EIO);
		goto out;
	}

	EBUG_ON(b->c.btree_id != btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);
out:
	bch2_btree_cache_cannibalize_unlock(c);
	return b;
}

int bch2_btree_node_prefetch(struct btree_trans *trans,
			     struct btree_path *path,
			     const struct bkey_i *k,
			     enum btree_id btree_id, unsigned level)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	BUG_ON(trans && !btree_node_locked(path, level + 1));
	BUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_cache_find(bc, k);
	if (b)
		return 0;

	b = bch2_btree_node_fill(trans, path, k, btree_id,
				 level, SIX_LOCK_read, false);
	return PTR_ERR_OR_ZERO(b);
}

void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	b = btree_cache_find(bc, k);
	if (!b)
		return;
wait_on_io:
	/* not allowed to wait on io with btree locks held: */

	/* XXX we're called from btree_gc which will be holding other btree
	 * nodes locked
	 */
	__bch2_btree_node_wait_on_read(b);
	__bch2_btree_node_wait_on_write(b);

	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);

	if (btree_node_dirty(b)) {
		__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}

	BUG_ON(btree_node_dirty(b));

	mutex_lock(&bc->lock);
	btree_node_data_free(c, b);
	bch2_btree_node_hash_remove(bc, b);
	mutex_unlock(&bc->lock);

	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
}

void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
			     const struct btree *b)
{
	struct bset_stats stats;

	memset(&stats, 0, sizeof(stats));

	bch2_btree_keys_stats(b, &stats);

	prt_printf(out, "l %u ", b->c.level);
	bch2_bpos_to_text(out, b->data->min_key);
	prt_printf(out, " - ");
	bch2_bpos_to_text(out, b->data->max_key);
	prt_printf(out, ":\n"
		   " ptrs: ");
	bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
	prt_newline(out);

	prt_printf(out,
		   " format: ");
	bch2_bkey_format_to_text(out, &b->format);

	prt_printf(out,
		   " unpack fn len: %u\n"
		   " bytes used %zu/%zu (%zu%% full)\n"
		   " sib u64s: %u, %u (merge threshold %u)\n"
		   " nr packed keys %u\n"
		   " nr unpacked keys %u\n"
		   " floats %zu\n"
%zu\n" 1183 " failed unpacked %zu\n", 1184 b->unpack_fn_len, 1185 b->nr.live_u64s * sizeof(u64), 1186 btree_bytes(c) - sizeof(struct btree_node), 1187 b->nr.live_u64s * 100 / btree_max_u64s(c), 1188 b->sib_u64s[0], 1189 b->sib_u64s[1], 1190 c->btree_foreground_merge_threshold, 1191 b->nr.packed_keys, 1192 b->nr.unpacked_keys, 1193 stats.floats, 1194 stats.failed); 1195 } 1196 1197 void bch2_btree_cache_to_text(struct printbuf *out, const struct bch_fs *c) 1198 { 1199 prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used); 1200 prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty)); 1201 prt_printf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock); 1202 } 1203