// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
#include "trace.h"

#include <linux/prefetch.h>
#include <linux/sched/mm.h>

const char * const bch2_btree_node_flags[] = {
#define x(f)	#f,
	BTREE_FLAGS()
#undef x
	NULL
};

void bch2_recalc_btree_reserve(struct bch_fs *c)
{
	unsigned i, reserve = 16;

	if (!c->btree_roots_known[0].b)
		reserve += 8;

	for (i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->b)
			reserve += min_t(unsigned, 1, r->b->c.level) * 8;
	}

	c->btree_cache.reserve = reserve;
}

static inline unsigned btree_cache_can_free(struct btree_cache *bc)
{
	return max_t(int, 0, bc->used - bc->reserve);
}
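
/*
 * Worked example of the reserve sizing above (illustrative numbers, not a
 * fixed policy): with the first btree root present and four roots that are
 * interior nodes (level >= 1), the reserve comes to 16 + 4 * 8 = 48 cached
 * nodes; a missing first root would add another 8 on top of that.
 */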

static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
{
	if (b->c.lock.readers)
		list_move(&b->list, &bc->freed_pcpu);
	else
		list_move(&b->list, &bc->freed_nonpcpu);
}

static void btree_node_data_free(struct bch_fs *c, struct btree *b)
{
	struct btree_cache *bc = &c->btree_cache;

	EBUG_ON(btree_node_write_in_flight(b));

	clear_btree_node_just_written(b);

	kvpfree(b->data, btree_bytes(c));
	b->data = NULL;
#ifdef __KERNEL__
	kvfree(b->aux_data);
#else
	munmap(b->aux_data, btree_aux_data_bytes(b));
#endif
	b->aux_data = NULL;

	bc->used--;

	btree_node_to_freedlist(bc, b);
}

static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
				   const void *obj)
{
	const struct btree *b = obj;
	const u64 *v = arg->key;

	return b->hash_val == *v ? 0 : 1;
}

static const struct rhashtable_params bch_btree_cache_params = {
	.head_offset	= offsetof(struct btree, hash),
	.key_offset	= offsetof(struct btree, hash_val),
	.key_len	= sizeof(u64),
	.obj_cmpfn	= bch2_btree_cache_cmp_fn,
};

static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
	BUG_ON(b->data || b->aux_data);

	b->data = kvpmalloc(btree_bytes(c), gfp);
	if (!b->data)
		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
#ifdef __KERNEL__
	b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
#else
	b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
			   PROT_READ|PROT_WRITE|PROT_EXEC,
			   MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
	if (b->aux_data == MAP_FAILED)
		b->aux_data = NULL;
#endif
	if (!b->aux_data) {
		kvpfree(b->data, btree_bytes(c));
		b->data = NULL;
		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
	}

	return 0;
}

static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
{
	struct btree *b;

	b = kzalloc(sizeof(struct btree), gfp);
	if (!b)
		return NULL;

	bkey_btree_ptr_init(&b->key);
	INIT_LIST_HEAD(&b->list);
	INIT_LIST_HEAD(&b->write_blocked);
	b->byte_order = ilog2(btree_bytes(c));
	return b;
}

struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	b = __btree_node_mem_alloc(c, GFP_KERNEL);
	if (!b)
		return NULL;

	if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
		kfree(b);
		return NULL;
	}

	bch2_btree_lock_init(&b->c, 0);

	bc->used++;
	list_add(&b->list, &bc->freeable);
	return b;
}

/* Btree in memory cache - hash table */

void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
	int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);

	BUG_ON(ret);

	/* Cause future lookups for this node to fail: */
	b->hash_val = 0;
}

int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(b->hash_val);
	b->hash_val = btree_ptr_hash_val(&b->key);

	return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
					     bch_btree_cache_params);
}

int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
				unsigned level, enum btree_id id)
{
	int ret;

	b->c.level	= level;
	b->c.btree_id	= id;

	mutex_lock(&bc->lock);
	ret = __bch2_btree_node_hash_insert(bc, b);
	if (!ret)
		list_add_tail(&b->list, &bc->live);
	mutex_unlock(&bc->lock);

	return ret;
}

__flatten
static inline struct btree *btree_cache_find(struct btree_cache *bc,
					     const struct bkey_i *k)
{
	u64 v = btree_ptr_hash_val(k);

	return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
}

/*
 * this version is for btree nodes that have already been freed (we're not
 * reaping a real btree node)
 */
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
{
	struct btree_cache *bc = &c->btree_cache;
	int ret = 0;

	lockdep_assert_held(&bc->lock);
wait_on_io:
	if (b->flags & ((1U << BTREE_NODE_dirty)|
			(1U << BTREE_NODE_read_in_flight)|
			(1U << BTREE_NODE_write_in_flight))) {
		if (!flush)
			return -BCH_ERR_ENOMEM_btree_node_reclaim;

		/* XXX: waiting on IO with btree cache lock held */
		bch2_btree_node_wait_on_read(b);
		bch2_btree_node_wait_on_write(b);
	}

	if (!six_trylock_intent(&b->c.lock))
		return -BCH_ERR_ENOMEM_btree_node_reclaim;

	if (!six_trylock_write(&b->c.lock))
		goto out_unlock_intent;

	/* recheck under lock */
	if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
			(1U << BTREE_NODE_write_in_flight))) {
		if (!flush)
			goto out_unlock;
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}

	if (btree_node_noevict(b) ||
	    btree_node_write_blocked(b) ||
	    btree_node_will_make_reachable(b))
		goto out_unlock;

	if (btree_node_dirty(b)) {
		if (!flush)
			goto out_unlock;
		/*
		 * Using the underscore version because we don't want to compact
		 * bsets after the write, since this node is about to be evicted
		 * - unless btree verify mode is enabled, since it runs out of
		 * the post write cleanup:
		 */
		if (bch2_verify_btree_ondisk)
			bch2_btree_node_write(c, b, SIX_LOCK_intent,
					      BTREE_WRITE_cache_reclaim);
		else
			__bch2_btree_node_write(c, b,
						BTREE_WRITE_cache_reclaim);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}
out:
	if (b->hash_val && !ret)
		trace_and_count(c, btree_cache_reap, c, b);
	return ret;
out_unlock:
	six_unlock_write(&b->c.lock);
out_unlock_intent:
	six_unlock_intent(&b->c.lock);
	ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
	goto out;
}

static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, false);
}

static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, true);
}

static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
					   struct shrink_control *sc)
{
	struct bch_fs *c = shrink->private_data;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b, *t;
	unsigned long nr = sc->nr_to_scan;
	unsigned long can_free = 0;
	unsigned long freed = 0;
	unsigned long touched = 0;
	unsigned i, flags;
	unsigned long ret = SHRINK_STOP;
	bool trigger_writes = atomic_read(&bc->dirty) + nr >=
		bc->used * 3 / 4;

	if (bch2_btree_shrinker_disabled)
		return SHRINK_STOP;

	mutex_lock(&bc->lock);
	flags = memalloc_nofs_save();

	/*
	 * It's _really_ critical that we don't free too many btree nodes - we
	 * have to always leave ourselves a reserve. The reserve is how we
	 * guarantee that allocating memory for a new btree node can always
	 * succeed, so that inserting keys into the btree can always succeed and
	 * IO can always make forward progress:
	 */
	can_free = btree_cache_can_free(bc);
	nr = min_t(unsigned long, nr, can_free);

	i = 0;
	list_for_each_entry_safe(b, t, &bc->freeable, list) {
		/*
		 * Leave a few nodes on the freeable list, so that a btree split
		 * won't have to hit the system allocator:
		 */
		if (++i <= 3)
			continue;

		touched++;

		if (touched >= nr)
			goto out;

		if (!btree_node_reclaim(c, b)) {
			btree_node_data_free(c, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);
			freed++;
		}
	}
restart:
	list_for_each_entry_safe(b, t, &bc->live, list) {
		touched++;

		if (btree_node_accessed(b)) {
			clear_btree_node_accessed(b);
		} else if (!btree_node_reclaim(c, b)) {
			freed++;
			btree_node_data_free(c, b);

			bch2_btree_node_hash_remove(bc, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);

			if (freed == nr)
				goto out_rotate;
		} else if (trigger_writes &&
			   btree_node_dirty(b) &&
			   !btree_node_will_make_reachable(b) &&
			   !btree_node_write_blocked(b) &&
			   six_trylock_read(&b->c.lock)) {
			list_move(&bc->live, &b->list);
			mutex_unlock(&bc->lock);
			__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
			six_unlock_read(&b->c.lock);
			if (touched >= nr)
				goto out_nounlock;
			mutex_lock(&bc->lock);
			goto restart;
		}

		if (touched >= nr)
			break;
	}
out_rotate:
	if (&t->list != &bc->live)
		list_move_tail(&bc->live, &t->list);
out:
	mutex_unlock(&bc->lock);
out_nounlock:
	ret = freed;
	memalloc_nofs_restore(flags);
	trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
	return ret;
}

static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
					    struct shrink_control *sc)
{
	struct bch_fs *c = shrink->private_data;
	struct btree_cache *bc = &c->btree_cache;

	if (bch2_btree_shrinker_disabled)
		return 0;

	return btree_cache_can_free(bc);
}

void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	unsigned i, flags;

	shrinker_free(bc->shrink);

	/* vfree() can allocate memory: */
	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

	if (c->verify_data)
		list_move(&c->verify_data->list, &bc->live);

	kvpfree(c->verify_ondisk, btree_bytes(c));

	for (i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->b)
			list_add(&r->b->list, &bc->live);
	}

	list_splice(&bc->freeable, &bc->live);

	while (!list_empty(&bc->live)) {
		b = list_first_entry(&bc->live, struct btree, list);

		BUG_ON(btree_node_read_in_flight(b) ||
		       btree_node_write_in_flight(b));

		btree_node_data_free(c, b);
	}

	BUG_ON(!bch2_journal_error(&c->journal) &&
	       atomic_read(&c->btree_cache.dirty));

	list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);

	while (!list_empty(&bc->freed_nonpcpu)) {
		b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
		list_del(&b->list);
		six_lock_exit(&b->c.lock);
		kfree(b);
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);

	if (bc->table_init_done)
		rhashtable_destroy(&bc->table);
}

int bch2_fs_btree_cache_init(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct shrinker *shrink;
	unsigned i;
	int ret = 0;

	ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
	if (ret)
		goto err;

	bc->table_init_done = true;

	bch2_recalc_btree_reserve(c);

	for (i = 0; i < bc->reserve; i++)
		if (!__bch2_btree_node_mem_alloc(c))
			goto err;

	list_splice_init(&bc->live, &bc->freeable);

	mutex_init(&c->verify_lock);

	shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
	if (!shrink)
		goto err;
	bc->shrink = shrink;
	shrink->count_objects	= bch2_btree_cache_count;
	shrink->scan_objects	= bch2_btree_cache_scan;
	shrink->seeks		= 4;
	shrink->private_data	= c;
	shrinker_register(shrink);

	return 0;
err:
	return -BCH_ERR_ENOMEM_fs_btree_cache_init;
}

void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
	mutex_init(&bc->lock);
	INIT_LIST_HEAD(&bc->live);
	INIT_LIST_HEAD(&bc->freeable);
	INIT_LIST_HEAD(&bc->freed_pcpu);
	INIT_LIST_HEAD(&bc->freed_nonpcpu);
}

/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which a
 * cannibalize_bucket() will take. This means every time we unlock the root of
 * the btree, we need to release this lock if we have it held.
 */
void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;

	if (bc->alloc_lock == current) {
		trace_and_count(c, btree_cache_cannibalize_unlock, c);
		bc->alloc_lock = NULL;
		closure_wake_up(&bc->alloc_wait);
	}
}

int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
{
	struct btree_cache *bc = &c->btree_cache;
	struct task_struct *old;

	old = cmpxchg(&bc->alloc_lock, NULL, current);
	if (old == NULL || old == current)
		goto success;

	if (!cl) {
		trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
		return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
	}

	closure_wait(&bc->alloc_wait, cl);

	/* Try again, after adding ourselves to waitlist */
	old = cmpxchg(&bc->alloc_lock, NULL, current);
	if (old == NULL || old == current) {
		/* We raced */
		closure_wake_up(&bc->alloc_wait);
		goto success;
	}

	trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
	return -BCH_ERR_btree_cache_cannibalize_lock_blocked;

success:
	trace_and_count(c, btree_cache_cannibalize_lock, c);
	return 0;
}
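
/*
 * Usage sketch for the cannibalize lock (illustrative only - a non-blocking
 * attempt; real callers typically pass a closure so they can wait for the
 * lock instead of failing):
 *
 *	if (bch2_btree_cache_cannibalize_lock(c, NULL))
 *		return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
 *
 *	b = bch2_btree_node_mem_alloc(trans, false);
 *	...
 *	bch2_btree_cache_cannibalize_unlock(c);
 */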

static struct btree *btree_node_cannibalize(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	list_for_each_entry_reverse(b, &bc->live, list)
		if (!btree_node_reclaim(c, b))
			return b;

	while (1) {
		list_for_each_entry_reverse(b, &bc->live, list)
			if (!btree_node_write_and_reclaim(c, b))
				return b;

		/*
		 * Rare case: all nodes were intent-locked.
		 * Just busy-wait.
		 */
		WARN_ONCE(1, "btree cache cannibalize failed\n");
		cond_resched();
	}
}

struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct list_head *freed = pcpu_read_locks
		? &bc->freed_pcpu
		: &bc->freed_nonpcpu;
	struct btree *b, *b2;
	u64 start_time = local_clock();
	unsigned flags;

	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

	/*
	 * We never free struct btree itself, just the memory that holds the on
	 * disk node. Check the freed list before allocating a new one:
	 */
	list_for_each_entry(b, freed, list)
		if (!btree_node_reclaim(c, b)) {
			list_del_init(&b->list);
			goto got_node;
		}

	b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
	if (!b) {
		mutex_unlock(&bc->lock);
		bch2_trans_unlock(trans);
		b = __btree_node_mem_alloc(c, GFP_KERNEL);
		if (!b)
			goto err;
		mutex_lock(&bc->lock);
	}

	bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);

	BUG_ON(!six_trylock_intent(&b->c.lock));
	BUG_ON(!six_trylock_write(&b->c.lock));
got_node:

	/*
	 * btree_free() doesn't free memory; it sticks the node on the end of
	 * the list. Check if there's any freed nodes there:
	 */
	list_for_each_entry(b2, &bc->freeable, list)
		if (!btree_node_reclaim(c, b2)) {
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);
			btree_node_to_freedlist(bc, b2);
			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
			goto got_mem;
		}

	mutex_unlock(&bc->lock);

	if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
		bch2_trans_unlock(trans);
		if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
			goto err;
	}

	mutex_lock(&bc->lock);
	bc->used++;
got_mem:
	mutex_unlock(&bc->lock);

	BUG_ON(btree_node_hashed(b));
	BUG_ON(btree_node_dirty(b));
	BUG_ON(btree_node_write_in_flight(b));
out:
	b->flags		= 0;
	b->written		= 0;
	b->nsets		= 0;
	b->sib_u64s[0]		= 0;
	b->sib_u64s[1]		= 0;
	b->whiteout_u64s	= 0;
	bch2_btree_keys_init(b);
	set_btree_node_accessed(b);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
			       start_time);

	memalloc_nofs_restore(flags);
	return b;
err:
	mutex_lock(&bc->lock);

	/* Try to cannibalize another cached btree node: */
	if (bc->alloc_lock == current) {
		b2 = btree_node_cannibalize(c);
		clear_btree_node_just_written(b2);
		bch2_btree_node_hash_remove(bc, b2);

		if (b) {
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);
			btree_node_to_freedlist(bc, b2);
			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
		} else {
			b = b2;
			list_del_init(&b->list);
		}

		mutex_unlock(&bc->lock);

		trace_and_count(c, btree_cache_cannibalize, c);
		goto out;
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);
	return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
}

/* Slowpath, don't want it inlined into btree_iter_traverse() */
static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
						   struct btree_path *path,
						   const struct bkey_i *k,
						   enum btree_id btree_id,
						   unsigned level,
						   enum six_lock_type lock_type,
						   bool sync)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	u32 seq;

	BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
	/*
	 * Parent node must be locked, else we could read in a btree node that's
	 * been freed:
	 */
	if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
		trace_and_count(c, trans_restart_relock_parent_for_fill,
				trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
	}

	b = bch2_btree_node_mem_alloc(trans, level != 0);

	if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
		trans->memory_allocation_failure = true;
		trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
	}

	if (IS_ERR(b))
		return b;

	/*
	 * Btree nodes read in from disk should not have the accessed bit set
	 * initially, so that linear scans don't thrash the cache:
	 */
	clear_btree_node_accessed(b);

	bkey_copy(&b->key, k);
	if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
		/* raced with another fill: */

		/* mark as unhashed... */
		b->hash_val = 0;

		mutex_lock(&bc->lock);
		list_add(&b->list, &bc->freeable);
		mutex_unlock(&bc->lock);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		return NULL;
	}

	set_btree_node_read_in_flight(b);

	six_unlock_write(&b->c.lock);
	seq = six_lock_seq(&b->c.lock);
	six_unlock_intent(&b->c.lock);

	/* Unlock before doing IO: */
	if (path && sync)
		bch2_trans_unlock_noassert(trans);

	bch2_btree_node_read(c, b, sync);

	if (!sync)
		return NULL;

	if (path) {
		int ret = bch2_trans_relock(trans) ?:
			bch2_btree_path_relock_intent(trans, path);
		if (ret) {
			BUG_ON(!trans->restarted);
			return ERR_PTR(ret);
		}
	}

	if (!six_relock_type(&b->c.lock, lock_type, seq)) {
		if (path)
			trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
	}

	return b;
}

static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
{
	struct printbuf buf = PRINTBUF;

	if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
		return;

	prt_printf(&buf,
		   "btree node header doesn't match ptr\n"
		   "btree %s level %u\n"
		   "ptr: ",
		   bch2_btree_id_str(b->c.btree_id), b->c.level);
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	prt_printf(&buf, "\nheader: btree %s level %llu\n"
		   "min ",
		   bch2_btree_id_str(BTREE_NODE_ID(b->data)),
		   BTREE_NODE_LEVEL(b->data));
	bch2_bpos_to_text(&buf, b->data->min_key);

	prt_printf(&buf, "\nmax ");
	bch2_bpos_to_text(&buf, b->data->max_key);

	bch2_fs_inconsistent(c, "%s", buf.buf);
	printbuf_exit(&buf);
}

static inline void btree_check_header(struct bch_fs *c, struct btree *b)
{
	if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
	    b->c.level != BTREE_NODE_LEVEL(b->data) ||
	    !bpos_eq(b->data->max_key, b->key.k.p) ||
	    (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
	     !bpos_eq(b->data->min_key,
		      bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
		btree_bad_header(c, b);
}

static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
					   const struct bkey_i *k, unsigned level,
					   enum six_lock_type lock_type,
					   unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	struct bset_tree *t;
	bool need_relock = false;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		/*
		 * We must have the parent locked to call bch2_btree_node_fill(),
		 * else we could read in a btree node from disk that's been
		 * freed:
		 */
		b = bch2_btree_node_fill(trans, path, k, path->btree_id,
					 level, lock_type, true);
		need_relock = true;

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b))
			return b;
	} else {
		if (btree_node_read_locked(path, level + 1))
			btree_node_unlock(trans, path, level + 1);

		ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			return ERR_PTR(ret);

		BUG_ON(ret);

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.level != level ||
			     race_fault())) {
			six_unlock_type(&b->c.lock, lock_type);
			if (bch2_btree_node_relock(trans, path, level + 1))
				goto retry;

			trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
			return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
		}

		/* avoid atomic set bit if it's not needed: */
		if (!btree_node_accessed(b))
			set_btree_node_accessed(b);
	}

	if (unlikely(btree_node_read_in_flight(b))) {
		u32 seq = six_lock_seq(&b->c.lock);

		six_unlock_type(&b->c.lock, lock_type);
		bch2_trans_unlock(trans);
		need_relock = true;

		bch2_btree_node_wait_on_read(b);

		/*
		 * should_be_locked is not set on this path yet, so we need to
		 * relock it specifically:
		 */
		if (!six_relock_type(&b->c.lock, lock_type, seq))
			goto retry;
	}

	if (unlikely(need_relock)) {
		ret = bch2_trans_relock(trans) ?:
			bch2_btree_path_relock_intent(trans, path);
		if (ret) {
			six_unlock_type(&b->c.lock, lock_type);
			return ERR_PTR(ret);
		}
	}

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-EIO);
	}

	EBUG_ON(b->c.btree_id != path->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}

/**
 * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * @trans:	btree transaction object
 * @path:	btree_path being traversed
 * @k:		pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
 * @level:	level of btree node being looked up (0 == leaf node)
 * @lock_type:	SIX_LOCK_read or SIX_LOCK_intent
 * @trace_ip:	ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
 *
 * The btree node will have either a read or an intent lock held, depending on
 * the @lock_type parameter.
 *
 * Returns: btree node or ERR_PTR()
 */
struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
				  const struct bkey_i *k, unsigned level,
				  enum six_lock_type lock_type,
				  unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree *b;
	struct bset_tree *t;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_node_mem_ptr(k);

	/*
	 * Check b->hash_val _before_ calling btree_node_lock() - this might not
	 * be the node we want anymore, and trying to lock the wrong node could
	 * cause an unnecessary transaction restart:
	 */
	if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
		     !b ||
		     b->hash_val != btree_ptr_hash_val(k)))
		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

	if (btree_node_read_locked(path, level + 1))
		btree_node_unlock(trans, path, level + 1);

	ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		return ERR_PTR(ret);

	BUG_ON(ret);

	if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
		     b->c.level != level ||
		     race_fault())) {
		six_unlock_type(&b->c.lock, lock_type);
		if (bch2_btree_node_relock(trans, path, level + 1))
			return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

		trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
	}

	if (unlikely(btree_node_read_in_flight(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
	}

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-EIO);
	}

	EBUG_ON(b->c.btree_id != path->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}
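
/*
 * Usage sketch (illustrative only; the btree_path traversal code is the
 * typical caller): with the parent node locked, look up and lock the child
 * that @k points to, treating any error as a possible transaction restart:
 *
 *	struct btree *b = bch2_btree_node_get(trans, path, k, level,
 *					      SIX_LOCK_read, _THIS_IP_);
 *	if (IS_ERR(b))
 *		return PTR_ERR(b);
 *	...
 *	six_unlock_read(&b->c.lock);
 */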

struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
					 const struct bkey_i *k,
					 enum btree_id btree_id,
					 unsigned level,
					 bool nofill)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	struct bset_tree *t;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	if (c->opts.btree_node_mem_ptr_optimization) {
		b = btree_node_mem_ptr(k);
		if (b)
			goto lock_node;
	}
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		if (nofill)
			goto out;

		b = bch2_btree_node_fill(trans, NULL, k, btree_id,
					 level, SIX_LOCK_read, true);

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b) &&
		    !bch2_btree_cache_cannibalize_lock(c, NULL))
			goto retry;

		if (IS_ERR(b))
			goto out;
	} else {
lock_node:
		ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			return ERR_PTR(ret);

		BUG_ON(ret);

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.btree_id != btree_id ||
			     b->c.level != level)) {
			six_unlock_read(&b->c.lock);
			goto retry;
		}
	}

	/* XXX: waiting on IO with btree locks held: */
	__bch2_btree_node_wait_on_read(b);

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_read(&b->c.lock);
		b = ERR_PTR(-EIO);
		goto out;
	}

	EBUG_ON(b->c.btree_id != btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);
out:
	bch2_btree_cache_cannibalize_unlock(c);
	return b;
}

int bch2_btree_node_prefetch(struct btree_trans *trans,
			     struct btree_path *path,
			     const struct bkey_i *k,
			     enum btree_id btree_id, unsigned level)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	BUG_ON(trans && !btree_node_locked(path, level + 1));
	BUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_cache_find(bc, k);
	if (b)
		return 0;

	b = bch2_btree_node_fill(trans, path, k, btree_id,
				 level, SIX_LOCK_read, false);
	return PTR_ERR_OR_ZERO(b);
}

void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	b = btree_cache_find(bc, k);
	if (!b)
		return;
wait_on_io:
	/* not allowed to wait on io with btree locks held: */

	/* XXX we're called from btree_gc which will be holding other btree
	 * nodes locked
	 */
	__bch2_btree_node_wait_on_read(b);
	__bch2_btree_node_wait_on_write(b);

	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);

	if (btree_node_dirty(b)) {
		__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}

	BUG_ON(btree_node_dirty(b));

	mutex_lock(&bc->lock);
	btree_node_data_free(c, b);
	bch2_btree_node_hash_remove(bc, b);
	mutex_unlock(&bc->lock);

	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
}

const char *bch2_btree_id_str(enum btree_id btree)
{
	return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)";
}

void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
	prt_printf(out, "%s level %u/%u\n  ",
		   bch2_btree_id_str(b->c.btree_id),
		   b->c.level,
		   bch2_btree_id_root(c, b->c.btree_id)->level);
	bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
}

void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
	struct bset_stats stats;

	memset(&stats, 0, sizeof(stats));

	bch2_btree_keys_stats(b, &stats);

	prt_printf(out, "l %u ", b->c.level);
	bch2_bpos_to_text(out, b->data->min_key);
	prt_printf(out, " - ");
	bch2_bpos_to_text(out, b->data->max_key);
	prt_printf(out, ":\n"
		   "    ptrs: ");
	bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
	prt_newline(out);

	prt_printf(out,
		   "    format: ");
	bch2_bkey_format_to_text(out, &b->format);

	prt_printf(out,
		   "    unpack fn len: %u\n"
		   "    bytes used %zu/%zu (%zu%% full)\n"
		   "    sib u64s: %u, %u (merge threshold %u)\n"
		   "    nr packed keys %u\n"
		   "    nr unpacked keys %u\n"
		   "    floats %zu\n"
		   "    failed unpacked %zu\n",
		   b->unpack_fn_len,
		   b->nr.live_u64s * sizeof(u64),
		   btree_bytes(c) - sizeof(struct btree_node),
		   b->nr.live_u64s * 100 / btree_max_u64s(c),
		   b->sib_u64s[0],
		   b->sib_u64s[1],
		   c->btree_foreground_merge_threshold,
		   b->nr.packed_keys,
		   b->nr.unpacked_keys,
		   stats.floats,
		   stats.failed);
}

void bch2_btree_cache_to_text(struct printbuf *out, const struct bch_fs *c)
{
	prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used);
	prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty));
	prt_printf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock);
}