1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "bcachefs.h" 4 #include "bkey_buf.h" 5 #include "btree_cache.h" 6 #include "btree_io.h" 7 #include "btree_iter.h" 8 #include "btree_locking.h" 9 #include "debug.h" 10 #include "errcode.h" 11 #include "error.h" 12 #include "trace.h" 13 14 #include <linux/prefetch.h> 15 #include <linux/sched/mm.h> 16 17 const char * const bch2_btree_node_flags[] = { 18 #define x(f) #f, 19 BTREE_FLAGS() 20 #undef x 21 NULL 22 }; 23 24 void bch2_recalc_btree_reserve(struct bch_fs *c) 25 { 26 unsigned i, reserve = 16; 27 28 if (!c->btree_roots_known[0].b) 29 reserve += 8; 30 31 for (i = 0; i < btree_id_nr_alive(c); i++) { 32 struct btree_root *r = bch2_btree_id_root(c, i); 33 34 if (r->b) 35 reserve += min_t(unsigned, 1, r->b->c.level) * 8; 36 } 37 38 c->btree_cache.reserve = reserve; 39 } 40 41 static inline unsigned btree_cache_can_free(struct btree_cache *bc) 42 { 43 return max_t(int, 0, bc->used - bc->reserve); 44 } 45 46 static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b) 47 { 48 if (b->c.lock.readers) 49 list_move(&b->list, &bc->freed_pcpu); 50 else 51 list_move(&b->list, &bc->freed_nonpcpu); 52 } 53 54 static void btree_node_data_free(struct bch_fs *c, struct btree *b) 55 { 56 struct btree_cache *bc = &c->btree_cache; 57 58 EBUG_ON(btree_node_write_in_flight(b)); 59 60 clear_btree_node_just_written(b); 61 62 kvpfree(b->data, btree_bytes(c)); 63 b->data = NULL; 64 #ifdef __KERNEL__ 65 kvfree(b->aux_data); 66 #else 67 munmap(b->aux_data, btree_aux_data_bytes(b)); 68 #endif 69 b->aux_data = NULL; 70 71 bc->used--; 72 73 btree_node_to_freedlist(bc, b); 74 } 75 76 static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, 77 const void *obj) 78 { 79 const struct btree *b = obj; 80 const u64 *v = arg->key; 81 82 return b->hash_val == *v ? 0 : 1; 83 } 84 85 static const struct rhashtable_params bch_btree_cache_params = { 86 .head_offset = offsetof(struct btree, hash), 87 .key_offset = offsetof(struct btree, hash_val), 88 .key_len = sizeof(u64), 89 .obj_cmpfn = bch2_btree_cache_cmp_fn, 90 }; 91 92 static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) 93 { 94 BUG_ON(b->data || b->aux_data); 95 96 b->data = kvpmalloc(btree_bytes(c), gfp); 97 if (!b->data) 98 return -BCH_ERR_ENOMEM_btree_node_mem_alloc; 99 #ifdef __KERNEL__ 100 b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp); 101 #else 102 b->aux_data = mmap(NULL, btree_aux_data_bytes(b), 103 PROT_READ|PROT_WRITE|PROT_EXEC, 104 MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); 105 if (b->aux_data == MAP_FAILED) 106 b->aux_data = NULL; 107 #endif 108 if (!b->aux_data) { 109 kvpfree(b->data, btree_bytes(c)); 110 b->data = NULL; 111 return -BCH_ERR_ENOMEM_btree_node_mem_alloc; 112 } 113 114 return 0; 115 } 116 117 static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) 118 { 119 struct btree *b; 120 121 b = kzalloc(sizeof(struct btree), gfp); 122 if (!b) 123 return NULL; 124 125 bkey_btree_ptr_init(&b->key); 126 INIT_LIST_HEAD(&b->list); 127 INIT_LIST_HEAD(&b->write_blocked); 128 b->byte_order = ilog2(btree_bytes(c)); 129 return b; 130 } 131 132 struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) 133 { 134 struct btree_cache *bc = &c->btree_cache; 135 struct btree *b; 136 137 b = __btree_node_mem_alloc(c, GFP_KERNEL); 138 if (!b) 139 return NULL; 140 141 if (btree_node_data_alloc(c, b, GFP_KERNEL)) { 142 kfree(b); 143 return NULL; 144 } 145 146 bch2_btree_lock_init(&b->c, 0); 147 148 bc->used++; 149 list_add(&b->list, &bc->freeable); 150 return b; 151 } 152 153 /* Btree in memory cache - hash table */ 154 155 void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) 156 { 157 int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); 158 159 BUG_ON(ret); 160 161 /* Cause future lookups for this node to fail: */ 162 b->hash_val = 0; 163 } 164 165 int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) 166 { 167 BUG_ON(b->hash_val); 168 b->hash_val = btree_ptr_hash_val(&b->key); 169 170 return rhashtable_lookup_insert_fast(&bc->table, &b->hash, 171 bch_btree_cache_params); 172 } 173 174 int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, 175 unsigned level, enum btree_id id) 176 { 177 int ret; 178 179 b->c.level = level; 180 b->c.btree_id = id; 181 182 mutex_lock(&bc->lock); 183 ret = __bch2_btree_node_hash_insert(bc, b); 184 if (!ret) 185 list_add_tail(&b->list, &bc->live); 186 mutex_unlock(&bc->lock); 187 188 return ret; 189 } 190 191 __flatten 192 static inline struct btree *btree_cache_find(struct btree_cache *bc, 193 const struct bkey_i *k) 194 { 195 u64 v = btree_ptr_hash_val(k); 196 197 return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params); 198 } 199 200 /* 201 * this version is for btree nodes that have already been freed (we're not 202 * reaping a real btree node) 203 */ 204 static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) 205 { 206 struct btree_cache *bc = &c->btree_cache; 207 int ret = 0; 208 209 lockdep_assert_held(&bc->lock); 210 wait_on_io: 211 if (b->flags & ((1U << BTREE_NODE_dirty)| 212 (1U << BTREE_NODE_read_in_flight)| 213 (1U << BTREE_NODE_write_in_flight))) { 214 if (!flush) 215 return -BCH_ERR_ENOMEM_btree_node_reclaim; 216 217 /* XXX: waiting on IO with btree cache lock held */ 218 bch2_btree_node_wait_on_read(b); 219 bch2_btree_node_wait_on_write(b); 220 } 221 222 if (!six_trylock_intent(&b->c.lock)) 223 return -BCH_ERR_ENOMEM_btree_node_reclaim; 224 225 if (!six_trylock_write(&b->c.lock)) 226 goto out_unlock_intent; 227 228 /* recheck under lock */ 229 if (b->flags & ((1U << BTREE_NODE_read_in_flight)| 230 (1U << BTREE_NODE_write_in_flight))) { 231 if (!flush) 232 goto out_unlock; 233 six_unlock_write(&b->c.lock); 234 six_unlock_intent(&b->c.lock); 235 goto wait_on_io; 236 } 237 238 if (btree_node_noevict(b) || 239 btree_node_write_blocked(b) || 240 btree_node_will_make_reachable(b)) 241 goto out_unlock; 242 243 if (btree_node_dirty(b)) { 244 if (!flush) 245 goto out_unlock; 246 /* 247 * Using the underscore version because we don't want to compact 248 * bsets after the write, since this node is about to be evicted 249 * - unless btree verify mode is enabled, since it runs out of 250 * the post write cleanup: 251 */ 252 if (bch2_verify_btree_ondisk) 253 bch2_btree_node_write(c, b, SIX_LOCK_intent, 254 BTREE_WRITE_cache_reclaim); 255 else 256 __bch2_btree_node_write(c, b, 257 BTREE_WRITE_cache_reclaim); 258 259 six_unlock_write(&b->c.lock); 260 six_unlock_intent(&b->c.lock); 261 goto wait_on_io; 262 } 263 out: 264 if (b->hash_val && !ret) 265 trace_and_count(c, btree_cache_reap, c, b); 266 return ret; 267 out_unlock: 268 six_unlock_write(&b->c.lock); 269 out_unlock_intent: 270 six_unlock_intent(&b->c.lock); 271 ret = -BCH_ERR_ENOMEM_btree_node_reclaim; 272 goto out; 273 } 274 275 static int btree_node_reclaim(struct bch_fs *c, struct btree *b) 276 { 277 return __btree_node_reclaim(c, b, false); 278 } 279 280 static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) 281 { 282 return __btree_node_reclaim(c, b, true); 283 } 284 285 static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, 286 struct shrink_control *sc) 287 { 288 struct bch_fs *c = shrink->private_data; 289 struct btree_cache *bc = &c->btree_cache; 290 struct btree *b, *t; 291 unsigned long nr = sc->nr_to_scan; 292 unsigned long can_free = 0; 293 unsigned long freed = 0; 294 unsigned long touched = 0; 295 unsigned i, flags; 296 unsigned long ret = SHRINK_STOP; 297 bool trigger_writes = atomic_read(&bc->dirty) + nr >= 298 bc->used * 3 / 4; 299 300 if (bch2_btree_shrinker_disabled) 301 return SHRINK_STOP; 302 303 mutex_lock(&bc->lock); 304 flags = memalloc_nofs_save(); 305 306 /* 307 * It's _really_ critical that we don't free too many btree nodes - we 308 * have to always leave ourselves a reserve. The reserve is how we 309 * guarantee that allocating memory for a new btree node can always 310 * succeed, so that inserting keys into the btree can always succeed and 311 * IO can always make forward progress: 312 */ 313 can_free = btree_cache_can_free(bc); 314 nr = min_t(unsigned long, nr, can_free); 315 316 i = 0; 317 list_for_each_entry_safe(b, t, &bc->freeable, list) { 318 /* 319 * Leave a few nodes on the freeable list, so that a btree split 320 * won't have to hit the system allocator: 321 */ 322 if (++i <= 3) 323 continue; 324 325 touched++; 326 327 if (touched >= nr) 328 goto out; 329 330 if (!btree_node_reclaim(c, b)) { 331 btree_node_data_free(c, b); 332 six_unlock_write(&b->c.lock); 333 six_unlock_intent(&b->c.lock); 334 freed++; 335 } 336 } 337 restart: 338 list_for_each_entry_safe(b, t, &bc->live, list) { 339 touched++; 340 341 if (btree_node_accessed(b)) { 342 clear_btree_node_accessed(b); 343 } else if (!btree_node_reclaim(c, b)) { 344 freed++; 345 btree_node_data_free(c, b); 346 347 bch2_btree_node_hash_remove(bc, b); 348 six_unlock_write(&b->c.lock); 349 six_unlock_intent(&b->c.lock); 350 351 if (freed == nr) 352 goto out_rotate; 353 } else if (trigger_writes && 354 btree_node_dirty(b) && 355 !btree_node_will_make_reachable(b) && 356 !btree_node_write_blocked(b) && 357 six_trylock_read(&b->c.lock)) { 358 list_move(&bc->live, &b->list); 359 mutex_unlock(&bc->lock); 360 __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); 361 six_unlock_read(&b->c.lock); 362 if (touched >= nr) 363 goto out_nounlock; 364 mutex_lock(&bc->lock); 365 goto restart; 366 } 367 368 if (touched >= nr) 369 break; 370 } 371 out_rotate: 372 if (&t->list != &bc->live) 373 list_move_tail(&bc->live, &t->list); 374 out: 375 mutex_unlock(&bc->lock); 376 out_nounlock: 377 ret = freed; 378 memalloc_nofs_restore(flags); 379 trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret); 380 return ret; 381 } 382 383 static unsigned long bch2_btree_cache_count(struct shrinker *shrink, 384 struct shrink_control *sc) 385 { 386 struct bch_fs *c = shrink->private_data; 387 struct btree_cache *bc = &c->btree_cache; 388 389 if (bch2_btree_shrinker_disabled) 390 return 0; 391 392 return btree_cache_can_free(bc); 393 } 394 395 void bch2_fs_btree_cache_exit(struct bch_fs *c) 396 { 397 struct btree_cache *bc = &c->btree_cache; 398 struct btree *b; 399 unsigned i, flags; 400 401 shrinker_free(bc->shrink); 402 403 /* vfree() can allocate memory: */ 404 flags = memalloc_nofs_save(); 405 mutex_lock(&bc->lock); 406 407 if (c->verify_data) 408 list_move(&c->verify_data->list, &bc->live); 409 410 kvpfree(c->verify_ondisk, btree_bytes(c)); 411 412 for (i = 0; i < btree_id_nr_alive(c); i++) { 413 struct btree_root *r = bch2_btree_id_root(c, i); 414 415 if (r->b) 416 list_add(&r->b->list, &bc->live); 417 } 418 419 list_splice(&bc->freeable, &bc->live); 420 421 while (!list_empty(&bc->live)) { 422 b = list_first_entry(&bc->live, struct btree, list); 423 424 BUG_ON(btree_node_read_in_flight(b) || 425 btree_node_write_in_flight(b)); 426 427 if (btree_node_dirty(b)) 428 bch2_btree_complete_write(c, b, btree_current_write(b)); 429 clear_btree_node_dirty_acct(c, b); 430 431 btree_node_data_free(c, b); 432 } 433 434 BUG_ON(atomic_read(&c->btree_cache.dirty)); 435 436 list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu); 437 438 while (!list_empty(&bc->freed_nonpcpu)) { 439 b = list_first_entry(&bc->freed_nonpcpu, struct btree, list); 440 list_del(&b->list); 441 six_lock_exit(&b->c.lock); 442 kfree(b); 443 } 444 445 mutex_unlock(&bc->lock); 446 memalloc_nofs_restore(flags); 447 448 if (bc->table_init_done) 449 rhashtable_destroy(&bc->table); 450 } 451 452 int bch2_fs_btree_cache_init(struct bch_fs *c) 453 { 454 struct btree_cache *bc = &c->btree_cache; 455 struct shrinker *shrink; 456 unsigned i; 457 int ret = 0; 458 459 ret = rhashtable_init(&bc->table, &bch_btree_cache_params); 460 if (ret) 461 goto err; 462 463 bc->table_init_done = true; 464 465 bch2_recalc_btree_reserve(c); 466 467 for (i = 0; i < bc->reserve; i++) 468 if (!__bch2_btree_node_mem_alloc(c)) 469 goto err; 470 471 list_splice_init(&bc->live, &bc->freeable); 472 473 mutex_init(&c->verify_lock); 474 475 shrink = shrinker_alloc(0, "%s-btree_cache", c->name); 476 if (!shrink) 477 goto err; 478 bc->shrink = shrink; 479 shrink->count_objects = bch2_btree_cache_count; 480 shrink->scan_objects = bch2_btree_cache_scan; 481 shrink->seeks = 4; 482 shrink->private_data = c; 483 shrinker_register(shrink); 484 485 return 0; 486 err: 487 return -BCH_ERR_ENOMEM_fs_btree_cache_init; 488 } 489 490 void bch2_fs_btree_cache_init_early(struct btree_cache *bc) 491 { 492 mutex_init(&bc->lock); 493 INIT_LIST_HEAD(&bc->live); 494 INIT_LIST_HEAD(&bc->freeable); 495 INIT_LIST_HEAD(&bc->freed_pcpu); 496 INIT_LIST_HEAD(&bc->freed_nonpcpu); 497 } 498 499 /* 500 * We can only have one thread cannibalizing other cached btree nodes at a time, 501 * or we'll deadlock. We use an open coded mutex to ensure that, which a 502 * cannibalize_bucket() will take. This means every time we unlock the root of 503 * the btree, we need to release this lock if we have it held. 504 */ 505 void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c) 506 { 507 struct btree_cache *bc = &c->btree_cache; 508 509 if (bc->alloc_lock == current) { 510 trace_and_count(c, btree_cache_cannibalize_unlock, c); 511 bc->alloc_lock = NULL; 512 closure_wake_up(&bc->alloc_wait); 513 } 514 } 515 516 int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) 517 { 518 struct btree_cache *bc = &c->btree_cache; 519 struct task_struct *old; 520 521 old = cmpxchg(&bc->alloc_lock, NULL, current); 522 if (old == NULL || old == current) 523 goto success; 524 525 if (!cl) { 526 trace_and_count(c, btree_cache_cannibalize_lock_fail, c); 527 return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock; 528 } 529 530 closure_wait(&bc->alloc_wait, cl); 531 532 /* Try again, after adding ourselves to waitlist */ 533 old = cmpxchg(&bc->alloc_lock, NULL, current); 534 if (old == NULL || old == current) { 535 /* We raced */ 536 closure_wake_up(&bc->alloc_wait); 537 goto success; 538 } 539 540 trace_and_count(c, btree_cache_cannibalize_lock_fail, c); 541 return -BCH_ERR_btree_cache_cannibalize_lock_blocked; 542 543 success: 544 trace_and_count(c, btree_cache_cannibalize_lock, c); 545 return 0; 546 } 547 548 static struct btree *btree_node_cannibalize(struct bch_fs *c) 549 { 550 struct btree_cache *bc = &c->btree_cache; 551 struct btree *b; 552 553 list_for_each_entry_reverse(b, &bc->live, list) 554 if (!btree_node_reclaim(c, b)) 555 return b; 556 557 while (1) { 558 list_for_each_entry_reverse(b, &bc->live, list) 559 if (!btree_node_write_and_reclaim(c, b)) 560 return b; 561 562 /* 563 * Rare case: all nodes were intent-locked. 564 * Just busy-wait. 565 */ 566 WARN_ONCE(1, "btree cache cannibalize failed\n"); 567 cond_resched(); 568 } 569 } 570 571 struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks) 572 { 573 struct bch_fs *c = trans->c; 574 struct btree_cache *bc = &c->btree_cache; 575 struct list_head *freed = pcpu_read_locks 576 ? &bc->freed_pcpu 577 : &bc->freed_nonpcpu; 578 struct btree *b, *b2; 579 u64 start_time = local_clock(); 580 unsigned flags; 581 582 flags = memalloc_nofs_save(); 583 mutex_lock(&bc->lock); 584 585 /* 586 * We never free struct btree itself, just the memory that holds the on 587 * disk node. Check the freed list before allocating a new one: 588 */ 589 list_for_each_entry(b, freed, list) 590 if (!btree_node_reclaim(c, b)) { 591 list_del_init(&b->list); 592 goto got_node; 593 } 594 595 b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN); 596 if (!b) { 597 mutex_unlock(&bc->lock); 598 bch2_trans_unlock(trans); 599 b = __btree_node_mem_alloc(c, GFP_KERNEL); 600 if (!b) 601 goto err; 602 mutex_lock(&bc->lock); 603 } 604 605 bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0); 606 607 BUG_ON(!six_trylock_intent(&b->c.lock)); 608 BUG_ON(!six_trylock_write(&b->c.lock)); 609 got_node: 610 611 /* 612 * btree_free() doesn't free memory; it sticks the node on the end of 613 * the list. Check if there's any freed nodes there: 614 */ 615 list_for_each_entry(b2, &bc->freeable, list) 616 if (!btree_node_reclaim(c, b2)) { 617 swap(b->data, b2->data); 618 swap(b->aux_data, b2->aux_data); 619 btree_node_to_freedlist(bc, b2); 620 six_unlock_write(&b2->c.lock); 621 six_unlock_intent(&b2->c.lock); 622 goto got_mem; 623 } 624 625 mutex_unlock(&bc->lock); 626 627 if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) { 628 bch2_trans_unlock(trans); 629 if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN)) 630 goto err; 631 } 632 633 mutex_lock(&bc->lock); 634 bc->used++; 635 got_mem: 636 mutex_unlock(&bc->lock); 637 638 BUG_ON(btree_node_hashed(b)); 639 BUG_ON(btree_node_dirty(b)); 640 BUG_ON(btree_node_write_in_flight(b)); 641 out: 642 b->flags = 0; 643 b->written = 0; 644 b->nsets = 0; 645 b->sib_u64s[0] = 0; 646 b->sib_u64s[1] = 0; 647 b->whiteout_u64s = 0; 648 bch2_btree_keys_init(b); 649 set_btree_node_accessed(b); 650 651 bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], 652 start_time); 653 654 memalloc_nofs_restore(flags); 655 return b; 656 err: 657 mutex_lock(&bc->lock); 658 659 /* Try to cannibalize another cached btree node: */ 660 if (bc->alloc_lock == current) { 661 b2 = btree_node_cannibalize(c); 662 clear_btree_node_just_written(b2); 663 bch2_btree_node_hash_remove(bc, b2); 664 665 if (b) { 666 swap(b->data, b2->data); 667 swap(b->aux_data, b2->aux_data); 668 btree_node_to_freedlist(bc, b2); 669 six_unlock_write(&b2->c.lock); 670 six_unlock_intent(&b2->c.lock); 671 } else { 672 b = b2; 673 list_del_init(&b->list); 674 } 675 676 mutex_unlock(&bc->lock); 677 678 trace_and_count(c, btree_cache_cannibalize, c); 679 goto out; 680 } 681 682 mutex_unlock(&bc->lock); 683 memalloc_nofs_restore(flags); 684 return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc); 685 } 686 687 /* Slowpath, don't want it inlined into btree_iter_traverse() */ 688 static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, 689 struct btree_path *path, 690 const struct bkey_i *k, 691 enum btree_id btree_id, 692 unsigned level, 693 enum six_lock_type lock_type, 694 bool sync) 695 { 696 struct bch_fs *c = trans->c; 697 struct btree_cache *bc = &c->btree_cache; 698 struct btree *b; 699 u32 seq; 700 701 BUG_ON(level + 1 >= BTREE_MAX_DEPTH); 702 /* 703 * Parent node must be locked, else we could read in a btree node that's 704 * been freed: 705 */ 706 if (path && !bch2_btree_node_relock(trans, path, level + 1)) { 707 trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path); 708 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock)); 709 } 710 711 b = bch2_btree_node_mem_alloc(trans, level != 0); 712 713 if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) { 714 trans->memory_allocation_failure = true; 715 trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path); 716 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail)); 717 } 718 719 if (IS_ERR(b)) 720 return b; 721 722 /* 723 * Btree nodes read in from disk should not have the accessed bit set 724 * initially, so that linear scans don't thrash the cache: 725 */ 726 clear_btree_node_accessed(b); 727 728 bkey_copy(&b->key, k); 729 if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) { 730 /* raced with another fill: */ 731 732 /* mark as unhashed... */ 733 b->hash_val = 0; 734 735 mutex_lock(&bc->lock); 736 list_add(&b->list, &bc->freeable); 737 mutex_unlock(&bc->lock); 738 739 six_unlock_write(&b->c.lock); 740 six_unlock_intent(&b->c.lock); 741 return NULL; 742 } 743 744 set_btree_node_read_in_flight(b); 745 746 six_unlock_write(&b->c.lock); 747 seq = six_lock_seq(&b->c.lock); 748 six_unlock_intent(&b->c.lock); 749 750 /* Unlock before doing IO: */ 751 if (path && sync) 752 bch2_trans_unlock_noassert(trans); 753 754 bch2_btree_node_read(c, b, sync); 755 756 if (!sync) 757 return NULL; 758 759 if (path) { 760 int ret = bch2_trans_relock(trans) ?: 761 bch2_btree_path_relock_intent(trans, path); 762 if (ret) { 763 BUG_ON(!trans->restarted); 764 return ERR_PTR(ret); 765 } 766 } 767 768 if (!six_relock_type(&b->c.lock, lock_type, seq)) { 769 if (path) 770 trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path); 771 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill)); 772 } 773 774 return b; 775 } 776 777 static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) 778 { 779 struct printbuf buf = PRINTBUF; 780 781 if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) 782 return; 783 784 prt_printf(&buf, 785 "btree node header doesn't match ptr\n" 786 "btree %s level %u\n" 787 "ptr: ", 788 bch2_btree_id_str(b->c.btree_id), b->c.level); 789 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); 790 791 prt_printf(&buf, "\nheader: btree %s level %llu\n" 792 "min ", 793 bch2_btree_id_str(BTREE_NODE_ID(b->data)), 794 BTREE_NODE_LEVEL(b->data)); 795 bch2_bpos_to_text(&buf, b->data->min_key); 796 797 prt_printf(&buf, "\nmax "); 798 bch2_bpos_to_text(&buf, b->data->max_key); 799 800 bch2_fs_inconsistent(c, "%s", buf.buf); 801 printbuf_exit(&buf); 802 } 803 804 static inline void btree_check_header(struct bch_fs *c, struct btree *b) 805 { 806 if (b->c.btree_id != BTREE_NODE_ID(b->data) || 807 b->c.level != BTREE_NODE_LEVEL(b->data) || 808 !bpos_eq(b->data->max_key, b->key.k.p) || 809 (b->key.k.type == KEY_TYPE_btree_ptr_v2 && 810 !bpos_eq(b->data->min_key, 811 bkey_i_to_btree_ptr_v2(&b->key)->v.min_key))) 812 btree_bad_header(c, b); 813 } 814 815 static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path, 816 const struct bkey_i *k, unsigned level, 817 enum six_lock_type lock_type, 818 unsigned long trace_ip) 819 { 820 struct bch_fs *c = trans->c; 821 struct btree_cache *bc = &c->btree_cache; 822 struct btree *b; 823 struct bset_tree *t; 824 bool need_relock = false; 825 int ret; 826 827 EBUG_ON(level >= BTREE_MAX_DEPTH); 828 retry: 829 b = btree_cache_find(bc, k); 830 if (unlikely(!b)) { 831 /* 832 * We must have the parent locked to call bch2_btree_node_fill(), 833 * else we could read in a btree node from disk that's been 834 * freed: 835 */ 836 b = bch2_btree_node_fill(trans, path, k, path->btree_id, 837 level, lock_type, true); 838 need_relock = true; 839 840 /* We raced and found the btree node in the cache */ 841 if (!b) 842 goto retry; 843 844 if (IS_ERR(b)) 845 return b; 846 } else { 847 if (btree_node_read_locked(path, level + 1)) 848 btree_node_unlock(trans, path, level + 1); 849 850 ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip); 851 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 852 return ERR_PTR(ret); 853 854 BUG_ON(ret); 855 856 if (unlikely(b->hash_val != btree_ptr_hash_val(k) || 857 b->c.level != level || 858 race_fault())) { 859 six_unlock_type(&b->c.lock, lock_type); 860 if (bch2_btree_node_relock(trans, path, level + 1)) 861 goto retry; 862 863 trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path); 864 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused)); 865 } 866 867 /* avoid atomic set bit if it's not needed: */ 868 if (!btree_node_accessed(b)) 869 set_btree_node_accessed(b); 870 } 871 872 if (unlikely(btree_node_read_in_flight(b))) { 873 u32 seq = six_lock_seq(&b->c.lock); 874 875 six_unlock_type(&b->c.lock, lock_type); 876 bch2_trans_unlock(trans); 877 need_relock = true; 878 879 bch2_btree_node_wait_on_read(b); 880 881 /* 882 * should_be_locked is not set on this path yet, so we need to 883 * relock it specifically: 884 */ 885 if (!six_relock_type(&b->c.lock, lock_type, seq)) 886 goto retry; 887 } 888 889 if (unlikely(need_relock)) { 890 ret = bch2_trans_relock(trans) ?: 891 bch2_btree_path_relock_intent(trans, path); 892 if (ret) { 893 six_unlock_type(&b->c.lock, lock_type); 894 return ERR_PTR(ret); 895 } 896 } 897 898 prefetch(b->aux_data); 899 900 for_each_bset(b, t) { 901 void *p = (u64 *) b->aux_data + t->aux_data_offset; 902 903 prefetch(p + L1_CACHE_BYTES * 0); 904 prefetch(p + L1_CACHE_BYTES * 1); 905 prefetch(p + L1_CACHE_BYTES * 2); 906 } 907 908 if (unlikely(btree_node_read_error(b))) { 909 six_unlock_type(&b->c.lock, lock_type); 910 return ERR_PTR(-EIO); 911 } 912 913 EBUG_ON(b->c.btree_id != path->btree_id); 914 EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); 915 btree_check_header(c, b); 916 917 return b; 918 } 919 920 /** 921 * bch2_btree_node_get - find a btree node in the cache and lock it, reading it 922 * in from disk if necessary. 923 * 924 * @trans: btree transaction object 925 * @path: btree_path being traversed 926 * @k: pointer to btree node (generally KEY_TYPE_btree_ptr_v2) 927 * @level: level of btree node being looked up (0 == leaf node) 928 * @lock_type: SIX_LOCK_read or SIX_LOCK_intent 929 * @trace_ip: ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek()) 930 * 931 * The btree node will have either a read or a write lock held, depending on 932 * the @write parameter. 933 * 934 * Returns: btree node or ERR_PTR() 935 */ 936 struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path, 937 const struct bkey_i *k, unsigned level, 938 enum six_lock_type lock_type, 939 unsigned long trace_ip) 940 { 941 struct bch_fs *c = trans->c; 942 struct btree *b; 943 struct bset_tree *t; 944 int ret; 945 946 EBUG_ON(level >= BTREE_MAX_DEPTH); 947 948 b = btree_node_mem_ptr(k); 949 950 /* 951 * Check b->hash_val _before_ calling btree_node_lock() - this might not 952 * be the node we want anymore, and trying to lock the wrong node could 953 * cause an unneccessary transaction restart: 954 */ 955 if (unlikely(!c->opts.btree_node_mem_ptr_optimization || 956 !b || 957 b->hash_val != btree_ptr_hash_val(k))) 958 return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); 959 960 if (btree_node_read_locked(path, level + 1)) 961 btree_node_unlock(trans, path, level + 1); 962 963 ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip); 964 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 965 return ERR_PTR(ret); 966 967 BUG_ON(ret); 968 969 if (unlikely(b->hash_val != btree_ptr_hash_val(k) || 970 b->c.level != level || 971 race_fault())) { 972 six_unlock_type(&b->c.lock, lock_type); 973 if (bch2_btree_node_relock(trans, path, level + 1)) 974 return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); 975 976 trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path); 977 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused)); 978 } 979 980 if (unlikely(btree_node_read_in_flight(b))) { 981 six_unlock_type(&b->c.lock, lock_type); 982 return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); 983 } 984 985 prefetch(b->aux_data); 986 987 for_each_bset(b, t) { 988 void *p = (u64 *) b->aux_data + t->aux_data_offset; 989 990 prefetch(p + L1_CACHE_BYTES * 0); 991 prefetch(p + L1_CACHE_BYTES * 1); 992 prefetch(p + L1_CACHE_BYTES * 2); 993 } 994 995 /* avoid atomic set bit if it's not needed: */ 996 if (!btree_node_accessed(b)) 997 set_btree_node_accessed(b); 998 999 if (unlikely(btree_node_read_error(b))) { 1000 six_unlock_type(&b->c.lock, lock_type); 1001 return ERR_PTR(-EIO); 1002 } 1003 1004 EBUG_ON(b->c.btree_id != path->btree_id); 1005 EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); 1006 btree_check_header(c, b); 1007 1008 return b; 1009 } 1010 1011 struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans, 1012 const struct bkey_i *k, 1013 enum btree_id btree_id, 1014 unsigned level, 1015 bool nofill) 1016 { 1017 struct bch_fs *c = trans->c; 1018 struct btree_cache *bc = &c->btree_cache; 1019 struct btree *b; 1020 struct bset_tree *t; 1021 int ret; 1022 1023 EBUG_ON(level >= BTREE_MAX_DEPTH); 1024 1025 if (c->opts.btree_node_mem_ptr_optimization) { 1026 b = btree_node_mem_ptr(k); 1027 if (b) 1028 goto lock_node; 1029 } 1030 retry: 1031 b = btree_cache_find(bc, k); 1032 if (unlikely(!b)) { 1033 if (nofill) 1034 goto out; 1035 1036 b = bch2_btree_node_fill(trans, NULL, k, btree_id, 1037 level, SIX_LOCK_read, true); 1038 1039 /* We raced and found the btree node in the cache */ 1040 if (!b) 1041 goto retry; 1042 1043 if (IS_ERR(b) && 1044 !bch2_btree_cache_cannibalize_lock(c, NULL)) 1045 goto retry; 1046 1047 if (IS_ERR(b)) 1048 goto out; 1049 } else { 1050 lock_node: 1051 ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_); 1052 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 1053 return ERR_PTR(ret); 1054 1055 BUG_ON(ret); 1056 1057 if (unlikely(b->hash_val != btree_ptr_hash_val(k) || 1058 b->c.btree_id != btree_id || 1059 b->c.level != level)) { 1060 six_unlock_read(&b->c.lock); 1061 goto retry; 1062 } 1063 } 1064 1065 /* XXX: waiting on IO with btree locks held: */ 1066 __bch2_btree_node_wait_on_read(b); 1067 1068 prefetch(b->aux_data); 1069 1070 for_each_bset(b, t) { 1071 void *p = (u64 *) b->aux_data + t->aux_data_offset; 1072 1073 prefetch(p + L1_CACHE_BYTES * 0); 1074 prefetch(p + L1_CACHE_BYTES * 1); 1075 prefetch(p + L1_CACHE_BYTES * 2); 1076 } 1077 1078 /* avoid atomic set bit if it's not needed: */ 1079 if (!btree_node_accessed(b)) 1080 set_btree_node_accessed(b); 1081 1082 if (unlikely(btree_node_read_error(b))) { 1083 six_unlock_read(&b->c.lock); 1084 b = ERR_PTR(-EIO); 1085 goto out; 1086 } 1087 1088 EBUG_ON(b->c.btree_id != btree_id); 1089 EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); 1090 btree_check_header(c, b); 1091 out: 1092 bch2_btree_cache_cannibalize_unlock(c); 1093 return b; 1094 } 1095 1096 int bch2_btree_node_prefetch(struct btree_trans *trans, 1097 struct btree_path *path, 1098 const struct bkey_i *k, 1099 enum btree_id btree_id, unsigned level) 1100 { 1101 struct bch_fs *c = trans->c; 1102 struct btree_cache *bc = &c->btree_cache; 1103 struct btree *b; 1104 1105 BUG_ON(trans && !btree_node_locked(path, level + 1)); 1106 BUG_ON(level >= BTREE_MAX_DEPTH); 1107 1108 b = btree_cache_find(bc, k); 1109 if (b) 1110 return 0; 1111 1112 b = bch2_btree_node_fill(trans, path, k, btree_id, 1113 level, SIX_LOCK_read, false); 1114 return PTR_ERR_OR_ZERO(b); 1115 } 1116 1117 void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k) 1118 { 1119 struct bch_fs *c = trans->c; 1120 struct btree_cache *bc = &c->btree_cache; 1121 struct btree *b; 1122 1123 b = btree_cache_find(bc, k); 1124 if (!b) 1125 return; 1126 wait_on_io: 1127 /* not allowed to wait on io with btree locks held: */ 1128 1129 /* XXX we're called from btree_gc which will be holding other btree 1130 * nodes locked 1131 */ 1132 __bch2_btree_node_wait_on_read(b); 1133 __bch2_btree_node_wait_on_write(b); 1134 1135 btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); 1136 btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); 1137 1138 if (btree_node_dirty(b)) { 1139 __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); 1140 six_unlock_write(&b->c.lock); 1141 six_unlock_intent(&b->c.lock); 1142 goto wait_on_io; 1143 } 1144 1145 BUG_ON(btree_node_dirty(b)); 1146 1147 mutex_lock(&bc->lock); 1148 btree_node_data_free(c, b); 1149 bch2_btree_node_hash_remove(bc, b); 1150 mutex_unlock(&bc->lock); 1151 1152 six_unlock_write(&b->c.lock); 1153 six_unlock_intent(&b->c.lock); 1154 } 1155 1156 const char *bch2_btree_id_str(enum btree_id btree) 1157 { 1158 return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)"; 1159 } 1160 1161 void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) 1162 { 1163 prt_printf(out, "%s level %u/%u\n ", 1164 bch2_btree_id_str(b->c.btree_id), 1165 b->c.level, 1166 bch2_btree_id_root(c, b->c.btree_id)->level); 1167 bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); 1168 } 1169 1170 void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) 1171 { 1172 struct bset_stats stats; 1173 1174 memset(&stats, 0, sizeof(stats)); 1175 1176 bch2_btree_keys_stats(b, &stats); 1177 1178 prt_printf(out, "l %u ", b->c.level); 1179 bch2_bpos_to_text(out, b->data->min_key); 1180 prt_printf(out, " - "); 1181 bch2_bpos_to_text(out, b->data->max_key); 1182 prt_printf(out, ":\n" 1183 " ptrs: "); 1184 bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key)); 1185 prt_newline(out); 1186 1187 prt_printf(out, 1188 " format: "); 1189 bch2_bkey_format_to_text(out, &b->format); 1190 1191 prt_printf(out, 1192 " unpack fn len: %u\n" 1193 " bytes used %zu/%zu (%zu%% full)\n" 1194 " sib u64s: %u, %u (merge threshold %u)\n" 1195 " nr packed keys %u\n" 1196 " nr unpacked keys %u\n" 1197 " floats %zu\n" 1198 " failed unpacked %zu\n", 1199 b->unpack_fn_len, 1200 b->nr.live_u64s * sizeof(u64), 1201 btree_bytes(c) - sizeof(struct btree_node), 1202 b->nr.live_u64s * 100 / btree_max_u64s(c), 1203 b->sib_u64s[0], 1204 b->sib_u64s[1], 1205 c->btree_foreground_merge_threshold, 1206 b->nr.packed_keys, 1207 b->nr.unpacked_keys, 1208 stats.floats, 1209 stats.failed); 1210 } 1211 1212 void bch2_btree_cache_to_text(struct printbuf *out, const struct bch_fs *c) 1213 { 1214 prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used); 1215 prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty)); 1216 prt_printf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock); 1217 } 1218