// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bbpos.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
#include "trace.h"

#include <linux/prefetch.h>
#include <linux/sched/mm.h>
#include <linux/swap.h>

#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
do {						 \
	if (shrinker_counter)			 \
		bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_##counter]++; \
} while (0)

const char * const bch2_btree_node_flags[] = {
#define x(f)	#f,
	BTREE_FLAGS()
#undef x
	NULL
};

void bch2_recalc_btree_reserve(struct bch_fs *c)
{
	unsigned reserve = 16;

	if (!c->btree_roots_known[0].b)
		reserve += 8;

	for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->b)
			reserve += min_t(unsigned, 1, r->b->c.level) * 8;
	}

	c->btree_cache.nr_reserve = reserve;
}

static inline size_t btree_cache_can_free(struct btree_cache_list *list)
{
	struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);

	size_t can_free = list->nr;
	if (!list->idx)
		can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve);
	return can_free;
}

static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(!list_empty(&b->list));

	if (b->c.lock.readers)
		list_add(&b->list, &bc->freed_pcpu);
	else
		list_add(&b->list, &bc->freed_nonpcpu);
}

static void __bch2_btree_node_to_freelist(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(!list_empty(&b->list));
	BUG_ON(!b->data);

	bc->nr_freeable++;
	list_add(&b->list, &bc->freeable);
}

void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
{
	struct btree_cache *bc = &c->btree_cache;

	mutex_lock(&bc->lock);
	__bch2_btree_node_to_freelist(bc, b);
	mutex_unlock(&bc->lock);

	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
}

static void __btree_node_data_free(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(!list_empty(&b->list));
	BUG_ON(btree_node_hashed(b));

	/*
	 * This should really be done in slub/vmalloc, but we're using the
	 * kmalloc_large() path, so we're working around a slub bug by doing
	 * this here:
	 */
	if (b->data)
		mm_account_reclaimed_pages(btree_buf_bytes(b) / PAGE_SIZE);
	if (b->aux_data)
		mm_account_reclaimed_pages(btree_aux_data_bytes(b) / PAGE_SIZE);

	EBUG_ON(btree_node_write_in_flight(b));

	clear_btree_node_just_written(b);

	kvfree(b->data);
	b->data = NULL;
#ifdef __KERNEL__
	kvfree(b->aux_data);
#else
	munmap(b->aux_data, btree_aux_data_bytes(b));
#endif
	b->aux_data = NULL;

	btree_node_to_freedlist(bc, b);
}

static void btree_node_data_free(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(list_empty(&b->list));
	list_del_init(&b->list);
	--bc->nr_freeable;
	__btree_node_data_free(bc, b);
}

static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
				   const void *obj)
{
	const struct btree *b = obj;
	const u64 *v = arg->key;

	return b->hash_val == *v ? 0 : 1;
}

static const struct rhashtable_params bch_btree_cache_params = {
	.head_offset		= offsetof(struct btree, hash),
	.key_offset		= offsetof(struct btree, hash_val),
	.key_len		= sizeof(u64),
	.obj_cmpfn		= bch2_btree_cache_cmp_fn,
	.automatic_shrinking	= true,
};

static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
	BUG_ON(b->data || b->aux_data);

	gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;

	b->data = kvmalloc(btree_buf_bytes(b), gfp);
	if (!b->data)
		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
#ifdef __KERNEL__
	b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
#else
	b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
			   PROT_READ|PROT_WRITE|PROT_EXEC,
			   MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
	if (b->aux_data == MAP_FAILED)
		b->aux_data = NULL;
#endif
	if (!b->aux_data) {
		kvfree(b->data);
		b->data = NULL;
		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
	}

	return 0;
}

static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
{
	struct btree *b;

	b = kzalloc(sizeof(struct btree), gfp);
	if (!b)
		return NULL;

	bkey_btree_ptr_init(&b->key);
	INIT_LIST_HEAD(&b->list);
	INIT_LIST_HEAD(&b->write_blocked);
	b->byte_order = ilog2(c->opts.btree_node_size);
	return b;
}

struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	b = __btree_node_mem_alloc(c, GFP_KERNEL);
	if (!b)
		return NULL;

	if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
		kfree(b);
		return NULL;
	}

	bch2_btree_lock_init(&b->c, 0);

	__bch2_btree_node_to_freelist(bc, b);
	return b;
}

static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b)
{
	struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);

	u64 mask = bc->pinned_nodes_mask[!!b->c.level];

	return ((mask & BIT_ULL(b->c.btree_id)) &&
		bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
		bbpos_cmp(bc->pinned_nodes_end, pos) >= 0);
}

void bch2_node_pin(struct bch_fs *c, struct btree *b)
{
	struct btree_cache *bc = &c->btree_cache;

	mutex_lock(&bc->lock);
	BUG_ON(!__btree_node_pinned(bc, b));
	if (b != btree_node_root(c, b) && !btree_node_pinned(b)) {
		set_btree_node_pinned(b);
		list_move(&b->list, &bc->live[1].list);
		bc->live[0].nr--;
		bc->live[1].nr++;
	}
	mutex_unlock(&bc->lock);
}

void bch2_btree_cache_unpin(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b, *n;

	mutex_lock(&bc->lock);
	c->btree_cache.pinned_nodes_mask[0] = 0;
	c->btree_cache.pinned_nodes_mask[1] = 0;

	list_for_each_entry_safe(b, n, &bc->live[1].list, list) {
		clear_btree_node_pinned(b);
		list_move(&b->list, &bc->live[0].list);
		bc->live[0].nr++;
		bc->live[1].nr--;
	}

	mutex_unlock(&bc->lock);
}

/* Btree in memory cache - hash table */

void __bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
	lockdep_assert_held(&bc->lock);

	int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
	BUG_ON(ret);

	/* Cause future lookups for this node to fail: */
	b->hash_val = 0;

	if (b->c.btree_id < BTREE_ID_NR)
		--bc->nr_by_btree[b->c.btree_id];
	--bc->live[btree_node_pinned(b)].nr;
	list_del_init(&b->list);
}

void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
	__bch2_btree_node_hash_remove(bc, b);
	__bch2_btree_node_to_freelist(bc, b);
}

int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(!list_empty(&b->list));
	BUG_ON(b->hash_val);

	b->hash_val = btree_ptr_hash_val(&b->key);
	int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash,
						bch_btree_cache_params);
	if (ret)
		return ret;

	if (b->c.btree_id < BTREE_ID_NR)
		bc->nr_by_btree[b->c.btree_id]++;

	bool p = __btree_node_pinned(bc, b);
	mod_bit(BTREE_NODE_pinned, &b->flags, p);

	list_add_tail(&b->list, &bc->live[p].list);
	bc->live[p].nr++;
	return 0;
}

int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
				unsigned level, enum btree_id id)
{
	b->c.level	= level;
	b->c.btree_id	= id;

	mutex_lock(&bc->lock);
	int ret = __bch2_btree_node_hash_insert(bc, b);
	mutex_unlock(&bc->lock);

	return ret;
}

void bch2_btree_node_update_key_early(struct btree_trans *trans,
				      enum btree_id btree, unsigned level,
				      struct bkey_s_c old, struct bkey_i *new)
{
	struct bch_fs *c = trans->c;
	struct btree *b;
	struct bkey_buf tmp;
	int ret;

	bch2_bkey_buf_init(&tmp);
	bch2_bkey_buf_reassemble(&tmp, c, old);

	b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
	if (!IS_ERR_OR_NULL(b)) {
		mutex_lock(&c->btree_cache.lock);

		bch2_btree_node_hash_remove(&c->btree_cache, b);

		bkey_copy(&b->key, new);
		ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
		BUG_ON(ret);

		mutex_unlock(&c->btree_cache.lock);
		six_unlock_read(&b->c.lock);
	}

	bch2_bkey_buf_exit(&tmp, c);
}

__flatten
static inline struct btree *btree_cache_find(struct btree_cache *bc,
					     const struct bkey_i *k)
{
	u64 v = btree_ptr_hash_val(k);

	return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
}

/*
 * this version is for btree nodes that have already been freed (we're not
 * reaping a real btree node)
 */
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
{
	struct btree_cache *bc = &c->btree_cache;
	int ret = 0;

	lockdep_assert_held(&bc->lock);
wait_on_io:
	if (b->flags & ((1U << BTREE_NODE_dirty)|
			(1U << BTREE_NODE_read_in_flight)|
			(1U << BTREE_NODE_write_in_flight))) {
		if (!flush) {
			if (btree_node_dirty(b))
				BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
			else if (btree_node_read_in_flight(b))
				BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
			else if (btree_node_write_in_flight(b))
				BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
			return -BCH_ERR_ENOMEM_btree_node_reclaim;
		}

		/* XXX: waiting on IO with btree cache lock held */
		bch2_btree_node_wait_on_read(b);
		bch2_btree_node_wait_on_write(b);
	}

	if (!six_trylock_intent(&b->c.lock)) {
		BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
		return -BCH_ERR_ENOMEM_btree_node_reclaim;
	}

	if (!six_trylock_write(&b->c.lock)) {
		BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
		goto out_unlock_intent;
	}

	/* recheck under lock */
	if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
			(1U << BTREE_NODE_write_in_flight))) {
		if (!flush) {
			if (btree_node_read_in_flight(b))
				BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
			else if (btree_node_write_in_flight(b))
				BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
			goto out_unlock;
		}
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}

	if (btree_node_noevict(b)) {
		BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
		goto out_unlock;
	}
	if (btree_node_write_blocked(b)) {
		BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
		goto out_unlock;
	}
	if (btree_node_will_make_reachable(b)) {
		BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
		goto out_unlock;
	}

	if (btree_node_dirty(b)) {
		if (!flush) {
			BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
			goto out_unlock;
		}
		/*
		 * Using the underscore version because we don't want to compact
		 * bsets after the write, since this node is about to be evicted
		 * - unless btree verify mode is enabled, since it runs out of
		 * the post write cleanup:
		 */
		if (bch2_verify_btree_ondisk)
			bch2_btree_node_write(c, b, SIX_LOCK_intent,
					      BTREE_WRITE_cache_reclaim);
		else
			__bch2_btree_node_write(c, b,
						BTREE_WRITE_cache_reclaim);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}
out:
	if (b->hash_val && !ret)
		trace_and_count(c, btree_cache_reap, c, b);
	return ret;
out_unlock:
	six_unlock_write(&b->c.lock);
out_unlock_intent:
	six_unlock_intent(&b->c.lock);
	ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
	goto out;
}

static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
{
	return __btree_node_reclaim(c, b, false, shrinker_counter);
}

static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, true, false);
}

static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
					   struct shrink_control *sc)
{
	struct btree_cache_list *list = shrink->private_data;
	struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
	struct btree *b, *t;
	unsigned long nr = sc->nr_to_scan;
	unsigned long can_free = 0;
	unsigned long freed = 0;
	unsigned long touched = 0;
	unsigned i, flags;
	unsigned long ret = SHRINK_STOP;
	bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4;

	if (bch2_btree_shrinker_disabled)
		return SHRINK_STOP;

	mutex_lock(&bc->lock);
	flags = memalloc_nofs_save();

	/*
	 * It's _really_ critical that we don't free too many btree nodes - we
	 * have to always leave ourselves a reserve. The reserve is how we
	 * guarantee that allocating memory for a new btree node can always
	 * succeed, so that inserting keys into the btree can always succeed and
	 * IO can always make forward progress:
	 */
	can_free = btree_cache_can_free(list);
	nr = min_t(unsigned long, nr, can_free);

	i = 0;
	list_for_each_entry_safe(b, t, &bc->freeable, list) {
		/*
		 * Leave a few nodes on the freeable list, so that a btree split
		 * won't have to hit the system allocator:
		 */
		if (++i <= 3)
			continue;

		touched++;

		if (touched >= nr)
			goto out;

		if (!btree_node_reclaim(c, b, true)) {
			btree_node_data_free(bc, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);
			freed++;
			bc->nr_freed++;
		}
	}
restart:
	list_for_each_entry_safe(b, t, &list->list, list) {
		touched++;

		if (btree_node_accessed(b)) {
			clear_btree_node_accessed(b);
			bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++;
			--touched;
		} else if (!btree_node_reclaim(c, b, true)) {
			__bch2_btree_node_hash_remove(bc, b);
			__btree_node_data_free(bc, b);

			freed++;
			bc->nr_freed++;

			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);

			if (freed == nr)
				goto out_rotate;
		} else if (trigger_writes &&
			   btree_node_dirty(b) &&
			   !btree_node_will_make_reachable(b) &&
			   !btree_node_write_blocked(b) &&
			   six_trylock_read(&b->c.lock)) {
			list_move(&list->list, &b->list);
			mutex_unlock(&bc->lock);
			__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
			six_unlock_read(&b->c.lock);
			if (touched >= nr)
				goto out_nounlock;
			mutex_lock(&bc->lock);
			goto restart;
		}

		if (touched >= nr)
			break;
	}
out_rotate:
	if (&t->list != &list->list)
		list_move_tail(&list->list, &t->list);
out:
	mutex_unlock(&bc->lock);
out_nounlock:
	ret = freed;
	memalloc_nofs_restore(flags);
	trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
	return ret;
}

static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
					    struct shrink_control *sc)
{
	struct btree_cache_list *list = shrink->private_data;

	if (bch2_btree_shrinker_disabled)
		return 0;

	return btree_cache_can_free(list);
}

void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b, *t;
	unsigned long flags;

	shrinker_free(bc->live[1].shrink);
	shrinker_free(bc->live[0].shrink);

	/* vfree() can allocate memory: */
	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

	if (c->verify_data)
		list_move(&c->verify_data->list, &bc->live[0].list);

	kvfree(c->verify_ondisk);

	for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->b)
			list_add(&r->b->list, &bc->live[0].list);
	}

	list_for_each_entry_safe(b, t, &bc->live[1].list, list)
		bch2_btree_node_hash_remove(bc, b);
	list_for_each_entry_safe(b, t, &bc->live[0].list, list)
		bch2_btree_node_hash_remove(bc, b);

	list_for_each_entry_safe(b, t, &bc->freeable, list) {
		BUG_ON(btree_node_read_in_flight(b) ||
		       btree_node_write_in_flight(b));

		btree_node_data_free(bc, b);
	}

	BUG_ON(!bch2_journal_error(&c->journal) &&
	       atomic_long_read(&c->btree_cache.nr_dirty));

	list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);

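	/*
	 * The pcpu freed list was just spliced onto freed_nonpcpu, so this
	 * single loop tears down every remaining struct btree allocation:
	 */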
	list_for_each_entry_safe(b, t, &bc->freed_nonpcpu, list) {
		list_del(&b->list);
		six_lock_exit(&b->c.lock);
		kfree(b);
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);

	for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
		BUG_ON(bc->nr_by_btree[i]);
	BUG_ON(bc->live[0].nr);
	BUG_ON(bc->live[1].nr);
	BUG_ON(bc->nr_freeable);

	if (bc->table_init_done)
		rhashtable_destroy(&bc->table);
}

int bch2_fs_btree_cache_init(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct shrinker *shrink;
	unsigned i;
	int ret = 0;

	ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
	if (ret)
		goto err;

	bc->table_init_done = true;

	bch2_recalc_btree_reserve(c);

	for (i = 0; i < bc->nr_reserve; i++)
		if (!__bch2_btree_node_mem_alloc(c))
			goto err;

	list_splice_init(&bc->live[0].list, &bc->freeable);

	mutex_init(&c->verify_lock);

	shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
	if (!shrink)
		goto err;
	bc->live[0].shrink	= shrink;
	shrink->count_objects	= bch2_btree_cache_count;
	shrink->scan_objects	= bch2_btree_cache_scan;
	shrink->seeks		= 2;
	shrink->private_data	= &bc->live[0];
	shrinker_register(shrink);

	shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name);
	if (!shrink)
		goto err;
	bc->live[1].shrink	= shrink;
	shrink->count_objects	= bch2_btree_cache_count;
	shrink->scan_objects	= bch2_btree_cache_scan;
	shrink->seeks		= 8;
	shrink->private_data	= &bc->live[1];
	shrinker_register(shrink);

	return 0;
err:
	return -BCH_ERR_ENOMEM_fs_btree_cache_init;
}

void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
	mutex_init(&bc->lock);
	for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) {
		bc->live[i].idx = i;
		INIT_LIST_HEAD(&bc->live[i].list);
	}
	INIT_LIST_HEAD(&bc->freeable);
	INIT_LIST_HEAD(&bc->freed_pcpu);
	INIT_LIST_HEAD(&bc->freed_nonpcpu);
}

/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which a
 * cannibalize_bucket() will take. This means every time we unlock the root of
 * the btree, we need to release this lock if we have it held.
 */
void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;

	if (bc->alloc_lock == current) {
		trace_and_count(c, btree_cache_cannibalize_unlock, trans);
		bc->alloc_lock = NULL;
		closure_wake_up(&bc->alloc_wait);
	}
}

int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct task_struct *old;

	old = NULL;
	if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current)
		goto success;

	if (!cl) {
		trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
		return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
	}

	closure_wait(&bc->alloc_wait, cl);

	/* Try again, after adding ourselves to waitlist */
	old = NULL;
	if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) {
		/* We raced */
		closure_wake_up(&bc->alloc_wait);
		goto success;
	}

	trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
	return -BCH_ERR_btree_cache_cannibalize_lock_blocked;

success:
	trace_and_count(c, btree_cache_cannibalize_lock, trans);
	return 0;
}

static struct btree *btree_node_cannibalize(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
		list_for_each_entry_reverse(b, &bc->live[i].list, list)
			if (!btree_node_reclaim(c, b, false))
				return b;

	while (1) {
		for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
			list_for_each_entry_reverse(b, &bc->live[i].list, list)
				if (!btree_node_write_and_reclaim(c, b))
					return b;

		/*
		 * Rare case: all nodes were intent-locked.
		 * Just busy-wait.
		 */
		WARN_ONCE(1, "btree cache cannibalize failed\n");
		cond_resched();
	}
}

struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct list_head *freed = pcpu_read_locks
		? &bc->freed_pcpu
		: &bc->freed_nonpcpu;
	struct btree *b, *b2;
	u64 start_time = local_clock();

	mutex_lock(&bc->lock);

	/*
	 * We never free struct btree itself, just the memory that holds the on
	 * disk node. Check the freed list before allocating a new one:
	 */
	list_for_each_entry(b, freed, list)
		if (!btree_node_reclaim(c, b, false)) {
			list_del_init(&b->list);
			goto got_node;
		}

	b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
	if (!b) {
		mutex_unlock(&bc->lock);
		bch2_trans_unlock(trans);
		b = __btree_node_mem_alloc(c, GFP_KERNEL);
		if (!b)
			goto err;
		mutex_lock(&bc->lock);
	}

	bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);

	BUG_ON(!six_trylock_intent(&b->c.lock));
	BUG_ON(!six_trylock_write(&b->c.lock));

got_node:
	/*
	 * btree_free() doesn't free memory; it sticks the node on the end of
	 * the list. Check if there's any freed nodes there:
	 */
	list_for_each_entry(b2, &bc->freeable, list)
		if (!btree_node_reclaim(c, b2, false)) {
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);

			list_del_init(&b2->list);
			--bc->nr_freeable;
			btree_node_to_freedlist(bc, b2);
			mutex_unlock(&bc->lock);

			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
			goto got_mem;
		}

	mutex_unlock(&bc->lock);

	if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
		bch2_trans_unlock(trans);
		if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
			goto err;
	}

got_mem:
	BUG_ON(!list_empty(&b->list));
	BUG_ON(btree_node_hashed(b));
	BUG_ON(btree_node_dirty(b));
	BUG_ON(btree_node_write_in_flight(b));
out:
	b->flags		= 0;
	b->written		= 0;
	b->nsets		= 0;
	b->sib_u64s[0]		= 0;
	b->sib_u64s[1]		= 0;
	b->whiteout_u64s	= 0;
	bch2_btree_keys_init(b);
	set_btree_node_accessed(b);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
			       start_time);

	int ret = bch2_trans_relock(trans);
	if (unlikely(ret)) {
		bch2_btree_node_to_freelist(c, b);
		return ERR_PTR(ret);
	}

	return b;
err:
	mutex_lock(&bc->lock);

	/* Try to cannibalize another cached btree node: */
	if (bc->alloc_lock == current) {
		b2 = btree_node_cannibalize(c);
		clear_btree_node_just_written(b2);
		__bch2_btree_node_hash_remove(bc, b2);

		if (b) {
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);
			btree_node_to_freedlist(bc, b2);
			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
		} else {
			b = b2;
		}

		BUG_ON(!list_empty(&b->list));
		mutex_unlock(&bc->lock);

		trace_and_count(c, btree_cache_cannibalize, trans);
		goto out;
	}

	mutex_unlock(&bc->lock);
	return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
}

/* Slowpath, don't want it inlined into btree_iter_traverse() */
static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
						   struct btree_path *path,
						   const struct bkey_i *k,
						   enum btree_id btree_id,
						   unsigned level,
						   enum six_lock_type lock_type,
						   bool sync)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	if (unlikely(level >= BTREE_MAX_DEPTH)) {
		int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u",
						 level, BTREE_MAX_DEPTH);
		return ERR_PTR(ret);
	}

	if (unlikely(!bkey_is_btree_ptr(&k->k))) {
		struct printbuf buf = PRINTBUF;
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));

		int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf);
		printbuf_exit(&buf);
		return ERR_PTR(ret);
	}

	if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) {
		struct printbuf buf = PRINTBUF;
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));

		int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf);
		printbuf_exit(&buf);
		return ERR_PTR(ret);
	}

	/*
	 * Parent node must be locked, else we could read in a btree node that's
	 * been freed:
	 */
	if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
		trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
	}

	b = bch2_btree_node_mem_alloc(trans, level != 0);

	if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
		if (!path)
			return b;

		trans->memory_allocation_failure = true;
		trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
	}

	if (IS_ERR(b))
		return b;

	bkey_copy(&b->key, k);
	if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
		/* raced with another fill: */

		/* mark as unhashed... */
		b->hash_val = 0;

		mutex_lock(&bc->lock);
		__bch2_btree_node_to_freelist(bc, b);
		mutex_unlock(&bc->lock);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		return NULL;
	}

	set_btree_node_read_in_flight(b);
	six_unlock_write(&b->c.lock);

	if (path) {
		u32 seq = six_lock_seq(&b->c.lock);

		/* Unlock before doing IO: */
		six_unlock_intent(&b->c.lock);
		bch2_trans_unlock_noassert(trans);

		bch2_btree_node_read(trans, b, sync);

		int ret = bch2_trans_relock(trans);
		if (ret)
			return ERR_PTR(ret);

		if (!sync)
			return NULL;

		if (!six_relock_type(&b->c.lock, lock_type, seq))
			b = NULL;
	} else {
		bch2_btree_node_read(trans, b, sync);
		if (lock_type == SIX_LOCK_read)
			six_lock_downgrade(&b->c.lock);
	}

	return b;
}

static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
{
	struct printbuf buf = PRINTBUF;

	if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
		return;

	prt_printf(&buf,
		   "btree node header doesn't match ptr\n"
		   "btree %s level %u\n"
		   "ptr: ",
		   bch2_btree_id_str(b->c.btree_id), b->c.level);
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	prt_printf(&buf, "\nheader: btree %s level %llu\n"
		   "min ",
		   bch2_btree_id_str(BTREE_NODE_ID(b->data)),
		   BTREE_NODE_LEVEL(b->data));
	bch2_bpos_to_text(&buf, b->data->min_key);

	prt_printf(&buf, "\nmax ");
	bch2_bpos_to_text(&buf, b->data->max_key);

	bch2_fs_topology_error(c, "%s", buf.buf);

	printbuf_exit(&buf);
}

static inline void btree_check_header(struct bch_fs *c, struct btree *b)
{
	if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
	    b->c.level != BTREE_NODE_LEVEL(b->data) ||
	    !bpos_eq(b->data->max_key, b->key.k.p) ||
	    (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
	     !bpos_eq(b->data->min_key,
		      bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
		btree_bad_header(c, b);
}

static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
					   const struct bkey_i *k, unsigned level,
					   enum six_lock_type lock_type,
					   unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	bool need_relock = false;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		/*
		 * We must have the parent locked to call bch2_btree_node_fill(),
		 * else we could read in a btree node from disk that's been
		 * freed:
		 */
		b = bch2_btree_node_fill(trans, path, k, path->btree_id,
					 level, lock_type, true);
		need_relock = true;

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b))
			return b;
	} else {
		if (btree_node_read_locked(path, level + 1))
			btree_node_unlock(trans, path, level + 1);

		ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			return ERR_PTR(ret);

		BUG_ON(ret);

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.level != level ||
			     race_fault())) {
			six_unlock_type(&b->c.lock, lock_type);
			if (bch2_btree_node_relock(trans, path, level + 1))
				goto retry;

			trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
			return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
		}

		/* avoid atomic set bit if it's not needed: */
		if (!btree_node_accessed(b))
			set_btree_node_accessed(b);
	}

	if (unlikely(btree_node_read_in_flight(b))) {
		u32 seq = six_lock_seq(&b->c.lock);

		six_unlock_type(&b->c.lock, lock_type);
		bch2_trans_unlock(trans);
		need_relock = true;

		bch2_btree_node_wait_on_read(b);

		ret = bch2_trans_relock(trans);
		if (ret)
			return ERR_PTR(ret);

		/*
		 * should_be_locked is not set on this path yet, so we need to
		 * relock it specifically:
		 */
		if (!six_relock_type(&b->c.lock, lock_type, seq))
			goto retry;
	}

	if (unlikely(need_relock)) {
		ret = bch2_trans_relock(trans) ?:
			bch2_btree_path_relock_intent(trans, path);
		if (ret) {
			six_unlock_type(&b->c.lock, lock_type);
			return ERR_PTR(ret);
		}
	}

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-BCH_ERR_btree_node_read_error);
	}

	EBUG_ON(b->c.btree_id != path->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}

/**
 * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * @trans:	btree transaction object
 * @path:	btree_path being traversed
 * @k:		pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
 * @level:	level of btree node being looked up (0 == leaf node)
 * @lock_type:	SIX_LOCK_read or SIX_LOCK_intent
 * @trace_ip:	ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
 *
 * The btree node will have either a read or an intent lock held, depending on
 * @lock_type.
 *
 * Returns: btree node or ERR_PTR()
 */
struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
				  const struct bkey_i *k, unsigned level,
				  enum six_lock_type lock_type,
				  unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree *b;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_node_mem_ptr(k);

	/*
	 * Check b->hash_val _before_ calling btree_node_lock() - this might not
	 * be the node we want anymore, and trying to lock the wrong node could
	 * cause an unnecessary transaction restart:
	 */
	if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
		     !b ||
		     b->hash_val != btree_ptr_hash_val(k)))
		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

	if (btree_node_read_locked(path, level + 1))
		btree_node_unlock(trans, path, level + 1);

	ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		return ERR_PTR(ret);

	BUG_ON(ret);

	if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
		     b->c.level != level ||
		     race_fault())) {
		six_unlock_type(&b->c.lock, lock_type);
		if (bch2_btree_node_relock(trans, path, level + 1))
			return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

		trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
	}

	if (unlikely(btree_node_read_in_flight(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
	}

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-BCH_ERR_btree_node_read_error);
	}

	EBUG_ON(b->c.btree_id != path->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}

struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
					 const struct bkey_i *k,
					 enum btree_id btree_id,
					 unsigned level,
					 bool nofill)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	if (c->opts.btree_node_mem_ptr_optimization) {
		b = btree_node_mem_ptr(k);
		if (b)
			goto lock_node;
	}
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		if (nofill)
			goto out;

		b = bch2_btree_node_fill(trans, NULL, k, btree_id,
					 level, SIX_LOCK_read, true);

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b) &&
		    !bch2_btree_cache_cannibalize_lock(trans, NULL))
			goto retry;

		if (IS_ERR(b))
			goto out;
	} else {
lock_node:
		ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			return ERR_PTR(ret);

		BUG_ON(ret);

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.btree_id != btree_id ||
			     b->c.level != level)) {
			six_unlock_read(&b->c.lock);
			goto retry;
		}
	}

	/* XXX: waiting on IO with btree locks held: */
	__bch2_btree_node_wait_on_read(b);

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_read(&b->c.lock);
		b = ERR_PTR(-BCH_ERR_btree_node_read_error);
		goto out;
	}

	EBUG_ON(b->c.btree_id != btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);
out:
	bch2_btree_cache_cannibalize_unlock(trans);
	return b;
}

int bch2_btree_node_prefetch(struct btree_trans *trans,
			     struct btree_path *path,
			     const struct bkey_i *k,
			     enum btree_id btree_id, unsigned level)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;

	BUG_ON(path && !btree_node_locked(path, level + 1));
	BUG_ON(level >= BTREE_MAX_DEPTH);

	struct btree *b = btree_cache_find(bc, k);
	if (b)
		return 0;

	b = bch2_btree_node_fill(trans, path, k, btree_id,
				 level, SIX_LOCK_read, false);
	int ret = PTR_ERR_OR_ZERO(b);
	if (ret)
		return ret;
	if (b)
		six_unlock_read(&b->c.lock);
	return 0;
}

void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	b = btree_cache_find(bc, k);
	if (!b)
		return;

	BUG_ON(b == btree_node_root(trans->c, b));
wait_on_io:
	/* not allowed to wait on io with btree locks held: */

	/* XXX we're called from btree_gc which will be holding other btree
	 * nodes locked
	 */
	__bch2_btree_node_wait_on_read(b);
	__bch2_btree_node_wait_on_write(b);

	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
	if (unlikely(b->hash_val != btree_ptr_hash_val(k)))
		goto out;

	if (btree_node_dirty(b)) {
		__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}

	BUG_ON(btree_node_dirty(b));

	mutex_lock(&bc->lock);
	bch2_btree_node_hash_remove(bc, b);
	btree_node_data_free(bc, b);
	mutex_unlock(&bc->lock);
out:
	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
}

const char *bch2_btree_id_str(enum btree_id btree)
{
	return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)";
}

void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree)
{
	if (btree < BTREE_ID_NR)
		prt_str(out, __bch2_btree_ids[btree]);
	else
		prt_printf(out, "(unknown btree %u)", btree);
}

void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
	prt_printf(out, "%s level %u/%u\n  ",
		   bch2_btree_id_str(b->c.btree_id),
		   b->c.level,
		   bch2_btree_id_root(c, b->c.btree_id)->level);
	bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
}

void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
	struct bset_stats stats;

	memset(&stats, 0, sizeof(stats));

	bch2_btree_keys_stats(b, &stats);

	prt_printf(out, "l %u ", b->c.level);
	bch2_bpos_to_text(out, b->data->min_key);
	prt_printf(out, " - ");
	bch2_bpos_to_text(out, b->data->max_key);
	prt_printf(out, ":\n"
		   "    ptrs: ");
	bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
	prt_newline(out);

	prt_printf(out,
		   "    format: ");
	bch2_bkey_format_to_text(out, &b->format);

	prt_printf(out,
		   "    unpack fn len: %u\n"
		   "    bytes used %zu/%zu (%zu%% full)\n"
		   "    sib u64s: %u, %u (merge threshold %u)\n"
		   "    nr packed keys %u\n"
		   "    nr unpacked keys %u\n"
		   "    floats %zu\n"
		   "    failed unpacked %zu\n",
		   b->unpack_fn_len,
		   b->nr.live_u64s * sizeof(u64),
		   btree_buf_bytes(b) - sizeof(struct btree_node),
		   b->nr.live_u64s * 100 / btree_max_u64s(c),
		   b->sib_u64s[0],
		   b->sib_u64s[1],
		   c->btree_foreground_merge_threshold,
		   b->nr.packed_keys,
		   b->nr.unpacked_keys,
		   stats.floats,
		   stats.failed);
}

static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c,
				 const char *label, size_t nr)
{
	prt_printf(out, "%s\t", label);
	prt_human_readable_u64(out, nr * c->opts.btree_node_size);
	prt_printf(out, " (%zu)\n", nr);
}

static const char * const bch2_btree_cache_not_freed_reasons_strs[] = {
#define x(n) #n,
	BCH_BTREE_CACHE_NOT_FREED_REASONS()
#undef x
	NULL
};

void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);

	if (!out->nr_tabstops)
		printbuf_tabstop_push(out, 32);

	prt_btree_cache_line(out, c, "live:",		bc->live[0].nr);
	prt_btree_cache_line(out, c, "pinned:",		bc->live[1].nr);
	prt_btree_cache_line(out, c, "freeable:",	bc->nr_freeable);
	prt_btree_cache_line(out, c, "dirty:",		atomic_long_read(&bc->nr_dirty));
	prt_printf(out, "cannibalize lock:\t%p\n",	bc->alloc_lock);
	prt_newline(out);

	for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
		prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->nr_by_btree[i]);

	prt_newline(out);
	prt_printf(out, "freed:\t%zu\n", bc->nr_freed);
	prt_printf(out, "not freed:\n");

	for (unsigned i = 0; i < ARRAY_SIZE(bc->not_freed); i++)
		prt_printf(out, "  %s\t%llu\n",
			   bch2_btree_cache_not_freed_reasons_strs[i], bc->not_freed[i]);
}