// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bbpos.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
#include "trace.h"

#include <linux/prefetch.h>
#include <linux/sched/mm.h>
#include <linux/swap.h>

#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
do {						 \
	if (shrinker_counter)			 \
		bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_##counter]++;	\
} while (0)

const char * const bch2_btree_node_flags[] = {
#define x(f)	#f,
	BTREE_FLAGS()
#undef x
	NULL
};

void bch2_recalc_btree_reserve(struct bch_fs *c)
{
	unsigned reserve = 16;

	if (!c->btree_roots_known[0].b)
		reserve += 8;

	for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->b)
			reserve += min_t(unsigned, 1, r->b->c.level) * 8;
	}

	c->btree_cache.nr_reserve = reserve;
}

static inline size_t btree_cache_can_free(struct btree_cache_list *list)
{
	struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);

	size_t can_free = list->nr;
	if (!list->idx)
		can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve);
	return can_free;
}

static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
{
	if (b->c.lock.readers)
		list_move(&b->list, &bc->freed_pcpu);
	else
		list_move(&b->list, &bc->freed_nonpcpu);
}

static void btree_node_data_free(struct bch_fs *c, struct btree *b)
{
	struct btree_cache *bc = &c->btree_cache;

	BUG_ON(btree_node_hashed(b));

	/*
	 * This should really be done in slub/vmalloc, but we're using the
	 * kmalloc_large() path, so we're working around a slub bug by doing
	 * this here:
	 */
	if (b->data)
		mm_account_reclaimed_pages(btree_buf_bytes(b) / PAGE_SIZE);
	if (b->aux_data)
		mm_account_reclaimed_pages(btree_aux_data_bytes(b) / PAGE_SIZE);

	EBUG_ON(btree_node_write_in_flight(b));

	clear_btree_node_just_written(b);

	kvfree(b->data);
	b->data = NULL;
#ifdef __KERNEL__
	kvfree(b->aux_data);
#else
	munmap(b->aux_data, btree_aux_data_bytes(b));
#endif
	b->aux_data = NULL;

	bc->nr_freeable--;

	btree_node_to_freedlist(bc, b);
}

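/*
 * Btree nodes are indexed in an rhashtable keyed by the 64-bit hash of the
 * node's pointer key; the compare function only has to check for hash
 * collisions within a bucket:
 */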
static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
				   const void *obj)
{
	const struct btree *b = obj;
	const u64 *v = arg->key;

	return b->hash_val == *v ? 0 : 1;
}

static const struct rhashtable_params bch_btree_cache_params = {
	.head_offset		= offsetof(struct btree, hash),
	.key_offset		= offsetof(struct btree, hash_val),
	.key_len		= sizeof(u64),
	.obj_cmpfn		= bch2_btree_cache_cmp_fn,
	.automatic_shrinking	= true,
};

static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
	BUG_ON(b->data || b->aux_data);

	gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;

	b->data = kvmalloc(btree_buf_bytes(b), gfp);
	if (!b->data)
		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
#ifdef __KERNEL__
	b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
#else
	b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
			   PROT_READ|PROT_WRITE|PROT_EXEC,
			   MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
	if (b->aux_data == MAP_FAILED)
		b->aux_data = NULL;
#endif
	if (!b->aux_data) {
		kvfree(b->data);
		b->data = NULL;
		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
	}

	return 0;
}

static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
{
	struct btree *b;

	b = kzalloc(sizeof(struct btree), gfp);
	if (!b)
		return NULL;

	bkey_btree_ptr_init(&b->key);
	INIT_LIST_HEAD(&b->list);
	INIT_LIST_HEAD(&b->write_blocked);
	b->byte_order = ilog2(c->opts.btree_node_size);
	return b;
}

struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	b = __btree_node_mem_alloc(c, GFP_KERNEL);
	if (!b)
		return NULL;

	if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
		kfree(b);
		return NULL;
	}

	bch2_btree_lock_init(&b->c, 0);

	bc->nr_freeable++;
	list_add(&b->list, &bc->freeable);
	return b;
}

void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
{
	mutex_lock(&c->btree_cache.lock);
	list_move(&b->list, &c->btree_cache.freeable);
	mutex_unlock(&c->btree_cache.lock);

	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
}

static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b)
{
	struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);

	u64 mask = bc->pinned_nodes_mask[!!b->c.level];

	return ((mask & BIT_ULL(b->c.btree_id)) &&
		bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
		bbpos_cmp(bc->pinned_nodes_end, pos) >= 0);
}

void bch2_node_pin(struct bch_fs *c, struct btree *b)
{
	struct btree_cache *bc = &c->btree_cache;

	mutex_lock(&bc->lock);
	BUG_ON(!__btree_node_pinned(bc, b));
	if (b != btree_node_root(c, b) && !btree_node_pinned(b)) {
		set_btree_node_pinned(b);
		list_move(&b->list, &bc->live[1].list);
		bc->live[0].nr--;
		bc->live[1].nr++;
	}
	mutex_unlock(&bc->lock);
}

void bch2_btree_cache_unpin(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b, *n;

	mutex_lock(&bc->lock);
	c->btree_cache.pinned_nodes_mask[0] = 0;
	c->btree_cache.pinned_nodes_mask[1] = 0;

	list_for_each_entry_safe(b, n, &bc->live[1].list, list) {
		clear_btree_node_pinned(b);
		list_move(&b->list, &bc->live[0].list);
		bc->live[0].nr++;
		bc->live[1].nr--;
	}

	mutex_unlock(&bc->lock);
}

/* Btree in memory cache - hash table */

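/*
 * Removing a node from the hash table moves it to the freeable list and zeroes
 * hash_val, so that future lookups for it will fail:
 */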
void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
	lockdep_assert_held(&bc->lock);
	int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);

	BUG_ON(ret);

	/* Cause future lookups for this node to fail: */
	b->hash_val = 0;

	if (b->c.btree_id < BTREE_ID_NR)
		--bc->nr_by_btree[b->c.btree_id];

	bc->live[btree_node_pinned(b)].nr--;
	bc->nr_freeable++;
	list_move(&b->list, &bc->freeable);
}

int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(b->hash_val);
	b->hash_val = btree_ptr_hash_val(&b->key);

	int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash,
						bch_btree_cache_params);
	if (ret)
		return ret;

	if (b->c.btree_id < BTREE_ID_NR)
		bc->nr_by_btree[b->c.btree_id]++;

	bool p = __btree_node_pinned(bc, b);
	mod_bit(BTREE_NODE_pinned, &b->flags, p);

	list_move_tail(&b->list, &bc->live[p].list);
	bc->live[p].nr++;

	bc->nr_freeable--;
	return 0;
}

int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
				unsigned level, enum btree_id id)
{
	b->c.level	= level;
	b->c.btree_id	= id;

	mutex_lock(&bc->lock);
	int ret = __bch2_btree_node_hash_insert(bc, b);
	mutex_unlock(&bc->lock);

	return ret;
}

void bch2_btree_node_update_key_early(struct btree_trans *trans,
				      enum btree_id btree, unsigned level,
				      struct bkey_s_c old, struct bkey_i *new)
{
	struct bch_fs *c = trans->c;
	struct btree *b;
	struct bkey_buf tmp;
	int ret;

	bch2_bkey_buf_init(&tmp);
	bch2_bkey_buf_reassemble(&tmp, c, old);

	b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
	if (!IS_ERR_OR_NULL(b)) {
		mutex_lock(&c->btree_cache.lock);

		bch2_btree_node_hash_remove(&c->btree_cache, b);

		bkey_copy(&b->key, new);
		ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
		BUG_ON(ret);

		mutex_unlock(&c->btree_cache.lock);
		six_unlock_read(&b->c.lock);
	}

	bch2_bkey_buf_exit(&tmp, c);
}

__flatten
static inline struct btree *btree_cache_find(struct btree_cache *bc,
					     const struct bkey_i *k)
{
	u64 v = btree_ptr_hash_val(k);

	return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
}

/*
 * this version is for btree nodes that have already been freed (we're not
 * reaping a real btree node)
 */
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
{
	struct btree_cache *bc = &c->btree_cache;
	int ret = 0;

	lockdep_assert_held(&bc->lock);
wait_on_io:
	if (b->flags & ((1U << BTREE_NODE_dirty)|
			(1U << BTREE_NODE_read_in_flight)|
			(1U << BTREE_NODE_write_in_flight))) {
		if (!flush) {
			if (btree_node_dirty(b))
				BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
			else if (btree_node_read_in_flight(b))
				BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
			else if (btree_node_write_in_flight(b))
				BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
			return -BCH_ERR_ENOMEM_btree_node_reclaim;
		}

		/* XXX: waiting on IO with btree cache lock held */
		bch2_btree_node_wait_on_read(b);
		bch2_btree_node_wait_on_write(b);
	}

	if (!six_trylock_intent(&b->c.lock)) {
		BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
		return -BCH_ERR_ENOMEM_btree_node_reclaim;
	}

	if (!six_trylock_write(&b->c.lock)) {
		BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
		goto out_unlock_intent;
	}

	/* recheck under lock */
	if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
			(1U << BTREE_NODE_write_in_flight))) {
		if (!flush) {
			if (btree_node_read_in_flight(b))
				BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
			else if (btree_node_write_in_flight(b))
				BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
			goto out_unlock;
		}
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}

	if (btree_node_noevict(b)) {
		BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
		goto out_unlock;
	}
	if (btree_node_write_blocked(b)) {
		BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
		goto out_unlock;
	}
	if (btree_node_will_make_reachable(b)) {
		BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
		goto out_unlock;
	}

	if (btree_node_dirty(b)) {
		if (!flush) {
			BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
			goto out_unlock;
		}
		/*
		 * Using the underscore version because we don't want to compact
		 * bsets after the write, since this node is about to be evicted
		 * - unless btree verify mode is enabled, since it runs out of
		 * the post write cleanup:
		 */
		if (bch2_verify_btree_ondisk)
			bch2_btree_node_write(c, b, SIX_LOCK_intent,
					      BTREE_WRITE_cache_reclaim);
		else
			__bch2_btree_node_write(c, b,
						BTREE_WRITE_cache_reclaim);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}
out:
	if (b->hash_val && !ret)
		trace_and_count(c, btree_cache_reap, c, b);
	return ret;
out_unlock:
	six_unlock_write(&b->c.lock);
out_unlock_intent:
	six_unlock_intent(&b->c.lock);
	ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
	goto out;
}

static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
{
	return __btree_node_reclaim(c, b, false, shrinker_counter);
}

static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, true, false);
}

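/*
 * Shrinker scan callback: walk the freeable list, then this shrinker's list of
 * live nodes, freeing node memory where we can; recently accessed nodes just
 * get their accessed bit cleared, and if a large fraction of the cache is
 * dirty we also issue writes so that future scans can make progress:
 */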
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
					   struct shrink_control *sc)
{
	struct btree_cache_list *list = shrink->private_data;
	struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
	struct btree *b, *t;
	unsigned long nr = sc->nr_to_scan;
	unsigned long can_free = 0;
	unsigned long freed = 0;
	unsigned long touched = 0;
	unsigned i, flags;
	unsigned long ret = SHRINK_STOP;
	bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4;

	if (bch2_btree_shrinker_disabled)
		return SHRINK_STOP;

	mutex_lock(&bc->lock);
	flags = memalloc_nofs_save();

	/*
	 * It's _really_ critical that we don't free too many btree nodes - we
	 * have to always leave ourselves a reserve. The reserve is how we
	 * guarantee that allocating memory for a new btree node can always
	 * succeed, so that inserting keys into the btree can always succeed and
	 * IO can always make forward progress:
	 */
	can_free = btree_cache_can_free(list);
	nr = min_t(unsigned long, nr, can_free);

	i = 0;
	list_for_each_entry_safe(b, t, &bc->freeable, list) {
		/*
		 * Leave a few nodes on the freeable list, so that a btree split
		 * won't have to hit the system allocator:
		 */
		if (++i <= 3)
			continue;

		touched++;

		if (touched >= nr)
			goto out;

		if (!btree_node_reclaim(c, b, true)) {
			btree_node_data_free(c, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);
			freed++;
			bc->nr_freed++;
		}
	}
restart:
	list_for_each_entry_safe(b, t, &list->list, list) {
		touched++;

		if (btree_node_accessed(b)) {
			clear_btree_node_accessed(b);
			bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++;
			--touched;
		} else if (!btree_node_reclaim(c, b, true)) {
			bch2_btree_node_hash_remove(bc, b);

			freed++;
			btree_node_data_free(c, b);
			bc->nr_freed++;

			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);

			if (freed == nr)
				goto out_rotate;
		} else if (trigger_writes &&
			   btree_node_dirty(b) &&
			   !btree_node_will_make_reachable(b) &&
			   !btree_node_write_blocked(b) &&
			   six_trylock_read(&b->c.lock)) {
			list_move(&list->list, &b->list);
			mutex_unlock(&bc->lock);
			__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
			six_unlock_read(&b->c.lock);
			if (touched >= nr)
				goto out_nounlock;
			mutex_lock(&bc->lock);
			goto restart;
		}

		if (touched >= nr)
			break;
	}
out_rotate:
	if (&t->list != &list->list)
		list_move_tail(&list->list, &t->list);
out:
	mutex_unlock(&bc->lock);
out_nounlock:
	ret = freed;
	memalloc_nofs_restore(flags);
	trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
	return ret;
}

static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
					    struct shrink_control *sc)
{
	struct btree_cache_list *list = shrink->private_data;

	if (bch2_btree_shrinker_disabled)
		return 0;

	return btree_cache_can_free(list);
}

void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b, *t;
	unsigned long flags;

	shrinker_free(bc->live[1].shrink);
	shrinker_free(bc->live[0].shrink);

	/* vfree() can allocate memory: */
	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

	if (c->verify_data)
		list_move(&c->verify_data->list, &bc->live[0].list);

	kvfree(c->verify_ondisk);

	for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->b)
			list_add(&r->b->list, &bc->live[0].list);
	}

	list_for_each_entry_safe(b, t, &bc->live[1].list, list)
		bch2_btree_node_hash_remove(bc, b);
	list_for_each_entry_safe(b, t, &bc->live[0].list, list)
		bch2_btree_node_hash_remove(bc, b);

	list_for_each_entry_safe(b, t, &bc->freeable, list) {
		BUG_ON(btree_node_read_in_flight(b) ||
		       btree_node_write_in_flight(b));

		btree_node_data_free(c, b);
	}

	BUG_ON(!bch2_journal_error(&c->journal) &&
	       atomic_long_read(&c->btree_cache.nr_dirty));

	list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);

	list_for_each_entry_safe(b, t, &bc->freed_nonpcpu, list) {
		list_del(&b->list);
		six_lock_exit(&b->c.lock);
		kfree(b);
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);

	for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
		BUG_ON(bc->nr_by_btree[i]);
	BUG_ON(bc->live[0].nr);
	BUG_ON(bc->live[1].nr);
	BUG_ON(bc->nr_freeable);

	if (bc->table_init_done)
		rhashtable_destroy(&bc->table);
}

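/*
 * Full initialization: allocate the hash table and the reserve of btree nodes,
 * then register one shrinker per live list (ordinary and pinned nodes, the
 * latter with a higher seeks value so it's reclaimed more reluctantly):
 */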
int bch2_fs_btree_cache_init(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct shrinker *shrink;
	unsigned i;
	int ret = 0;

	ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
	if (ret)
		goto err;

	bc->table_init_done = true;

	bch2_recalc_btree_reserve(c);

	for (i = 0; i < bc->nr_reserve; i++)
		if (!__bch2_btree_node_mem_alloc(c))
			goto err;

	list_splice_init(&bc->live[0].list, &bc->freeable);

	mutex_init(&c->verify_lock);

	shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
	if (!shrink)
		goto err;
	bc->live[0].shrink	= shrink;
	shrink->count_objects	= bch2_btree_cache_count;
	shrink->scan_objects	= bch2_btree_cache_scan;
	shrink->seeks		= 2;
	shrink->private_data	= &bc->live[0];
	shrinker_register(shrink);

	shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name);
	if (!shrink)
		goto err;
	bc->live[1].shrink	= shrink;
	shrink->count_objects	= bch2_btree_cache_count;
	shrink->scan_objects	= bch2_btree_cache_scan;
	shrink->seeks		= 8;
	shrink->private_data	= &bc->live[1];
	shrinker_register(shrink);

	return 0;
err:
	return -BCH_ERR_ENOMEM_fs_btree_cache_init;
}

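/*
 * Early initialization: just the lock and the lists, so the btree cache is
 * usable before bch2_fs_btree_cache_init() allocates the hash table and node
 * reserve:
 */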
void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
	mutex_init(&bc->lock);
	for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) {
		bc->live[i].idx = i;
		INIT_LIST_HEAD(&bc->live[i].list);
	}
	INIT_LIST_HEAD(&bc->freeable);
	INIT_LIST_HEAD(&bc->freed_pcpu);
	INIT_LIST_HEAD(&bc->freed_nonpcpu);
}

/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which
 * bch2_btree_cache_cannibalize_lock() will take. This means every time we
 * unlock the root of the btree, we need to release this lock if we have it
 * held.
 */
void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;

	if (bc->alloc_lock == current) {
		trace_and_count(c, btree_cache_cannibalize_unlock, trans);
		bc->alloc_lock = NULL;
		closure_wake_up(&bc->alloc_wait);
	}
}

int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct task_struct *old;

	old = NULL;
	if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current)
		goto success;

	if (!cl) {
		trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
		return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
	}

	closure_wait(&bc->alloc_wait, cl);

	/* Try again, after adding ourselves to waitlist */
	old = NULL;
	if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) {
		/* We raced */
		closure_wake_up(&bc->alloc_wait);
		goto success;
	}

	trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
	return -BCH_ERR_btree_cache_cannibalize_lock_blocked;

success:
	trace_and_count(c, btree_cache_cannibalize_lock, trans);
	return 0;
}

static struct btree *btree_node_cannibalize(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
		list_for_each_entry_reverse(b, &bc->live[i].list, list)
			if (!btree_node_reclaim(c, b, false))
				return b;

	while (1) {
		for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
			list_for_each_entry_reverse(b, &bc->live[i].list, list)
				if (!btree_node_write_and_reclaim(c, b))
					return b;

		/*
		 * Rare case: all nodes were intent-locked.
		 * Just busy-wait.
		 */
		WARN_ONCE(1, "btree cache cannibalize failed\n");
		cond_resched();
	}
}

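/*
 * Allocate (or reuse) memory for a new btree node: check the freed and
 * freeable lists first, then fall back to the allocator, and as a last resort
 * cannibalize another cached node if this thread holds the cannibalize lock:
 */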
struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct list_head *freed = pcpu_read_locks
		? &bc->freed_pcpu
		: &bc->freed_nonpcpu;
	struct btree *b, *b2;
	u64 start_time = local_clock();

	mutex_lock(&bc->lock);

	/*
	 * We never free struct btree itself, just the memory that holds the on
	 * disk node. Check the freed list before allocating a new one:
	 */
	list_for_each_entry(b, freed, list)
		if (!btree_node_reclaim(c, b, false)) {
			list_del_init(&b->list);
			goto got_node;
		}

	b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
	if (!b) {
		mutex_unlock(&bc->lock);
		bch2_trans_unlock(trans);
		b = __btree_node_mem_alloc(c, GFP_KERNEL);
		if (!b)
			goto err;
		mutex_lock(&bc->lock);
	}

	bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);

	BUG_ON(!six_trylock_intent(&b->c.lock));
	BUG_ON(!six_trylock_write(&b->c.lock));
got_node:

	/*
	 * btree_free() doesn't free memory; it sticks the node on the end of
	 * the list. Check if there's any freed nodes there:
	 */
	list_for_each_entry(b2, &bc->freeable, list)
		if (!btree_node_reclaim(c, b2, false)) {
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);
			btree_node_to_freedlist(bc, b2);
			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
			goto got_mem;
		}

	mutex_unlock(&bc->lock);

	if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
		bch2_trans_unlock(trans);
		if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
			goto err;
	}

	mutex_lock(&bc->lock);
	bc->nr_freeable++;
got_mem:
	mutex_unlock(&bc->lock);

	BUG_ON(btree_node_hashed(b));
	BUG_ON(btree_node_dirty(b));
	BUG_ON(btree_node_write_in_flight(b));
out:
	b->flags		= 0;
	b->written		= 0;
	b->nsets		= 0;
	b->sib_u64s[0]		= 0;
	b->sib_u64s[1]		= 0;
	b->whiteout_u64s	= 0;
	bch2_btree_keys_init(b);
	set_btree_node_accessed(b);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
			       start_time);

	int ret = bch2_trans_relock(trans);
	if (unlikely(ret)) {
		bch2_btree_node_to_freelist(c, b);
		return ERR_PTR(ret);
	}

	return b;
err:
	mutex_lock(&bc->lock);

	/* Try to cannibalize another cached btree node: */
	if (bc->alloc_lock == current) {
		b2 = btree_node_cannibalize(c);
		clear_btree_node_just_written(b2);
		bch2_btree_node_hash_remove(bc, b2);

		if (b) {
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);
			btree_node_to_freedlist(bc, b2);
			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
		} else {
			b = b2;
			list_del_init(&b->list);
		}

		mutex_unlock(&bc->lock);

		trace_and_count(c, btree_cache_cannibalize, trans);
		goto out;
	}

	mutex_unlock(&bc->lock);
	return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
}

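/*
 * Read a node that wasn't in the cache from disk: validate the pointer key,
 * allocate memory, insert the node into the hash table, then either read it
 * synchronously or kick off an asynchronous read:
 */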
/* Slowpath, don't want it inlined into btree_iter_traverse() */
static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
						   struct btree_path *path,
						   const struct bkey_i *k,
						   enum btree_id btree_id,
						   unsigned level,
						   enum six_lock_type lock_type,
						   bool sync)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	if (unlikely(level >= BTREE_MAX_DEPTH)) {
		int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u",
						 level, BTREE_MAX_DEPTH);
		return ERR_PTR(ret);
	}

	if (unlikely(!bkey_is_btree_ptr(&k->k))) {
		struct printbuf buf = PRINTBUF;
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));

		int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf);
		printbuf_exit(&buf);
		return ERR_PTR(ret);
	}

	if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) {
		struct printbuf buf = PRINTBUF;
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));

		int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf);
		printbuf_exit(&buf);
		return ERR_PTR(ret);
	}

	/*
	 * Parent node must be locked, else we could read in a btree node that's
	 * been freed:
	 */
	if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
		trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
	}

	b = bch2_btree_node_mem_alloc(trans, level != 0);

	if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
		if (!path)
			return b;

		trans->memory_allocation_failure = true;
		trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
	}

	if (IS_ERR(b))
		return b;

	bkey_copy(&b->key, k);
	if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
		/* raced with another fill: */

		/* mark as unhashed... */
		b->hash_val = 0;

		mutex_lock(&bc->lock);
		list_add(&b->list, &bc->freeable);
		mutex_unlock(&bc->lock);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		return NULL;
	}

	set_btree_node_read_in_flight(b);
	six_unlock_write(&b->c.lock);

	if (path) {
		u32 seq = six_lock_seq(&b->c.lock);

		/* Unlock before doing IO: */
		six_unlock_intent(&b->c.lock);
		bch2_trans_unlock_noassert(trans);

		bch2_btree_node_read(trans, b, sync);

		int ret = bch2_trans_relock(trans);
		if (ret)
			return ERR_PTR(ret);

		if (!sync)
			return NULL;

		if (!six_relock_type(&b->c.lock, lock_type, seq))
			b = NULL;
	} else {
		bch2_btree_node_read(trans, b, sync);
		if (lock_type == SIX_LOCK_read)
			six_lock_downgrade(&b->c.lock);
	}

	return b;
}

static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
{
	struct printbuf buf = PRINTBUF;

	if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
		return;

	prt_printf(&buf,
		   "btree node header doesn't match ptr\n"
		   "btree %s level %u\n"
		   "ptr: ",
		   bch2_btree_id_str(b->c.btree_id), b->c.level);
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	prt_printf(&buf, "\nheader: btree %s level %llu\n"
		   "min ",
		   bch2_btree_id_str(BTREE_NODE_ID(b->data)),
		   BTREE_NODE_LEVEL(b->data));
	bch2_bpos_to_text(&buf, b->data->min_key);

	prt_printf(&buf, "\nmax ");
	bch2_bpos_to_text(&buf, b->data->max_key);

	bch2_fs_topology_error(c, "%s", buf.buf);

	printbuf_exit(&buf);
}

static inline void btree_check_header(struct bch_fs *c, struct btree *b)
{
	if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
	    b->c.level != BTREE_NODE_LEVEL(b->data) ||
	    !bpos_eq(b->data->max_key, b->key.k.p) ||
	    (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
	     !bpos_eq(b->data->min_key,
		      bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
		btree_bad_header(c, b);
}

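/*
 * Slow path of bch2_btree_node_get(): look the node up in the hash table
 * (filling it from disk if it's not there), take the requested lock, then
 * recheck that the node wasn't freed or reused while we waited for the lock:
 */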
static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
					   const struct bkey_i *k, unsigned level,
					   enum six_lock_type lock_type,
					   unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	bool need_relock = false;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		/*
		 * We must have the parent locked to call bch2_btree_node_fill(),
		 * else we could read in a btree node from disk that's been
		 * freed:
		 */
		b = bch2_btree_node_fill(trans, path, k, path->btree_id,
					 level, lock_type, true);
		need_relock = true;

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b))
			return b;
	} else {
		if (btree_node_read_locked(path, level + 1))
			btree_node_unlock(trans, path, level + 1);

		ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			return ERR_PTR(ret);

		BUG_ON(ret);

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.level != level ||
			     race_fault())) {
			six_unlock_type(&b->c.lock, lock_type);
			if (bch2_btree_node_relock(trans, path, level + 1))
				goto retry;

			trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
			return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
		}

		/* avoid atomic set bit if it's not needed: */
		if (!btree_node_accessed(b))
			set_btree_node_accessed(b);
	}

	if (unlikely(btree_node_read_in_flight(b))) {
		u32 seq = six_lock_seq(&b->c.lock);

		six_unlock_type(&b->c.lock, lock_type);
		bch2_trans_unlock(trans);
		need_relock = true;

		bch2_btree_node_wait_on_read(b);

		ret = bch2_trans_relock(trans);
		if (ret)
			return ERR_PTR(ret);

		/*
		 * should_be_locked is not set on this path yet, so we need to
		 * relock it specifically:
		 */
		if (!six_relock_type(&b->c.lock, lock_type, seq))
			goto retry;
	}

	if (unlikely(need_relock)) {
		ret = bch2_trans_relock(trans) ?:
			bch2_btree_path_relock_intent(trans, path);
		if (ret) {
			six_unlock_type(&b->c.lock, lock_type);
			return ERR_PTR(ret);
		}
	}

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-BCH_ERR_btree_node_read_error);
	}

	EBUG_ON(b->c.btree_id != path->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}

/**
 * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * @trans:	btree transaction object
 * @path:	btree_path being traversed
 * @k:		pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
 * @level:	level of btree node being looked up (0 == leaf node)
 * @lock_type:	SIX_LOCK_read or SIX_LOCK_intent
 * @trace_ip:	ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
 *
 * The btree node will have either a read or an intent lock held, depending on
 * @lock_type.
 *
 * Returns: btree node or ERR_PTR()
 */
struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
				  const struct bkey_i *k, unsigned level,
				  enum six_lock_type lock_type,
				  unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree *b;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_node_mem_ptr(k);

	/*
	 * Check b->hash_val _before_ calling btree_node_lock() - this might not
	 * be the node we want anymore, and trying to lock the wrong node could
	 * cause an unnecessary transaction restart:
	 */
	if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
		     !b ||
		     b->hash_val != btree_ptr_hash_val(k)))
		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

	if (btree_node_read_locked(path, level + 1))
		btree_node_unlock(trans, path, level + 1);

	ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		return ERR_PTR(ret);

	BUG_ON(ret);

	if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
		     b->c.level != level ||
		     race_fault())) {
		six_unlock_type(&b->c.lock, lock_type);
		if (bch2_btree_node_relock(trans, path, level + 1))
			return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

		trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
	}

	if (unlikely(btree_node_read_in_flight(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
	}

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-BCH_ERR_btree_node_read_error);
	}

	EBUG_ON(b->c.btree_id != path->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}

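/*
 * Version of bch2_btree_node_get() for callers that don't have a btree_path
 * (e.g. gc and early key updates): takes a plain read lock on the node, and
 * with @nofill only returns nodes that are already in the cache:
 */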
struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
					 const struct bkey_i *k,
					 enum btree_id btree_id,
					 unsigned level,
					 bool nofill)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	if (c->opts.btree_node_mem_ptr_optimization) {
		b = btree_node_mem_ptr(k);
		if (b)
			goto lock_node;
	}
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		if (nofill)
			goto out;

		b = bch2_btree_node_fill(trans, NULL, k, btree_id,
					 level, SIX_LOCK_read, true);

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b) &&
		    !bch2_btree_cache_cannibalize_lock(trans, NULL))
			goto retry;

		if (IS_ERR(b))
			goto out;
	} else {
lock_node:
		ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			return ERR_PTR(ret);

		BUG_ON(ret);

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.btree_id != btree_id ||
			     b->c.level != level)) {
			six_unlock_read(&b->c.lock);
			goto retry;
		}
	}

	/* XXX: waiting on IO with btree locks held: */
	__bch2_btree_node_wait_on_read(b);

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_read(&b->c.lock);
		b = ERR_PTR(-BCH_ERR_btree_node_read_error);
		goto out;
	}

	EBUG_ON(b->c.btree_id != btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);
out:
	bch2_btree_cache_cannibalize_unlock(trans);
	return b;
}

int bch2_btree_node_prefetch(struct btree_trans *trans,
			     struct btree_path *path,
			     const struct bkey_i *k,
			     enum btree_id btree_id, unsigned level)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;

	BUG_ON(path && !btree_node_locked(path, level + 1));
	BUG_ON(level >= BTREE_MAX_DEPTH);

	struct btree *b = btree_cache_find(bc, k);
	if (b)
		return 0;

	b = bch2_btree_node_fill(trans, path, k, btree_id,
				 level, SIX_LOCK_read, false);
	if (!IS_ERR_OR_NULL(b))
		six_unlock_read(&b->c.lock);
	return bch2_trans_relock(trans) ?: PTR_ERR_OR_ZERO(b);
}

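/*
 * Evict a node from the cache: wait for in-flight IO, write it out if dirty,
 * then remove it from the hash table and free its buffers:
 */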
void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	b = btree_cache_find(bc, k);
	if (!b)
		return;

	BUG_ON(b == btree_node_root(trans->c, b));
wait_on_io:
	/* not allowed to wait on io with btree locks held: */

	/* XXX we're called from btree_gc which will be holding other btree
	 * nodes locked
	 */
	__bch2_btree_node_wait_on_read(b);
	__bch2_btree_node_wait_on_write(b);

	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
	if (unlikely(b->hash_val != btree_ptr_hash_val(k)))
		goto out;

	if (btree_node_dirty(b)) {
		__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}

	BUG_ON(btree_node_dirty(b));

	mutex_lock(&bc->lock);
	bch2_btree_node_hash_remove(bc, b);
	btree_node_data_free(c, b);
	mutex_unlock(&bc->lock);
out:
	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
}

const char *bch2_btree_id_str(enum btree_id btree)
{
	return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)";
}

void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree)
{
	if (btree < BTREE_ID_NR)
		prt_str(out, __bch2_btree_ids[btree]);
	else
		prt_printf(out, "(unknown btree %u)", btree);
}

void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
	prt_printf(out, "%s level %u/%u\n  ",
		   bch2_btree_id_str(b->c.btree_id),
		   b->c.level,
		   bch2_btree_id_root(c, b->c.btree_id)->level);
	bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
}

void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
	struct bset_stats stats;

	memset(&stats, 0, sizeof(stats));

	bch2_btree_keys_stats(b, &stats);

	prt_printf(out, "l %u ", b->c.level);
	bch2_bpos_to_text(out, b->data->min_key);
	prt_printf(out, " - ");
	bch2_bpos_to_text(out, b->data->max_key);
	prt_printf(out, ":\n"
		   "  ptrs: ");
	bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
	prt_newline(out);

	prt_printf(out,
		   "  format: ");
	bch2_bkey_format_to_text(out, &b->format);

	prt_printf(out,
		   "  unpack fn len: %u\n"
		   "  bytes used %zu/%zu (%zu%% full)\n"
		   "  sib u64s: %u, %u (merge threshold %u)\n"
		   "  nr packed keys %u\n"
		   "  nr unpacked keys %u\n"
		   "  floats %zu\n"
		   "  failed unpacked %zu\n",
		   b->unpack_fn_len,
		   b->nr.live_u64s * sizeof(u64),
		   btree_buf_bytes(b) - sizeof(struct btree_node),
		   b->nr.live_u64s * 100 / btree_max_u64s(c),
		   b->sib_u64s[0],
		   b->sib_u64s[1],
		   c->btree_foreground_merge_threshold,
		   b->nr.packed_keys,
		   b->nr.unpacked_keys,
		   stats.floats,
		   stats.failed);
}

static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c,
				 const char *label, size_t nr)
{
	prt_printf(out, "%s\t", label);
	prt_human_readable_u64(out, nr * c->opts.btree_node_size);
	prt_printf(out, " (%zu)\n", nr);
}

static const char * const bch2_btree_cache_not_freed_reasons_strs[] = {
#define x(n) #n,
	BCH_BTREE_CACHE_NOT_FREED_REASONS()
#undef x
	NULL
};

void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);

	if (!out->nr_tabstops)
		printbuf_tabstop_push(out, 32);

	prt_btree_cache_line(out, c, "live:", bc->live[0].nr);
	prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr);
	prt_btree_cache_line(out, c, "freeable:", bc->nr_freeable);
	prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty));
	prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
	prt_newline(out);

	for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
		prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->nr_by_btree[i]);

	prt_newline(out);
	prt_printf(out, "freed:\t%zu\n", bc->nr_freed);
	prt_printf(out, "not freed:\n");

	for (unsigned i = 0; i < ARRAY_SIZE(bc->not_freed); i++)
		prt_printf(out, "  %s\t%llu\n",
			   bch2_btree_cache_not_freed_reasons_strs[i], bc->not_freed[i]);
}