// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bbpos.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
#include "trace.h"

#include <linux/prefetch.h>
#include <linux/sched/mm.h>
#include <linux/swap.h>

#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
do {						 \
	if (shrinker_counter)			 \
		bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_##counter]++; \
} while (0)

const char * const bch2_btree_node_flags[] = {
	"typebit",
	"typebit",
	"typebit",
#define x(f)	[BTREE_NODE_##f] = #f,
	BTREE_FLAGS()
#undef x
	NULL
};

void bch2_recalc_btree_reserve(struct bch_fs *c)
{
	unsigned reserve = 16;

	if (!c->btree_roots_known[0].b)
		reserve += 8;

	for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->b)
			reserve += min_t(unsigned, 1, r->b->c.level) * 8;
	}

	c->btree_cache.nr_reserve = reserve;
}

static inline size_t btree_cache_can_free(struct btree_cache_list *list)
{
	struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);

	size_t can_free = list->nr;
	if (!list->idx)
		can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve);
	return can_free;
}

static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(!list_empty(&b->list));

	if (b->c.lock.readers)
		list_add(&b->list, &bc->freed_pcpu);
	else
		list_add(&b->list, &bc->freed_nonpcpu);
}

static void __bch2_btree_node_to_freelist(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(!list_empty(&b->list));
	BUG_ON(!b->data);

	bc->nr_freeable++;
	list_add(&b->list, &bc->freeable);
}

void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
{
	struct btree_cache *bc = &c->btree_cache;

	mutex_lock(&bc->lock);
	__bch2_btree_node_to_freelist(bc, b);
	mutex_unlock(&bc->lock);

	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
}

static void __btree_node_data_free(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(!list_empty(&b->list));
	BUG_ON(btree_node_hashed(b));

	/*
	 * This should really be done in slub/vmalloc, but we're using the
	 * kmalloc_large() path, so we're working around a slub bug by doing
	 * this here:
	 */
	if (b->data)
		mm_account_reclaimed_pages(btree_buf_bytes(b) / PAGE_SIZE);
	if (b->aux_data)
		mm_account_reclaimed_pages(btree_aux_data_bytes(b) / PAGE_SIZE);

	EBUG_ON(btree_node_write_in_flight(b));

	clear_btree_node_just_written(b);

	kvfree(b->data);
	b->data = NULL;
#ifdef __KERNEL__
	kvfree(b->aux_data);
#else
	munmap(b->aux_data, btree_aux_data_bytes(b));
#endif
	b->aux_data = NULL;

	btree_node_to_freedlist(bc, b);
}

static void btree_node_data_free(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(list_empty(&b->list));
	list_del_init(&b->list);
	--bc->nr_freeable;
	__btree_node_data_free(bc, b);
}
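
/*
 * Cached btree nodes are tracked in an rhashtable keyed by the 64-bit hash of
 * the node's key (b->hash_val); the compare callback below only needs to
 * compare the stored hash values.
 */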
static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
				   const void *obj)
{
	const struct btree *b = obj;
	const u64 *v = arg->key;

	return b->hash_val == *v ? 0 : 1;
}

static const struct rhashtable_params bch_btree_cache_params = {
	.head_offset		= offsetof(struct btree, hash),
	.key_offset		= offsetof(struct btree, hash_val),
	.key_len		= sizeof(u64),
	.obj_cmpfn		= bch2_btree_cache_cmp_fn,
	.automatic_shrinking	= true,
};

static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
	BUG_ON(b->data || b->aux_data);

	gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;

	b->data = kvmalloc(btree_buf_bytes(b), gfp);
	if (!b->data)
		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
#ifdef __KERNEL__
	b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
#else
	b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
			   PROT_READ|PROT_WRITE|PROT_EXEC,
			   MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
	if (b->aux_data == MAP_FAILED)
		b->aux_data = NULL;
#endif
	if (!b->aux_data) {
		kvfree(b->data);
		b->data = NULL;
		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
	}

	return 0;
}

static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
{
	struct btree *b;

	b = kzalloc(sizeof(struct btree), gfp);
	if (!b)
		return NULL;

	bkey_btree_ptr_init(&b->key);
	INIT_LIST_HEAD(&b->list);
	INIT_LIST_HEAD(&b->write_blocked);
	b->byte_order = ilog2(c->opts.btree_node_size);
	return b;
}

struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	b = __btree_node_mem_alloc(c, GFP_KERNEL);
	if (!b)
		return NULL;

	if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
		kfree(b);
		return NULL;
	}

	bch2_btree_lock_init(&b->c, 0);

	__bch2_btree_node_to_freelist(bc, b);
	return b;
}

static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b)
{
	struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);

	u64 mask = bc->pinned_nodes_mask[!!b->c.level];

	return ((mask & BIT_ULL(b->c.btree_id)) &&
		bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
		bbpos_cmp(bc->pinned_nodes_end, pos) >= 0);
}
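
/*
 * Pinned nodes are kept on the live[1] list, which has its own shrinker with a
 * higher seeks value, so they are much less likely to be evicted; the current
 * btree root is never moved onto the pinned list.
 */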
void bch2_node_pin(struct bch_fs *c, struct btree *b)
{
	struct btree_cache *bc = &c->btree_cache;

	mutex_lock(&bc->lock);
	if (b != btree_node_root(c, b) && !btree_node_pinned(b)) {
		set_btree_node_pinned(b);
		list_move(&b->list, &bc->live[1].list);
		bc->live[0].nr--;
		bc->live[1].nr++;
	}
	mutex_unlock(&bc->lock);
}

void bch2_btree_cache_unpin(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b, *n;

	mutex_lock(&bc->lock);
	c->btree_cache.pinned_nodes_mask[0] = 0;
	c->btree_cache.pinned_nodes_mask[1] = 0;

	list_for_each_entry_safe(b, n, &bc->live[1].list, list) {
		clear_btree_node_pinned(b);
		list_move(&b->list, &bc->live[0].list);
		bc->live[0].nr++;
		bc->live[1].nr--;
	}

	mutex_unlock(&bc->lock);
}

/* Btree in memory cache - hash table */

void __bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
	lockdep_assert_held(&bc->lock);

	int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
	BUG_ON(ret);

	/* Cause future lookups for this node to fail: */
	b->hash_val = 0;

	if (b->c.btree_id < BTREE_ID_NR)
		--bc->nr_by_btree[b->c.btree_id];
	--bc->live[btree_node_pinned(b)].nr;
	list_del_init(&b->list);
}

void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
	__bch2_btree_node_hash_remove(bc, b);
	__bch2_btree_node_to_freelist(bc, b);
}

int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(!list_empty(&b->list));
	BUG_ON(b->hash_val);

	b->hash_val = btree_ptr_hash_val(&b->key);
	int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash,
						bch_btree_cache_params);
	if (ret)
		return ret;

	if (b->c.btree_id < BTREE_ID_NR)
		bc->nr_by_btree[b->c.btree_id]++;

	bool p = __btree_node_pinned(bc, b);
	mod_bit(BTREE_NODE_pinned, &b->flags, p);

	list_add_tail(&b->list, &bc->live[p].list);
	bc->live[p].nr++;
	return 0;
}

int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
				unsigned level, enum btree_id id)
{
	b->c.level	= level;
	b->c.btree_id	= id;

	mutex_lock(&bc->lock);
	int ret = __bch2_btree_node_hash_insert(bc, b);
	mutex_unlock(&bc->lock);

	return ret;
}

void bch2_btree_node_update_key_early(struct btree_trans *trans,
				      enum btree_id btree, unsigned level,
				      struct bkey_s_c old, struct bkey_i *new)
{
	struct bch_fs *c = trans->c;
	struct btree *b;
	struct bkey_buf tmp;
	int ret;

	bch2_bkey_buf_init(&tmp);
	bch2_bkey_buf_reassemble(&tmp, c, old);

	b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
	if (!IS_ERR_OR_NULL(b)) {
		mutex_lock(&c->btree_cache.lock);

		__bch2_btree_node_hash_remove(&c->btree_cache, b);

		bkey_copy(&b->key, new);
		ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
		BUG_ON(ret);

		mutex_unlock(&c->btree_cache.lock);
		six_unlock_read(&b->c.lock);
	}

	bch2_bkey_buf_exit(&tmp, c);
}

__flatten
static inline struct btree *btree_cache_find(struct btree_cache *bc,
					     const struct bkey_i *k)
{
	u64 v = btree_ptr_hash_val(k);

	return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
}

/*
 * this version is for btree nodes that have already been freed (we're not
 * reaping a real btree node)
 */
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
{
	struct btree_cache *bc = &c->btree_cache;
	int ret = 0;

	lockdep_assert_held(&bc->lock);
wait_on_io:
	if (b->flags & ((1U << BTREE_NODE_dirty)|
			(1U << BTREE_NODE_read_in_flight)|
			(1U << BTREE_NODE_write_in_flight))) {
		if (!flush) {
			if (btree_node_dirty(b))
				BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
			else if (btree_node_read_in_flight(b))
				BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
			else if (btree_node_write_in_flight(b))
				BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
			return -BCH_ERR_ENOMEM_btree_node_reclaim;
		}

		/* XXX: waiting on IO with btree cache lock held */
		bch2_btree_node_wait_on_read(b);
		bch2_btree_node_wait_on_write(b);
	}

	if (!six_trylock_intent(&b->c.lock)) {
		BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
		return -BCH_ERR_ENOMEM_btree_node_reclaim;
	}

	if (!six_trylock_write(&b->c.lock)) {
		BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
		goto out_unlock_intent;
	}

	/* recheck under lock */
	if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
			(1U << BTREE_NODE_write_in_flight))) {
		if (!flush) {
			if (btree_node_read_in_flight(b))
				BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
			else if (btree_node_write_in_flight(b))
				BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
			goto out_unlock;
		}
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}

	if (btree_node_noevict(b)) {
		BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
		goto out_unlock;
	}
	if (btree_node_write_blocked(b)) {
		BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
		goto out_unlock;
	}
	if (btree_node_will_make_reachable(b)) {
		BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
		goto out_unlock;
	}

	if (btree_node_dirty(b)) {
		if (!flush) {
			BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
			goto out_unlock;
		}
		/*
		 * Using the underscore version because we don't want to compact
		 * bsets after the write, since this node is about to be evicted
		 * - unless btree verify mode is enabled, since it runs out of
		 * the post write cleanup:
		 */
		if (bch2_verify_btree_ondisk)
			bch2_btree_node_write(c, b, SIX_LOCK_intent,
					      BTREE_WRITE_cache_reclaim);
		else
			__bch2_btree_node_write(c, b,
						BTREE_WRITE_cache_reclaim);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}
out:
	if (b->hash_val && !ret)
		trace_and_count(c, btree_cache_reap, c, b);
	return ret;
out_unlock:
	six_unlock_write(&b->c.lock);
out_unlock_intent:
	six_unlock_intent(&b->c.lock);
	ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
	goto out;
}

static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
{
	return __btree_node_reclaim(c, b, false, shrinker_counter);
}

static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, true, false);
}
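
/*
 * Shrinker scan callback: walk the freeable list and then the live list for
 * this btree_cache_list, freeing nodes that can be reclaimed; when most of the
 * cache is dirty we also kick off writeback of dirty nodes so that future
 * scans can make progress.
 */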
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
					   struct shrink_control *sc)
{
	struct btree_cache_list *list = shrink->private_data;
	struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
	struct btree *b, *t;
	unsigned long nr = sc->nr_to_scan;
	unsigned long can_free = 0;
	unsigned long freed = 0;
	unsigned long touched = 0;
	unsigned i, flags;
	unsigned long ret = SHRINK_STOP;
	bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4;

	if (bch2_btree_shrinker_disabled)
		return SHRINK_STOP;

	mutex_lock(&bc->lock);
	flags = memalloc_nofs_save();

	/*
	 * It's _really_ critical that we don't free too many btree nodes - we
	 * have to always leave ourselves a reserve. The reserve is how we
	 * guarantee that allocating memory for a new btree node can always
	 * succeed, so that inserting keys into the btree can always succeed and
	 * IO can always make forward progress:
	 */
	can_free = btree_cache_can_free(list);
	nr = min_t(unsigned long, nr, can_free);

	i = 0;
	list_for_each_entry_safe(b, t, &bc->freeable, list) {
		/*
		 * Leave a few nodes on the freeable list, so that a btree split
		 * won't have to hit the system allocator:
		 */
		if (++i <= 3)
			continue;

		touched++;

		if (touched >= nr)
			goto out;

		if (!btree_node_reclaim(c, b, true)) {
			btree_node_data_free(bc, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);
			freed++;
			bc->nr_freed++;
		}
	}
restart:
	list_for_each_entry_safe(b, t, &list->list, list) {
		touched++;

		if (btree_node_accessed(b)) {
			clear_btree_node_accessed(b);
			bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++;
			--touched;
		} else if (!btree_node_reclaim(c, b, true)) {
			__bch2_btree_node_hash_remove(bc, b);
			__btree_node_data_free(bc, b);

			freed++;
			bc->nr_freed++;

			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);

			if (freed == nr)
				goto out_rotate;
		} else if (trigger_writes &&
			   btree_node_dirty(b) &&
			   !btree_node_will_make_reachable(b) &&
			   !btree_node_write_blocked(b) &&
			   six_trylock_read(&b->c.lock)) {
			list_move(&list->list, &b->list);
			mutex_unlock(&bc->lock);
			__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
			six_unlock_read(&b->c.lock);
			if (touched >= nr)
				goto out_nounlock;
			mutex_lock(&bc->lock);
			goto restart;
		}

		if (touched >= nr)
			break;
	}
out_rotate:
	if (&t->list != &list->list)
		list_move_tail(&list->list, &t->list);
out:
	mutex_unlock(&bc->lock);
out_nounlock:
	ret = freed;
	memalloc_nofs_restore(flags);
	trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
	return ret;
}
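
/*
 * Shrinker count callback: report how many nodes could be freed; for the
 * unpinned list this excludes the reserve we always keep for forward progress.
 */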
static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
					    struct shrink_control *sc)
{
	struct btree_cache_list *list = shrink->private_data;

	if (bch2_btree_shrinker_disabled)
		return 0;

	return btree_cache_can_free(list);
}

void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b, *t;
	unsigned long flags;

	shrinker_free(bc->live[1].shrink);
	shrinker_free(bc->live[0].shrink);

	/* vfree() can allocate memory: */
	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

	if (c->verify_data)
		list_move(&c->verify_data->list, &bc->live[0].list);

	kvfree(c->verify_ondisk);

	for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->b)
			list_add(&r->b->list, &bc->live[0].list);
	}

	list_for_each_entry_safe(b, t, &bc->live[1].list, list)
		bch2_btree_node_hash_remove(bc, b);
	list_for_each_entry_safe(b, t, &bc->live[0].list, list)
		bch2_btree_node_hash_remove(bc, b);

	list_for_each_entry_safe(b, t, &bc->freeable, list) {
		BUG_ON(btree_node_read_in_flight(b) ||
		       btree_node_write_in_flight(b));

		btree_node_data_free(bc, b);
	}

	BUG_ON(!bch2_journal_error(&c->journal) &&
	       atomic_long_read(&c->btree_cache.nr_dirty));

	list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);

	list_for_each_entry_safe(b, t, &bc->freed_nonpcpu, list) {
		list_del(&b->list);
		six_lock_exit(&b->c.lock);
		kfree(b);
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);

	for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
		BUG_ON(bc->nr_by_btree[i]);
	BUG_ON(bc->live[0].nr);
	BUG_ON(bc->live[1].nr);
	BUG_ON(bc->nr_freeable);

	if (bc->table_init_done)
		rhashtable_destroy(&bc->table);
}
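
/*
 * Set up the btree node cache at filesystem start: initialize the hash table,
 * preallocate the reserve of btree nodes, and register the shrinkers for the
 * unpinned and pinned lists.
 */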
int bch2_fs_btree_cache_init(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct shrinker *shrink;
	unsigned i;
	int ret = 0;

	ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
	if (ret)
		goto err;

	bc->table_init_done = true;

	bch2_recalc_btree_reserve(c);

	for (i = 0; i < bc->nr_reserve; i++)
		if (!__bch2_btree_node_mem_alloc(c))
			goto err;

	list_splice_init(&bc->live[0].list, &bc->freeable);

	mutex_init(&c->verify_lock);

	shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
	if (!shrink)
		goto err;
	bc->live[0].shrink	= shrink;
	shrink->count_objects	= bch2_btree_cache_count;
	shrink->scan_objects	= bch2_btree_cache_scan;
	shrink->seeks		= 2;
	shrink->private_data	= &bc->live[0];
	shrinker_register(shrink);

	shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name);
	if (!shrink)
		goto err;
	bc->live[1].shrink	= shrink;
	shrink->count_objects	= bch2_btree_cache_count;
	shrink->scan_objects	= bch2_btree_cache_scan;
	shrink->seeks		= 8;
	shrink->private_data	= &bc->live[1];
	shrinker_register(shrink);

	return 0;
err:
	return -BCH_ERR_ENOMEM_fs_btree_cache_init;
}

void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
	mutex_init(&bc->lock);
	for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) {
		bc->live[i].idx = i;
		INIT_LIST_HEAD(&bc->live[i].list);
	}
	INIT_LIST_HEAD(&bc->freeable);
	INIT_LIST_HEAD(&bc->freed_pcpu);
	INIT_LIST_HEAD(&bc->freed_nonpcpu);
}

/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which a
 * cannibalize_bucket() will take. This means every time we unlock the root of
 * the btree, we need to release this lock if we have it held.
 */
void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;

	if (bc->alloc_lock == current) {
		trace_and_count(c, btree_cache_cannibalize_unlock, trans);
		bc->alloc_lock = NULL;
		closure_wake_up(&bc->alloc_wait);
	}
}
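
/*
 * Take the cannibalize lock: if another thread holds it and @cl is non-NULL we
 * add ourselves to the waitlist and return
 * -BCH_ERR_btree_cache_cannibalize_lock_blocked, so the caller can retry once
 * the closure is woken; with a NULL @cl we fail immediately.
 */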
int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct task_struct *old;

	old = NULL;
	if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current)
		goto success;

	if (!cl) {
		trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
		return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
	}

	closure_wait(&bc->alloc_wait, cl);

	/* Try again, after adding ourselves to waitlist */
	old = NULL;
	if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) {
		/* We raced */
		closure_wake_up(&bc->alloc_wait);
		goto success;
	}

	trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
	return -BCH_ERR_btree_cache_cannibalize_lock_blocked;

success:
	trace_and_count(c, btree_cache_cannibalize_lock, trans);
	return 0;
}

static struct btree *btree_node_cannibalize(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
		list_for_each_entry_reverse(b, &bc->live[i].list, list)
			if (!btree_node_reclaim(c, b, false))
				return b;

	while (1) {
		for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
			list_for_each_entry_reverse(b, &bc->live[i].list, list)
				if (!btree_node_write_and_reclaim(c, b))
					return b;

		/*
		 * Rare case: all nodes were intent-locked.
		 * Just busy-wait.
		 */
		WARN_ONCE(1, "btree cache cannibalize failed\n");
		cond_resched();
	}
}
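
/*
 * Allocate a btree node for the cache: reuse a struct btree from the freed
 * list if possible, then try to take memory from the freeable list, then fall
 * back to allocating; if allocation fails and we hold the cannibalize lock,
 * steal memory from another cached node.
 */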
struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct list_head *freed = pcpu_read_locks
		? &bc->freed_pcpu
		: &bc->freed_nonpcpu;
	struct btree *b, *b2;
	u64 start_time = local_clock();

	mutex_lock(&bc->lock);

	/*
	 * We never free struct btree itself, just the memory that holds the on
	 * disk node. Check the freed list before allocating a new one:
	 */
	list_for_each_entry(b, freed, list)
		if (!btree_node_reclaim(c, b, false)) {
			list_del_init(&b->list);
			goto got_node;
		}

	b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
	if (!b) {
		mutex_unlock(&bc->lock);
		bch2_trans_unlock(trans);
		b = __btree_node_mem_alloc(c, GFP_KERNEL);
		if (!b)
			goto err;
		mutex_lock(&bc->lock);
	}

	bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);

	BUG_ON(!six_trylock_intent(&b->c.lock));
	BUG_ON(!six_trylock_write(&b->c.lock));

got_node:
	/*
	 * btree_free() doesn't free memory; it sticks the node on the end of
	 * the list. Check if there are any freed nodes there:
	 */
	list_for_each_entry(b2, &bc->freeable, list)
		if (!btree_node_reclaim(c, b2, false)) {
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);

			list_del_init(&b2->list);
			--bc->nr_freeable;
			btree_node_to_freedlist(bc, b2);
			mutex_unlock(&bc->lock);

			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
			goto got_mem;
		}

	mutex_unlock(&bc->lock);

	if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
		bch2_trans_unlock(trans);
		if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
			goto err;
	}

got_mem:
	BUG_ON(!list_empty(&b->list));
	BUG_ON(btree_node_hashed(b));
	BUG_ON(btree_node_dirty(b));
	BUG_ON(btree_node_write_in_flight(b));
out:
	b->flags		= 0;
	b->written		= 0;
	b->nsets		= 0;
	b->sib_u64s[0]		= 0;
	b->sib_u64s[1]		= 0;
	b->whiteout_u64s	= 0;
	bch2_btree_keys_init(b);
	set_btree_node_accessed(b);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
			       start_time);

	int ret = bch2_trans_relock(trans);
	if (unlikely(ret)) {
		bch2_btree_node_to_freelist(c, b);
		return ERR_PTR(ret);
	}

	return b;
err:
	mutex_lock(&bc->lock);

	/* Try to cannibalize another cached btree node: */
	if (bc->alloc_lock == current) {
		b2 = btree_node_cannibalize(c);
		clear_btree_node_just_written(b2);
		__bch2_btree_node_hash_remove(bc, b2);

		if (b) {
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);
			btree_node_to_freedlist(bc, b2);
			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
		} else {
			b = b2;
		}

		BUG_ON(!list_empty(&b->list));
		mutex_unlock(&bc->lock);

		trace_and_count(c, btree_cache_cannibalize, trans);
		goto out;
	}

	mutex_unlock(&bc->lock);
	return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
}
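
/*
 * Allocate a cache node for @k, insert it into the hash table and kick off the
 * read; returns NULL if we raced with another thread filling in the same node.
 */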
/* Slowpath, don't want it inlined into btree_iter_traverse() */
static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
						   struct btree_path *path,
						   const struct bkey_i *k,
						   enum btree_id btree_id,
						   unsigned level,
						   enum six_lock_type lock_type,
						   bool sync)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	if (unlikely(level >= BTREE_MAX_DEPTH)) {
		int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u",
						 level, BTREE_MAX_DEPTH);
		return ERR_PTR(ret);
	}

	if (unlikely(!bkey_is_btree_ptr(&k->k))) {
		struct printbuf buf = PRINTBUF;
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));

		int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf);
		printbuf_exit(&buf);
		return ERR_PTR(ret);
	}

	if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) {
		struct printbuf buf = PRINTBUF;
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));

		int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf);
		printbuf_exit(&buf);
		return ERR_PTR(ret);
	}

	/*
	 * Parent node must be locked, else we could read in a btree node that's
	 * been freed:
	 */
	if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
		trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
	}

	b = bch2_btree_node_mem_alloc(trans, level != 0);

	if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
		if (!path)
			return b;

		trans->memory_allocation_failure = true;
		trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
	}

	if (IS_ERR(b))
		return b;

	bkey_copy(&b->key, k);
	if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
		/* raced with another fill: */

		/* mark as unhashed... */
		b->hash_val = 0;

		mutex_lock(&bc->lock);
		__bch2_btree_node_to_freelist(bc, b);
		mutex_unlock(&bc->lock);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		return NULL;
	}

	set_btree_node_read_in_flight(b);
	six_unlock_write(&b->c.lock);

	if (path) {
		u32 seq = six_lock_seq(&b->c.lock);

		/* Unlock before doing IO: */
		six_unlock_intent(&b->c.lock);
		bch2_trans_unlock_noassert(trans);

		bch2_btree_node_read(trans, b, sync);

		int ret = bch2_trans_relock(trans);
		if (ret)
			return ERR_PTR(ret);

		if (!sync)
			return NULL;

		if (!six_relock_type(&b->c.lock, lock_type, seq))
			b = NULL;
	} else {
		bch2_btree_node_read(trans, b, sync);
		if (lock_type == SIX_LOCK_read)
			six_lock_downgrade(&b->c.lock);
	}

	return b;
}

static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
{
	struct printbuf buf = PRINTBUF;

	if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
		return;

	prt_printf(&buf,
		   "btree node header doesn't match ptr: ");
	bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
	prt_str(&buf, "\nptr: ");
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	prt_str(&buf, "\nheader: ");
	bch2_btree_id_level_to_text(&buf, BTREE_NODE_ID(b->data), BTREE_NODE_LEVEL(b->data));
	prt_str(&buf, "\nmin ");
	bch2_bpos_to_text(&buf, b->data->min_key);

	prt_printf(&buf, "\nmax ");
	bch2_bpos_to_text(&buf, b->data->max_key);

	bch2_fs_topology_error(c, "%s", buf.buf);

	printbuf_exit(&buf);
}

static inline void btree_check_header(struct bch_fs *c, struct btree *b)
{
	if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
	    b->c.level != BTREE_NODE_LEVEL(b->data) ||
	    !bpos_eq(b->data->max_key, b->key.k.p) ||
	    (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
	     !bpos_eq(b->data->min_key,
		      bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
		btree_bad_header(c, b);
}
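
/*
 * Slow path of bch2_btree_node_get(): look the node up in the hash table, fill
 * it from disk if it isn't there, lock it, and retry if we raced with the node
 * being evicted or reused.
 */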
static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
					   const struct bkey_i *k, unsigned level,
					   enum six_lock_type lock_type,
					   unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	bool need_relock = false;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		/*
		 * We must have the parent locked to call bch2_btree_node_fill(),
		 * else we could read in a btree node from disk that's been
		 * freed:
		 */
		b = bch2_btree_node_fill(trans, path, k, path->btree_id,
					 level, lock_type, true);
		need_relock = true;

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b))
			return b;
	} else {
		if (btree_node_read_locked(path, level + 1))
			btree_node_unlock(trans, path, level + 1);

		ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			return ERR_PTR(ret);

		BUG_ON(ret);

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.level != level ||
			     race_fault())) {
			six_unlock_type(&b->c.lock, lock_type);
			if (bch2_btree_node_relock(trans, path, level + 1))
				goto retry;

			trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
			return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
		}

		/* avoid atomic set bit if it's not needed: */
		if (!btree_node_accessed(b))
			set_btree_node_accessed(b);
	}

	if (unlikely(btree_node_read_in_flight(b))) {
		u32 seq = six_lock_seq(&b->c.lock);

		six_unlock_type(&b->c.lock, lock_type);
		bch2_trans_unlock(trans);
		need_relock = true;

		bch2_btree_node_wait_on_read(b);

		ret = bch2_trans_relock(trans);
		if (ret)
			return ERR_PTR(ret);

		/*
		 * should_be_locked is not set on this path yet, so we need to
		 * relock it specifically:
		 */
		if (!six_relock_type(&b->c.lock, lock_type, seq))
			goto retry;
	}

	if (unlikely(need_relock)) {
		ret = bch2_trans_relock(trans) ?:
			bch2_btree_path_relock_intent(trans, path);
		if (ret) {
			six_unlock_type(&b->c.lock, lock_type);
			return ERR_PTR(ret);
		}
	}

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
	}

	EBUG_ON(b->c.btree_id != path->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}

/**
 * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * @trans:	btree transaction object
 * @path:	btree_path being traversed
 * @k:		pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
 * @level:	level of btree node being looked up (0 == leaf node)
 * @lock_type:	SIX_LOCK_read or SIX_LOCK_intent
 * @trace_ip:	ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
 *
 * The btree node will have either a read or an intent lock held, depending on
 * @lock_type.
 *
 * Returns: btree node or ERR_PTR()
 */
struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
				  const struct bkey_i *k, unsigned level,
				  enum six_lock_type lock_type,
				  unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree *b;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_node_mem_ptr(k);

	/*
	 * Check b->hash_val _before_ calling btree_node_lock() - this might not
	 * be the node we want anymore, and trying to lock the wrong node could
	 * cause an unnecessary transaction restart:
	 */
	if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
		     !b ||
		     b->hash_val != btree_ptr_hash_val(k)))
		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

	if (btree_node_read_locked(path, level + 1))
		btree_node_unlock(trans, path, level + 1);

	ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		return ERR_PTR(ret);

	BUG_ON(ret);

	if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
		     b->c.level != level ||
		     race_fault())) {
		six_unlock_type(&b->c.lock, lock_type);
		if (bch2_btree_node_relock(trans, path, level + 1))
			return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

		trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
	}

	if (unlikely(btree_node_read_in_flight(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
	}

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
	}

	EBUG_ON(b->c.btree_id != path->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}
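
/*
 * Illustrative only (not taken from a real caller): descending from a locked
 * parent during path traversal would look roughly like
 *
 *	struct btree *b = bch2_btree_node_get(trans, path, child_ptr,
 *					      level, SIX_LOCK_read, _THIS_IP_);
 *	if (IS_ERR(b))
 *		return PTR_ERR(b);
 *
 * where child_ptr is assumed to be the bkey_i for the child node taken from
 * the parent, and the PTR_ERR() may be a transaction restart that the caller
 * must handle by unwinding and retrying.
 */
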
struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
					 const struct bkey_i *k,
					 enum btree_id btree_id,
					 unsigned level,
					 bool nofill)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	if (c->opts.btree_node_mem_ptr_optimization) {
		b = btree_node_mem_ptr(k);
		if (b)
			goto lock_node;
	}
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		if (nofill)
			goto out;

		b = bch2_btree_node_fill(trans, NULL, k, btree_id,
					 level, SIX_LOCK_read, true);

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b) &&
		    !bch2_btree_cache_cannibalize_lock(trans, NULL))
			goto retry;

		if (IS_ERR(b))
			goto out;
	} else {
lock_node:
		ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			return ERR_PTR(ret);

		BUG_ON(ret);

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.btree_id != btree_id ||
			     b->c.level != level)) {
			six_unlock_read(&b->c.lock);
			goto retry;
		}
	}

	/* XXX: waiting on IO with btree locks held: */
	__bch2_btree_node_wait_on_read(b);

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_read(&b->c.lock);
		b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
		goto out;
	}

	EBUG_ON(b->c.btree_id != btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);
out:
	bch2_btree_cache_cannibalize_unlock(trans);
	return b;
}
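
/*
 * Start reading a btree node into the cache without blocking on the result:
 * if it isn't already cached, kick off an asynchronous read and drop the lock
 * immediately.
 */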
int bch2_btree_node_prefetch(struct btree_trans *trans,
			     struct btree_path *path,
			     const struct bkey_i *k,
			     enum btree_id btree_id, unsigned level)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;

	BUG_ON(path && !btree_node_locked(path, level + 1));
	BUG_ON(level >= BTREE_MAX_DEPTH);

	struct btree *b = btree_cache_find(bc, k);
	if (b)
		return 0;

	b = bch2_btree_node_fill(trans, path, k, btree_id,
				 level, SIX_LOCK_read, false);
	int ret = PTR_ERR_OR_ZERO(b);
	if (ret)
		return ret;
	if (b)
		six_unlock_read(&b->c.lock);
	return 0;
}

void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	b = btree_cache_find(bc, k);
	if (!b)
		return;

	BUG_ON(b == btree_node_root(trans->c, b));
wait_on_io:
	/* not allowed to wait on io with btree locks held: */

	/* XXX we're called from btree_gc which will be holding other btree
	 * nodes locked
	 */
	__bch2_btree_node_wait_on_read(b);
	__bch2_btree_node_wait_on_write(b);

	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
	if (unlikely(b->hash_val != btree_ptr_hash_val(k)))
		goto out;

	if (btree_node_dirty(b)) {
		__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}

	BUG_ON(btree_node_dirty(b));

	mutex_lock(&bc->lock);
	bch2_btree_node_hash_remove(bc, b);
	btree_node_data_free(bc, b);
	mutex_unlock(&bc->lock);
out:
	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
}
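
/* Text output, for error messages and debug tools */
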
const char *bch2_btree_id_str(enum btree_id btree)
{
	return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)";
}

void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree)
{
	if (btree < BTREE_ID_NR)
		prt_str(out, __bch2_btree_ids[btree]);
	else
		prt_printf(out, "(unknown btree %u)", btree);
}

void bch2_btree_id_level_to_text(struct printbuf *out, enum btree_id btree, unsigned level)
{
	prt_str(out, "btree=");
	bch2_btree_id_to_text(out, btree);
	prt_printf(out, " level=%u", level);
}

void __bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c,
			      enum btree_id btree, unsigned level, struct bkey_s_c k)
{
	bch2_btree_id_to_text(out, btree);
	prt_printf(out, " level %u/", level);
	struct btree_root *r = bch2_btree_id_root(c, btree);
	if (r)
		prt_printf(out, "%u", r->level);
	else
		prt_printf(out, "(unknown)");
	prt_printf(out, "\n ");

	bch2_bkey_val_to_text(out, c, k);
}

void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
	__bch2_btree_pos_to_text(out, c, b->c.btree_id, b->c.level, bkey_i_to_s_c(&b->key));
}

void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
	struct bset_stats stats;

	memset(&stats, 0, sizeof(stats));

	bch2_btree_keys_stats(b, &stats);

	prt_printf(out, "l %u ", b->c.level);
	bch2_bpos_to_text(out, b->data->min_key);
	prt_printf(out, " - ");
	bch2_bpos_to_text(out, b->data->max_key);
	prt_printf(out, ":\n"
		   " ptrs: ");
	bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
	prt_newline(out);

	prt_printf(out,
		   " format: ");
	bch2_bkey_format_to_text(out, &b->format);

	prt_printf(out,
		   " unpack fn len: %u\n"
		   " bytes used %zu/%zu (%zu%% full)\n"
		   " sib u64s: %u, %u (merge threshold %u)\n"
		   " nr packed keys %u\n"
		   " nr unpacked keys %u\n"
		   " floats %zu\n"
		   " failed unpacked %zu\n",
		   b->unpack_fn_len,
		   b->nr.live_u64s * sizeof(u64),
		   btree_buf_bytes(b) - sizeof(struct btree_node),
		   b->nr.live_u64s * 100 / btree_max_u64s(c),
		   b->sib_u64s[0],
		   b->sib_u64s[1],
		   c->btree_foreground_merge_threshold,
		   b->nr.packed_keys,
		   b->nr.unpacked_keys,
		   stats.floats,
		   stats.failed);
}

static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c,
				 const char *label, size_t nr)
{
	prt_printf(out, "%s\t", label);
	prt_human_readable_u64(out, nr * c->opts.btree_node_size);
	prt_printf(out, " (%zu)\n", nr);
}

static const char * const bch2_btree_cache_not_freed_reasons_strs[] = {
#define x(n) #n,
	BCH_BTREE_CACHE_NOT_FREED_REASONS()
#undef x
	NULL
};
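
/*
 * Dump btree cache statistics: overall counts and sizes per list, per-btree
 * usage, and counts of why the shrinker was unable to free nodes.
 */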
prt_printf(out, "\t"); 1502 prt_human_readable_u64(out, bc->nr_by_btree[i] * c->opts.btree_node_size); 1503 prt_printf(out, " (%zu)\n", bc->nr_by_btree[i]); 1504 } 1505 1506 prt_newline(out); 1507 prt_printf(out, "freed:\t%zu\n", bc->nr_freed); 1508 prt_printf(out, "not freed:\n"); 1509 1510 for (unsigned i = 0; i < ARRAY_SIZE(bc->not_freed); i++) 1511 prt_printf(out, " %s\t%llu\n", 1512 bch2_btree_cache_not_freed_reasons_strs[i], bc->not_freed[i]); 1513 } 1514