// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bbpos.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
#include "trace.h"

#include <linux/prefetch.h>
#include <linux/sched/mm.h>
#include <linux/swap.h>

const char * const bch2_btree_node_flags[] = {
	"typebit",
	"typebit",
	"typebit",
#define x(f)	[BTREE_NODE_##f] = #f,
	BTREE_FLAGS()
#undef x
	NULL
};

void bch2_recalc_btree_reserve(struct bch_fs *c)
{
	unsigned reserve = 16;

	if (!c->btree_roots_known[0].b)
		reserve += 8;

	for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->b)
			reserve += min_t(unsigned, 1, r->b->c.level) * 8;
	}

	c->btree_cache.nr_reserve = reserve;
}

static inline size_t btree_cache_can_free(struct btree_cache_list *list)
{
	struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);

	size_t can_free = list->nr;
	if (!list->idx)
		can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve);
	return can_free;
}

static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(!list_empty(&b->list));

	if (b->c.lock.readers)
		list_add(&b->list, &bc->freed_pcpu);
	else
		list_add(&b->list, &bc->freed_nonpcpu);
}

static void __bch2_btree_node_to_freelist(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(!list_empty(&b->list));
	BUG_ON(!b->data);

	bc->nr_freeable++;
	list_add(&b->list, &bc->freeable);
}

void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
{
	struct btree_cache *bc = &c->btree_cache;

	mutex_lock(&bc->lock);
	__bch2_btree_node_to_freelist(bc, b);
	mutex_unlock(&bc->lock);

	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
}

void __btree_node_data_free(struct btree *b)
{
	BUG_ON(!list_empty(&b->list));
	BUG_ON(btree_node_hashed(b));

	/*
	 * This should really be done in slub/vmalloc, but we're using the
	 * kmalloc_large() path, so we're working around a slub bug by doing
	 * this here:
	 */
	if (b->data)
		mm_account_reclaimed_pages(btree_buf_bytes(b) / PAGE_SIZE);
	if (b->aux_data)
		mm_account_reclaimed_pages(btree_aux_data_bytes(b) / PAGE_SIZE);

	EBUG_ON(btree_node_write_in_flight(b));

	clear_btree_node_just_written(b);

	kvfree(b->data);
	b->data = NULL;
#ifdef __KERNEL__
	kvfree(b->aux_data);
#else
	munmap(b->aux_data, btree_aux_data_bytes(b));
#endif
	b->aux_data = NULL;
}

static void btree_node_data_free(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(list_empty(&b->list));
	list_del_init(&b->list);

	__btree_node_data_free(b);

	--bc->nr_freeable;
	btree_node_to_freedlist(bc, b);
}

static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
				   const void *obj)
{
	const struct btree *b = obj;
	const u64 *v = arg->key;

	return b->hash_val == *v ? 0 : 1;
}

static const struct rhashtable_params bch_btree_cache_params = {
	.head_offset		= offsetof(struct btree, hash),
	.key_offset		= offsetof(struct btree, hash_val),
	.key_len		= sizeof(u64),
	.obj_cmpfn		= bch2_btree_cache_cmp_fn,
	.automatic_shrinking	= true,
};

static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
	BUG_ON(b->data || b->aux_data);

	gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;

	b->data = kvmalloc(btree_buf_bytes(b), gfp);
	if (!b->data)
		return bch_err_throw(c, ENOMEM_btree_node_mem_alloc);
#ifdef __KERNEL__
	b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
#else
	b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
			   PROT_READ|PROT_WRITE|PROT_EXEC,
			   MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
	if (b->aux_data == MAP_FAILED)
		b->aux_data = NULL;
#endif
	if (!b->aux_data) {
		kvfree(b->data);
		b->data = NULL;
		return bch_err_throw(c, ENOMEM_btree_node_mem_alloc);
	}

	return 0;
}

static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
{
	struct btree *b;

	b = kzalloc(sizeof(struct btree), gfp);
	if (!b)
		return NULL;

	bkey_btree_ptr_init(&b->key);
	INIT_LIST_HEAD(&b->list);
	INIT_LIST_HEAD(&b->write_blocked);
	b->byte_order = ilog2(c->opts.btree_node_size);
	return b;
}

struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
{
	struct btree *b = __btree_node_mem_alloc(c, GFP_KERNEL);
	if (!b)
		return NULL;

	if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
		kfree(b);
		return NULL;
	}

	bch2_btree_lock_init(&b->c, 0, GFP_KERNEL);
	return b;
}

static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b)
{
	struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);

	u64 mask = bc->pinned_nodes_mask[!!b->c.level];

	return ((mask & BIT_ULL(b->c.btree_id)) &&
		bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
		bbpos_cmp(bc->pinned_nodes_end, pos) >= 0);
}

void bch2_node_pin(struct bch_fs *c, struct btree *b)
{
	struct btree_cache *bc = &c->btree_cache;

	mutex_lock(&bc->lock);
	if (b != btree_node_root(c, b) && !btree_node_pinned(b)) {
		set_btree_node_pinned(b);
		list_move(&b->list, &bc->live[1].list);
		bc->live[0].nr--;
		bc->live[1].nr++;
	}
	mutex_unlock(&bc->lock);
}

void bch2_btree_cache_unpin(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b, *n;

	mutex_lock(&bc->lock);
	c->btree_cache.pinned_nodes_mask[0] = 0;
	c->btree_cache.pinned_nodes_mask[1] = 0;

	list_for_each_entry_safe(b, n, &bc->live[1].list, list) {
		clear_btree_node_pinned(b);
		list_move(&b->list, &bc->live[0].list);
		bc->live[0].nr++;
		bc->live[1].nr--;
	}

	mutex_unlock(&bc->lock);
}

/* Btree in memory cache - hash table */

void __bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
	lockdep_assert_held(&bc->lock);

	int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
	BUG_ON(ret);

	/* Cause future lookups for this node to fail: */
	b->hash_val = 0;

	if (b->c.btree_id < BTREE_ID_NR)
		--bc->nr_by_btree[b->c.btree_id];
	--bc->live[btree_node_pinned(b)].nr;
	list_del_init(&b->list);
}

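/*
 * Remove a node from the hash table and put its memory back on the freeable
 * list; caller must hold bc->lock.
 */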
void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
	__bch2_btree_node_hash_remove(bc, b);
	__bch2_btree_node_to_freelist(bc, b);
}

int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(!list_empty(&b->list));
	BUG_ON(b->hash_val);

	b->hash_val = btree_ptr_hash_val(&b->key);
	int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash,
						bch_btree_cache_params);
	if (ret)
		return ret;

	if (b->c.btree_id < BTREE_ID_NR)
		bc->nr_by_btree[b->c.btree_id]++;

	bool p = __btree_node_pinned(bc, b);
	mod_bit(BTREE_NODE_pinned, &b->flags, p);

	list_add_tail(&b->list, &bc->live[p].list);
	bc->live[p].nr++;
	return 0;
}

int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
				unsigned level, enum btree_id id)
{
	b->c.level	= level;
	b->c.btree_id	= id;

	mutex_lock(&bc->lock);
	int ret = __bch2_btree_node_hash_insert(bc, b);
	mutex_unlock(&bc->lock);

	return ret;
}

void bch2_btree_node_update_key_early(struct btree_trans *trans,
				      enum btree_id btree, unsigned level,
				      struct bkey_s_c old, struct bkey_i *new)
{
	struct bch_fs *c = trans->c;
	struct btree *b;
	struct bkey_buf tmp;
	int ret;

	bch2_bkey_buf_init(&tmp);
	bch2_bkey_buf_reassemble(&tmp, c, old);

	b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
	if (!IS_ERR_OR_NULL(b)) {
		mutex_lock(&c->btree_cache.lock);

		__bch2_btree_node_hash_remove(&c->btree_cache, b);

		bkey_copy(&b->key, new);
		ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
		BUG_ON(ret);

		mutex_unlock(&c->btree_cache.lock);
		six_unlock_read(&b->c.lock);
	}

	bch2_bkey_buf_exit(&tmp, c);
}

__flatten
static inline struct btree *btree_cache_find(struct btree_cache *bc,
					     const struct bkey_i *k)
{
	u64 v = btree_ptr_hash_val(k);

	return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
}

static int __btree_node_reclaim_checks(struct bch_fs *c, struct btree *b,
				       bool flush, bool locked)
{
	struct btree_cache *bc = &c->btree_cache;

	lockdep_assert_held(&bc->lock);

	if (btree_node_noevict(b)) {
		bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_noevict]++;
		return bch_err_throw(c, ENOMEM_btree_node_reclaim);
	}
	if (btree_node_write_blocked(b)) {
		bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_blocked]++;
		return bch_err_throw(c, ENOMEM_btree_node_reclaim);
	}
	if (btree_node_will_make_reachable(b)) {
		bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_will_make_reachable]++;
		return bch_err_throw(c, ENOMEM_btree_node_reclaim);
	}

	if (btree_node_dirty(b)) {
		if (!flush) {
			bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_dirty]++;
			return bch_err_throw(c, ENOMEM_btree_node_reclaim);
		}

		if (locked) {
			/*
			 * Using the underscore version because we don't want to compact
			 * bsets after the write, since this node is about to be evicted
			 * - unless btree verify mode is enabled, since it runs out of
			 * the post write cleanup:
			 */
			if (static_branch_unlikely(&bch2_verify_btree_ondisk))
				bch2_btree_node_write(c, b, SIX_LOCK_intent,
						      BTREE_WRITE_cache_reclaim);
			else
				__bch2_btree_node_write(c, b,
							BTREE_WRITE_cache_reclaim);
		}
	}

	if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
			(1U << BTREE_NODE_write_in_flight))) {
		if (!flush) {
			if (btree_node_read_in_flight(b))
				bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_read_in_flight]++;
			else if (btree_node_write_in_flight(b))
				bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_in_flight]++;
			return bch_err_throw(c, ENOMEM_btree_node_reclaim);
		}

		if (locked)
			return -EINTR;

		/* XXX: waiting on IO with btree cache lock held */
		bch2_btree_node_wait_on_read(b);
		bch2_btree_node_wait_on_write(b);
	}

	return 0;
}

/*
 * this version is for btree nodes that have already been freed (we're not
 * reaping a real btree node)
 */
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
{
	struct btree_cache *bc = &c->btree_cache;
	int ret = 0;

	lockdep_assert_held(&bc->lock);
retry_unlocked:
	ret = __btree_node_reclaim_checks(c, b, flush, false);
	if (ret)
		return ret;

	if (!six_trylock_intent(&b->c.lock)) {
		bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_intent]++;
		return bch_err_throw(c, ENOMEM_btree_node_reclaim);
	}

	if (!six_trylock_write(&b->c.lock)) {
		bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_write]++;
		six_unlock_intent(&b->c.lock);
		return bch_err_throw(c, ENOMEM_btree_node_reclaim);
	}

	/* recheck under lock */
	ret = __btree_node_reclaim_checks(c, b, flush, true);
	if (ret) {
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		if (ret == -EINTR)
			goto retry_unlocked;
		return ret;
	}

	if (b->hash_val && !ret)
		trace_and_count(c, btree_cache_reap, c, b);
	return 0;
}

static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, false);
}

static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, true);
}

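/*
 * Shrinker scan callback: try to free up to sc->nr_to_scan nodes, first from
 * the freeable list and then from the live list, skipping nodes that were
 * recently accessed or that can't currently be reclaimed. May also trigger
 * writeout of dirty nodes when too much of the cache is dirty.
 */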
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
					   struct shrink_control *sc)
{
	struct btree_cache_list *list = shrink->private_data;
	struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
	struct btree *b, *t;
	unsigned long nr = sc->nr_to_scan;
	unsigned long can_free = 0;
	unsigned long freed = 0;
	unsigned long touched = 0;
	unsigned i, flags;
	unsigned long ret = SHRINK_STOP;
	bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4;

	if (static_branch_unlikely(&bch2_btree_shrinker_disabled))
		return SHRINK_STOP;

	mutex_lock(&bc->lock);
	flags = memalloc_nofs_save();

	/*
	 * It's _really_ critical that we don't free too many btree nodes - we
	 * have to always leave ourselves a reserve. The reserve is how we
	 * guarantee that allocating memory for a new btree node can always
	 * succeed, so that inserting keys into the btree can always succeed and
	 * IO can always make forward progress:
	 */
	can_free = btree_cache_can_free(list);
	if (nr > can_free) {
		bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_cache_reserve] += nr - can_free;
		nr = can_free;
	}

	i = 0;
	list_for_each_entry_safe(b, t, &bc->freeable, list) {
		/*
		 * Leave a few nodes on the freeable list, so that a btree split
		 * won't have to hit the system allocator:
		 */
		if (++i <= 3)
			continue;

		touched++;

		if (touched >= nr)
			goto out;

		if (!btree_node_reclaim(c, b)) {
			btree_node_data_free(bc, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);
			freed++;
			bc->nr_freed++;
		}
	}
restart:
	list_for_each_entry_safe(b, t, &list->list, list) {
		touched++;

		if (btree_node_accessed(b)) {
			clear_btree_node_accessed(b);
			bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++;
			--touched;
		} else if (!btree_node_reclaim(c, b)) {
			__bch2_btree_node_hash_remove(bc, b);
			__btree_node_data_free(b);
			btree_node_to_freedlist(bc, b);

			freed++;
			bc->nr_freed++;

			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);

			if (freed == nr)
				goto out_rotate;
		} else if (trigger_writes &&
			   btree_node_dirty(b) &&
			   !btree_node_will_make_reachable(b) &&
			   !btree_node_write_blocked(b) &&
			   six_trylock_read(&b->c.lock)) {
			list_move(&list->list, &b->list);
			mutex_unlock(&bc->lock);
			__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
			six_unlock_read(&b->c.lock);
			if (touched >= nr)
				goto out_nounlock;
			mutex_lock(&bc->lock);
			goto restart;
		}

		if (touched >= nr)
			break;
	}
out_rotate:
	if (&t->list != &list->list)
		list_move_tail(&list->list, &t->list);
out:
	mutex_unlock(&bc->lock);
out_nounlock:
	ret = freed;
	memalloc_nofs_restore(flags);
	trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
	return ret;
}

static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
					    struct shrink_control *sc)
{
	struct btree_cache_list *list = shrink->private_data;

	if (static_branch_unlikely(&bch2_btree_shrinker_disabled))
		return 0;

	return btree_cache_can_free(list);
}

void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b, *t;
	unsigned long flags;

	shrinker_free(bc->live[1].shrink);
	shrinker_free(bc->live[0].shrink);

	/* vfree() can allocate memory: */
	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

	if (c->verify_data)
		list_move(&c->verify_data->list, &bc->live[0].list);

	kvfree(c->verify_ondisk);

	for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->b)
			list_add(&r->b->list, &bc->live[0].list);
	}

	list_for_each_entry_safe(b, t, &bc->live[1].list, list)
		bch2_btree_node_hash_remove(bc, b);
	list_for_each_entry_safe(b, t, &bc->live[0].list, list)
		bch2_btree_node_hash_remove(bc, b);

	list_for_each_entry_safe(b, t, &bc->freeable, list) {
		BUG_ON(btree_node_read_in_flight(b) ||
		       btree_node_write_in_flight(b));

		btree_node_data_free(bc, b);
		cond_resched();
	}

	BUG_ON(!bch2_journal_error(&c->journal) &&
	       atomic_long_read(&c->btree_cache.nr_dirty));

	list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);

	list_for_each_entry_safe(b, t, &bc->freed_nonpcpu, list) {
		list_del(&b->list);
		six_lock_exit(&b->c.lock);
		kfree(b);
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);

	for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
		BUG_ON(bc->nr_by_btree[i]);
	BUG_ON(bc->live[0].nr);
	BUG_ON(bc->live[1].nr);
	BUG_ON(bc->nr_freeable);

	if (bc->table_init_done)
		rhashtable_destroy(&bc->table);
}

int bch2_fs_btree_cache_init(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct shrinker *shrink;
	unsigned i;
	int ret = 0;

	ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
	if (ret)
		goto err;

	bc->table_init_done = true;

	bch2_recalc_btree_reserve(c);

	for (i = 0; i < bc->nr_reserve; i++) {
		struct btree *b = __bch2_btree_node_mem_alloc(c);
		if (!b)
			goto err;
		__bch2_btree_node_to_freelist(bc, b);
	}

	list_splice_init(&bc->live[0].list, &bc->freeable);

	mutex_init(&c->verify_lock);

	shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
	if (!shrink)
		goto err;
	bc->live[0].shrink	= shrink;
	shrink->count_objects	= bch2_btree_cache_count;
	shrink->scan_objects	= bch2_btree_cache_scan;
	shrink->seeks		= 2;
	shrink->private_data	= &bc->live[0];
	shrinker_register(shrink);

	shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name);
	if (!shrink)
		goto err;
	bc->live[1].shrink	= shrink;
	shrink->count_objects	= bch2_btree_cache_count;
	shrink->scan_objects	= bch2_btree_cache_scan;
	shrink->seeks		= 8;
	shrink->private_data	= &bc->live[1];
	shrinker_register(shrink);

	return 0;
err:
	return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
}

void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
	mutex_init(&bc->lock);
	for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) {
		bc->live[i].idx = i;
		INIT_LIST_HEAD(&bc->live[i].list);
	}
	INIT_LIST_HEAD(&bc->freeable);
	INIT_LIST_HEAD(&bc->freed_pcpu);
	INIT_LIST_HEAD(&bc->freed_nonpcpu);
}

/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which
 * bch2_btree_cache_cannibalize_lock() will take. This means every time we
 * unlock the root of the btree, we need to release this lock if we have it
 * held.
 */
void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;

	if (bc->alloc_lock == current) {
		trace_and_count(c, btree_cache_cannibalize_unlock, trans);
		bc->alloc_lock = NULL;
		closure_wake_up(&bc->alloc_wait);
	}
}

int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct task_struct *old;

	old = NULL;
	if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current)
		goto success;

	if (!cl) {
		trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
		return bch_err_throw(c, ENOMEM_btree_cache_cannibalize_lock);
	}

	closure_wait(&bc->alloc_wait, cl);

	/* Try again, after adding ourselves to waitlist */
	old = NULL;
	if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) {
		/* We raced */
		closure_wake_up(&bc->alloc_wait);
		goto success;
	}

	trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
	return bch_err_throw(c, btree_cache_cannibalize_lock_blocked);

success:
	trace_and_count(c, btree_cache_cannibalize_lock, trans);
	return 0;
}

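/*
 * Steal the memory of another cached node: scan the live lists for a node that
 * can be reclaimed without flushing, then retry with writeout allowed until
 * one can be freed.
 */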
static struct btree *btree_node_cannibalize(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
		list_for_each_entry_reverse(b, &bc->live[i].list, list)
			if (!btree_node_reclaim(c, b))
				return b;

	while (1) {
		for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
			list_for_each_entry_reverse(b, &bc->live[i].list, list)
				if (!btree_node_write_and_reclaim(c, b))
					return b;

		/*
		 * Rare case: all nodes were intent-locked.
		 * Just busy-wait.
		 */
		WARN_ONCE(1, "btree cache cannibalize failed\n");
		cond_resched();
	}
}

struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct list_head *freed = pcpu_read_locks
		? &bc->freed_pcpu
		: &bc->freed_nonpcpu;
	struct btree *b, *b2;
	u64 start_time = local_clock();

	mutex_lock(&bc->lock);

	/*
	 * We never free struct btree itself, just the memory that holds the on
	 * disk node. Check the freed list before allocating a new one:
	 */
	list_for_each_entry(b, freed, list)
		if (!btree_node_reclaim(c, b)) {
			list_del_init(&b->list);
			goto got_node;
		}

	b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
	if (b) {
		bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_NOWAIT);
	} else {
		mutex_unlock(&bc->lock);
		bch2_trans_unlock(trans);
		b = __btree_node_mem_alloc(c, GFP_KERNEL);
		if (!b)
			goto err;
		bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL);
		mutex_lock(&bc->lock);
	}

	BUG_ON(!six_trylock_intent(&b->c.lock));
	BUG_ON(!six_trylock_write(&b->c.lock));

got_node:
	/*
	 * btree_free() doesn't free memory; it sticks the node on the end of
	 * the list. Check if there's any freed nodes there:
	 */
	list_for_each_entry(b2, &bc->freeable, list)
		if (!btree_node_reclaim(c, b2)) {
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);

			list_del_init(&b2->list);
			--bc->nr_freeable;
			btree_node_to_freedlist(bc, b2);
			mutex_unlock(&bc->lock);

			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
			goto got_mem;
		}

	mutex_unlock(&bc->lock);

	if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
		bch2_trans_unlock(trans);
		if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
			goto err;
	}

got_mem:
	BUG_ON(!list_empty(&b->list));
	BUG_ON(btree_node_hashed(b));
	BUG_ON(btree_node_dirty(b));
	BUG_ON(btree_node_write_in_flight(b));
out:
	b->flags		= 0;
	b->written		= 0;
	b->nsets		= 0;
	b->sib_u64s[0]		= 0;
	b->sib_u64s[1]		= 0;
	b->whiteout_u64s	= 0;
	bch2_btree_keys_init(b);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
			       start_time);

	int ret = bch2_trans_relock(trans);
	if (unlikely(ret)) {
		bch2_btree_node_to_freelist(c, b);
		return ERR_PTR(ret);
	}

	return b;
err:
	mutex_lock(&bc->lock);

	/* Try to cannibalize another cached btree node: */
	if (bc->alloc_lock == current) {
		b2 = btree_node_cannibalize(c);
		clear_btree_node_just_written(b2);
		__bch2_btree_node_hash_remove(bc, b2);

		if (b) {
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);
			btree_node_to_freedlist(bc, b2);
			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
		} else {
			b = b2;
		}

		BUG_ON(!list_empty(&b->list));
		mutex_unlock(&bc->lock);

		trace_and_count(c, btree_cache_cannibalize, trans);
		goto out;
	}

	mutex_unlock(&bc->lock);
	return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
}

/* Slowpath, don't want it inlined into btree_iter_traverse() */
static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
						   struct btree_path *path,
						   const struct bkey_i *k,
						   enum btree_id btree_id,
						   unsigned level,
						   enum six_lock_type lock_type,
						   bool sync)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	if (unlikely(level >= BTREE_MAX_DEPTH)) {
		int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u",
						 level, BTREE_MAX_DEPTH);
		return ERR_PTR(ret);
	}

	if (unlikely(!bkey_is_btree_ptr(&k->k))) {
		struct printbuf buf = PRINTBUF;
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));

		int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf);
		printbuf_exit(&buf);
		return ERR_PTR(ret);
	}

	if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) {
		struct printbuf buf = PRINTBUF;
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));

		int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf);
		printbuf_exit(&buf);
		return ERR_PTR(ret);
	}

	/*
	 * Parent node must be locked, else we could read in a btree node that's
	 * been freed:
	 */
	if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
		trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
	}

	b = bch2_btree_node_mem_alloc(trans, level != 0);

	if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
		if (!path)
			return b;

		trans->memory_allocation_failure = true;
		trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
	}

	if (IS_ERR(b))
		return b;

	bkey_copy(&b->key, k);
	if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
		/* raced with another fill: */

		/* mark as unhashed... */
		b->hash_val = 0;

		mutex_lock(&bc->lock);
		__bch2_btree_node_to_freelist(bc, b);
		mutex_unlock(&bc->lock);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		return NULL;
	}

	set_btree_node_read_in_flight(b);
	six_unlock_write(&b->c.lock);

	if (path) {
		u32 seq = six_lock_seq(&b->c.lock);

		/* Unlock before doing IO: */
		six_unlock_intent(&b->c.lock);
		bch2_trans_unlock(trans);

		bch2_btree_node_read(trans, b, sync);

		int ret = bch2_trans_relock(trans);
		if (ret)
			return ERR_PTR(ret);

		if (!sync)
			return NULL;

		if (!six_relock_type(&b->c.lock, lock_type, seq))
			b = NULL;
	} else {
		bch2_btree_node_read(trans, b, sync);
		if (lock_type == SIX_LOCK_read)
			six_lock_downgrade(&b->c.lock);
	}

	return b;
}

static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
{
	struct printbuf buf = PRINTBUF;

	if (c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations)
		return;

	prt_printf(&buf,
		   "btree node header doesn't match ptr: ");
	bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
	prt_str(&buf, "\nptr: ");
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	prt_str(&buf, "\nheader: ");
	bch2_btree_id_level_to_text(&buf, BTREE_NODE_ID(b->data), BTREE_NODE_LEVEL(b->data));
	prt_str(&buf, "\nmin ");
	bch2_bpos_to_text(&buf, b->data->min_key);

	prt_printf(&buf, "\nmax ");
	bch2_bpos_to_text(&buf, b->data->max_key);

	bch2_fs_topology_error(c, "%s", buf.buf);

	printbuf_exit(&buf);
}

static inline void btree_check_header(struct bch_fs *c, struct btree *b)
{
	if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
	    b->c.level != BTREE_NODE_LEVEL(b->data) ||
	    !bpos_eq(b->data->max_key, b->key.k.p) ||
	    (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
	     !bpos_eq(b->data->min_key,
		      bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
		btree_bad_header(c, b);
}

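/*
 * Slow path of bch2_btree_node_get(): look the node up in the hash table,
 * filling it in from disk via bch2_btree_node_fill() if it isn't cached.
 */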
static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
					   const struct bkey_i *k, unsigned level,
					   enum six_lock_type lock_type,
					   unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	bool need_relock = false;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		/*
		 * We must have the parent locked to call bch2_btree_node_fill(),
		 * else we could read in a btree node from disk that's been
		 * freed:
		 */
		b = bch2_btree_node_fill(trans, path, k, path->btree_id,
					 level, lock_type, true);
		need_relock = true;

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b))
			return b;
	} else {
		if (btree_node_read_locked(path, level + 1))
			btree_node_unlock(trans, path, level + 1);

		ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			return ERR_PTR(ret);

		BUG_ON(ret);

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.level != level ||
			     race_fault())) {
			six_unlock_type(&b->c.lock, lock_type);
			if (bch2_btree_node_relock(trans, path, level + 1))
				goto retry;

			trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
			return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
		}

		/* avoid atomic set bit if it's not needed: */
		if (!btree_node_accessed(b))
			set_btree_node_accessed(b);
	}

	if (unlikely(btree_node_read_in_flight(b))) {
		u32 seq = six_lock_seq(&b->c.lock);

		six_unlock_type(&b->c.lock, lock_type);
		bch2_trans_unlock(trans);
		need_relock = true;

		bch2_btree_node_wait_on_read(b);

		ret = bch2_trans_relock(trans);
		if (ret)
			return ERR_PTR(ret);

		/*
		 * should_be_locked is not set on this path yet, so we need to
		 * relock it specifically:
		 */
		if (!six_relock_type(&b->c.lock, lock_type, seq))
			goto retry;
	}

	if (unlikely(need_relock)) {
		ret = bch2_trans_relock(trans) ?:
			bch2_btree_path_relock_intent(trans, path);
		if (ret) {
			six_unlock_type(&b->c.lock, lock_type);
			return ERR_PTR(ret);
		}
	}

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
	}

	EBUG_ON(b->c.btree_id != path->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}

/**
 * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * @trans:	btree transaction object
 * @path:	btree_path being traversed
 * @k:		pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
 * @level:	level of btree node being looked up (0 == leaf node)
 * @lock_type:	SIX_LOCK_read or SIX_LOCK_intent
 * @trace_ip:	ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
 *
 * The btree node will be returned with either a read or an intent lock held,
 * depending on @lock_type.
 *
 * Returns: btree node or ERR_PTR()
 */
struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
				  const struct bkey_i *k, unsigned level,
				  enum six_lock_type lock_type,
				  unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree *b;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_node_mem_ptr(k);

	/*
	 * Check b->hash_val _before_ calling btree_node_lock() - this might not
	 * be the node we want anymore, and trying to lock the wrong node could
	 * cause an unnecessary transaction restart:
	 */
	if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
		     !b ||
		     b->hash_val != btree_ptr_hash_val(k)))
		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

	if (btree_node_read_locked(path, level + 1))
		btree_node_unlock(trans, path, level + 1);

	ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		return ERR_PTR(ret);

	BUG_ON(ret);

	if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
		     b->c.level != level ||
		     race_fault())) {
		six_unlock_type(&b->c.lock, lock_type);
		if (bch2_btree_node_relock(trans, path, level + 1))
			return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

		trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
	}

	if (unlikely(btree_node_read_in_flight(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
	}

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
	}

	EBUG_ON(b->c.btree_id != path->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}

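/*
 * Fetch and read lock a btree node when we don't have a btree_path, e.g. from
 * bch2_btree_node_update_key_early() above.
 */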
struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
					 const struct bkey_i *k,
					 enum btree_id btree_id,
					 unsigned level,
					 bool nofill)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	if (c->opts.btree_node_mem_ptr_optimization) {
		b = btree_node_mem_ptr(k);
		if (b)
			goto lock_node;
	}
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		if (nofill)
			goto out;

		b = bch2_btree_node_fill(trans, NULL, k, btree_id,
					 level, SIX_LOCK_read, true);

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b) &&
		    !bch2_btree_cache_cannibalize_lock(trans, NULL))
			goto retry;

		if (IS_ERR(b))
			goto out;
	} else {
lock_node:
		ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			return ERR_PTR(ret);

		BUG_ON(ret);

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.btree_id != btree_id ||
			     b->c.level != level)) {
			six_unlock_read(&b->c.lock);
			goto retry;
		}

		/* avoid atomic set bit if it's not needed: */
		if (!btree_node_accessed(b))
			set_btree_node_accessed(b);
	}

	/* XXX: waiting on IO with btree locks held: */
	__bch2_btree_node_wait_on_read(b);

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_read(&b->c.lock);
		b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
		goto out;
	}

	EBUG_ON(b->c.btree_id != btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);
out:
	bch2_btree_cache_cannibalize_unlock(trans);
	return b;
}

int bch2_btree_node_prefetch(struct btree_trans *trans,
			     struct btree_path *path,
			     const struct bkey_i *k,
			     enum btree_id btree_id, unsigned level)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;

	BUG_ON(path && !btree_node_locked(path, level + 1));
	BUG_ON(level >= BTREE_MAX_DEPTH);

	struct btree *b = btree_cache_find(bc, k);
	if (b)
		return 0;

	b = bch2_btree_node_fill(trans, path, k, btree_id,
				 level, SIX_LOCK_read, false);
	int ret = PTR_ERR_OR_ZERO(b);
	if (ret)
		return ret;
	if (b)
		six_unlock_read(&b->c.lock);
	return 0;
}

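/*
 * Flush (if dirty) and drop the cached node for @k, if present: the node's
 * memory is freed and future cache lookups will miss.
 */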
void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	b = btree_cache_find(bc, k);
	if (!b)
		return;

	BUG_ON(b == btree_node_root(trans->c, b));
wait_on_io:
	/* not allowed to wait on io with btree locks held: */

	/* XXX we're called from btree_gc which will be holding other btree
	 * nodes locked
	 */
	__bch2_btree_node_wait_on_read(b);
	__bch2_btree_node_wait_on_write(b);

	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
	if (unlikely(b->hash_val != btree_ptr_hash_val(k)))
		goto out;

	if (btree_node_dirty(b)) {
		__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}

	BUG_ON(btree_node_dirty(b));

	mutex_lock(&bc->lock);
	bch2_btree_node_hash_remove(bc, b);
	btree_node_data_free(bc, b);
	mutex_unlock(&bc->lock);
out:
	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
}

const char *bch2_btree_id_str(enum btree_id btree)
{
	return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)";
}

void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree)
{
	if (btree < BTREE_ID_NR)
		prt_str(out, __bch2_btree_ids[btree]);
	else
		prt_printf(out, "(unknown btree %u)", btree);
}

void bch2_btree_id_level_to_text(struct printbuf *out, enum btree_id btree, unsigned level)
{
	prt_str(out, "btree=");
	bch2_btree_id_to_text(out, btree);
	prt_printf(out, " level=%u", level);
}

void __bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c,
			      enum btree_id btree, unsigned level, struct bkey_s_c k)
{
	bch2_btree_id_to_text(out, btree);
	prt_printf(out, " level %u/", level);
	struct btree_root *r = bch2_btree_id_root(c, btree);
	if (r)
		prt_printf(out, "%u", r->level);
	else
		prt_printf(out, "(unknown)");
	prt_newline(out);

	bch2_bkey_val_to_text(out, c, k);
}

void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
	__bch2_btree_pos_to_text(out, c, b->c.btree_id, b->c.level, bkey_i_to_s_c(&b->key));
}

void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
	struct bset_stats stats;

	memset(&stats, 0, sizeof(stats));

	bch2_btree_keys_stats(b, &stats);

	prt_printf(out, "l %u ", b->c.level);
	bch2_bpos_to_text(out, b->data->min_key);
	prt_printf(out, " - ");
	bch2_bpos_to_text(out, b->data->max_key);
	prt_printf(out, ":\n"
		   " ptrs: ");
	bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
	prt_newline(out);

	prt_printf(out,
		   " format: ");
	bch2_bkey_format_to_text(out, &b->format);

	prt_printf(out,
		   " unpack fn len: %u\n"
		   " bytes used %zu/%zu (%zu%% full)\n"
		   " sib u64s: %u, %u (merge threshold %u)\n"
		   " nr packed keys %u\n"
		   " nr unpacked keys %u\n"
		   " floats %zu\n"
		   " failed unpacked %zu\n",
		   b->unpack_fn_len,
		   b->nr.live_u64s * sizeof(u64),
		   btree_buf_bytes(b) - sizeof(struct btree_node),
		   b->nr.live_u64s * 100 / btree_max_u64s(c),
		   b->sib_u64s[0],
		   b->sib_u64s[1],
		   c->btree_foreground_merge_threshold,
		   b->nr.packed_keys,
		   b->nr.unpacked_keys,
		   stats.floats,
		   stats.failed);
}

static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c,
				 const char *label, size_t nr)
{
	prt_printf(out, "%s\t", label);
	prt_human_readable_u64(out, nr * c->opts.btree_node_size);
	prt_printf(out, " (%zu)\n", nr);
}

static const char * const bch2_btree_cache_not_freed_reasons_strs[] = {
#define x(n) #n,
	BCH_BTREE_CACHE_NOT_FREED_REASONS()
#undef x
	NULL
};

void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);

	if (!out->nr_tabstops)
		printbuf_tabstop_push(out, 32);

	prt_btree_cache_line(out, c, "live:", bc->live[0].nr);
	prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr);
	prt_btree_cache_line(out, c, "reserve:", bc->nr_reserve);
	prt_btree_cache_line(out, c, "freed:", bc->nr_freeable);
	prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty));
	prt_printf(out, "cannibalize lock:\t%s\n", bc->alloc_lock ? "held" : "not held");
"held" : "not held"); 1499 prt_newline(out); 1500 1501 for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) { 1502 bch2_btree_id_to_text(out, i); 1503 prt_printf(out, "\t"); 1504 prt_human_readable_u64(out, bc->nr_by_btree[i] * c->opts.btree_node_size); 1505 prt_printf(out, " (%zu)\n", bc->nr_by_btree[i]); 1506 } 1507 1508 prt_newline(out); 1509 prt_printf(out, "counters since mount:\n"); 1510 prt_printf(out, "freed:\t%zu\n", bc->nr_freed); 1511 prt_printf(out, "not freed:\n"); 1512 1513 for (unsigned i = 0; i < ARRAY_SIZE(bc->not_freed); i++) 1514 prt_printf(out, " %s\t%llu\n", 1515 bch2_btree_cache_not_freed_reasons_strs[i], bc->not_freed[i]); 1516 } 1517