// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bbpos.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
#include "trace.h"

#include <linux/prefetch.h>
#include <linux/sched/mm.h>
#include <linux/swap.h>

#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
do {						 \
	if (shrinker_counter)			 \
		bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_##counter]++; \
} while (0)

const char * const bch2_btree_node_flags[] = {
#define x(f)	#f,
	BTREE_FLAGS()
#undef x
	NULL
};

void bch2_recalc_btree_reserve(struct bch_fs *c)
{
	unsigned reserve = 16;

	if (!c->btree_roots_known[0].b)
		reserve += 8;

	for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->b)
			reserve += min_t(unsigned, 1, r->b->c.level) * 8;
	}

	c->btree_cache.nr_reserve = reserve;
}

static inline size_t btree_cache_can_free(struct btree_cache_list *list)
{
	struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);

	size_t can_free = list->nr;
	if (!list->idx)
		can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve);
	return can_free;
}

static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(!list_empty(&b->list));

	if (b->c.lock.readers)
		list_add(&b->list, &bc->freed_pcpu);
	else
		list_add(&b->list, &bc->freed_nonpcpu);
}

static void __bch2_btree_node_to_freelist(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(!list_empty(&b->list));
	BUG_ON(!b->data);

	bc->nr_freeable++;
	list_add(&b->list, &bc->freeable);
}

void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
{
	struct btree_cache *bc = &c->btree_cache;

	mutex_lock(&bc->lock);
	__bch2_btree_node_to_freelist(bc, b);
	mutex_unlock(&bc->lock);

	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
}

static void __btree_node_data_free(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(!list_empty(&b->list));
	BUG_ON(btree_node_hashed(b));

	/*
	 * This should really be done in slub/vmalloc, but we're using the
	 * kmalloc_large() path, so we're working around a slub bug by doing
	 * this here:
	 */
	if (b->data)
		mm_account_reclaimed_pages(btree_buf_bytes(b) / PAGE_SIZE);
	if (b->aux_data)
		mm_account_reclaimed_pages(btree_aux_data_bytes(b) / PAGE_SIZE);

	EBUG_ON(btree_node_write_in_flight(b));

	clear_btree_node_just_written(b);

	kvfree(b->data);
	b->data = NULL;
#ifdef __KERNEL__
	kvfree(b->aux_data);
#else
	munmap(b->aux_data, btree_aux_data_bytes(b));
#endif
	b->aux_data = NULL;

	btree_node_to_freedlist(bc, b);
}

static void btree_node_data_free(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(list_empty(&b->list));
	list_del_init(&b->list);
	--bc->nr_freeable;
	__btree_node_data_free(bc, b);
}

static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
				   const void *obj)
{
	const struct btree *b = obj;
	const u64 *v = arg->key;

	return b->hash_val == *v ? 0 : 1;
}

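/*
 * Cached btree nodes are indexed by an rhashtable keyed on a 64 bit hash of
 * the node's pointer key (b->hash_val, set via btree_ptr_hash_val());
 * bch2_btree_cache_cmp_fn() above simply compares that stored hash against the
 * lookup key.
 */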
static const struct rhashtable_params bch_btree_cache_params = {
	.head_offset		= offsetof(struct btree, hash),
	.key_offset		= offsetof(struct btree, hash_val),
	.key_len		= sizeof(u64),
	.obj_cmpfn		= bch2_btree_cache_cmp_fn,
	.automatic_shrinking	= true,
};

static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
	BUG_ON(b->data || b->aux_data);

	gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;

	b->data = kvmalloc(btree_buf_bytes(b), gfp);
	if (!b->data)
		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
#ifdef __KERNEL__
	b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
#else
	b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
			   PROT_READ|PROT_WRITE|PROT_EXEC,
			   MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
	if (b->aux_data == MAP_FAILED)
		b->aux_data = NULL;
#endif
	if (!b->aux_data) {
		kvfree(b->data);
		b->data = NULL;
		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
	}

	return 0;
}

static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
{
	struct btree *b;

	b = kzalloc(sizeof(struct btree), gfp);
	if (!b)
		return NULL;

	bkey_btree_ptr_init(&b->key);
	INIT_LIST_HEAD(&b->list);
	INIT_LIST_HEAD(&b->write_blocked);
	b->byte_order = ilog2(c->opts.btree_node_size);
	return b;
}

struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	b = __btree_node_mem_alloc(c, GFP_KERNEL);
	if (!b)
		return NULL;

	if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
		kfree(b);
		return NULL;
	}

	bch2_btree_lock_init(&b->c, 0);

	__bch2_btree_node_to_freelist(bc, b);
	return b;
}

static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b)
{
	struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);

	u64 mask = bc->pinned_nodes_mask[!!b->c.level];

	return ((mask & BIT_ULL(b->c.btree_id)) &&
		bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
		bbpos_cmp(bc->pinned_nodes_end, pos) >= 0);
}

void bch2_node_pin(struct bch_fs *c, struct btree *b)
{
	struct btree_cache *bc = &c->btree_cache;

	mutex_lock(&bc->lock);
	if (b != btree_node_root(c, b) && !btree_node_pinned(b)) {
		set_btree_node_pinned(b);
		list_move(&b->list, &bc->live[1].list);
		bc->live[0].nr--;
		bc->live[1].nr++;
	}
	mutex_unlock(&bc->lock);
}

void bch2_btree_cache_unpin(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b, *n;

	mutex_lock(&bc->lock);
	c->btree_cache.pinned_nodes_mask[0] = 0;
	c->btree_cache.pinned_nodes_mask[1] = 0;

	list_for_each_entry_safe(b, n, &bc->live[1].list, list) {
		clear_btree_node_pinned(b);
		list_move(&b->list, &bc->live[0].list);
		bc->live[0].nr++;
		bc->live[1].nr--;
	}

	mutex_unlock(&bc->lock);
}

/* Btree in memory cache - hash table */

void __bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
	lockdep_assert_held(&bc->lock);

	int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
	BUG_ON(ret);

	/* Cause future lookups for this node to fail: */
	b->hash_val = 0;

	if (b->c.btree_id < BTREE_ID_NR)
		--bc->nr_by_btree[b->c.btree_id];
	--bc->live[btree_node_pinned(b)].nr;
	list_del_init(&b->list);
}

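/*
 * Unhash the node and put its buffers back on the freeable list; as with
 * __bch2_btree_node_hash_remove(), the caller must hold bc->lock.
 */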
void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
	__bch2_btree_node_hash_remove(bc, b);
	__bch2_btree_node_to_freelist(bc, b);
}

int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(!list_empty(&b->list));
	BUG_ON(b->hash_val);

	b->hash_val = btree_ptr_hash_val(&b->key);
	int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash,
						bch_btree_cache_params);
	if (ret)
		return ret;

	if (b->c.btree_id < BTREE_ID_NR)
		bc->nr_by_btree[b->c.btree_id]++;

	bool p = __btree_node_pinned(bc, b);
	mod_bit(BTREE_NODE_pinned, &b->flags, p);

	list_add_tail(&b->list, &bc->live[p].list);
	bc->live[p].nr++;
	return 0;
}

int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
				unsigned level, enum btree_id id)
{
	b->c.level	= level;
	b->c.btree_id	= id;

	mutex_lock(&bc->lock);
	int ret = __bch2_btree_node_hash_insert(bc, b);
	mutex_unlock(&bc->lock);

	return ret;
}

void bch2_btree_node_update_key_early(struct btree_trans *trans,
				      enum btree_id btree, unsigned level,
				      struct bkey_s_c old, struct bkey_i *new)
{
	struct bch_fs *c = trans->c;
	struct btree *b;
	struct bkey_buf tmp;
	int ret;

	bch2_bkey_buf_init(&tmp);
	bch2_bkey_buf_reassemble(&tmp, c, old);

	b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
	if (!IS_ERR_OR_NULL(b)) {
		mutex_lock(&c->btree_cache.lock);

		__bch2_btree_node_hash_remove(&c->btree_cache, b);

		bkey_copy(&b->key, new);
		ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
		BUG_ON(ret);

		mutex_unlock(&c->btree_cache.lock);
		six_unlock_read(&b->c.lock);
	}

	bch2_bkey_buf_exit(&tmp, c);
}

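/*
 * Lockless lookup in the btree node hash table; rhashtable_lookup_fast()
 * handles RCU protection internally. The returned node is not locked and may
 * be evicted or reused until the caller locks it and rechecks hash_val.
 */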
__flatten
static inline struct btree *btree_cache_find(struct btree_cache *bc,
					     const struct bkey_i *k)
{
	u64 v = btree_ptr_hash_val(k);

	return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
}

/*
 * this version is for btree nodes that have already been freed (we're not
 * reaping a real btree node)
 */
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
{
	struct btree_cache *bc = &c->btree_cache;
	int ret = 0;

	lockdep_assert_held(&bc->lock);
wait_on_io:
	if (b->flags & ((1U << BTREE_NODE_dirty)|
			(1U << BTREE_NODE_read_in_flight)|
			(1U << BTREE_NODE_write_in_flight))) {
		if (!flush) {
			if (btree_node_dirty(b))
				BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
			else if (btree_node_read_in_flight(b))
				BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
			else if (btree_node_write_in_flight(b))
				BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
			return -BCH_ERR_ENOMEM_btree_node_reclaim;
		}

		/* XXX: waiting on IO with btree cache lock held */
		bch2_btree_node_wait_on_read(b);
		bch2_btree_node_wait_on_write(b);
	}

	if (!six_trylock_intent(&b->c.lock)) {
		BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
		return -BCH_ERR_ENOMEM_btree_node_reclaim;
	}

	if (!six_trylock_write(&b->c.lock)) {
		BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
		goto out_unlock_intent;
	}

	/* recheck under lock */
	if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
			(1U << BTREE_NODE_write_in_flight))) {
		if (!flush) {
			if (btree_node_read_in_flight(b))
				BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
			else if (btree_node_write_in_flight(b))
				BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
			goto out_unlock;
		}
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}

	if (btree_node_noevict(b)) {
		BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
		goto out_unlock;
	}
	if (btree_node_write_blocked(b)) {
		BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
		goto out_unlock;
	}
	if (btree_node_will_make_reachable(b)) {
		BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
		goto out_unlock;
	}

	if (btree_node_dirty(b)) {
		if (!flush) {
			BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
			goto out_unlock;
		}
		/*
		 * Using the underscore version because we don't want to compact
		 * bsets after the write, since this node is about to be evicted
		 * - unless btree verify mode is enabled, since it runs out of
		 * the post write cleanup:
		 */
		if (bch2_verify_btree_ondisk)
			bch2_btree_node_write(c, b, SIX_LOCK_intent,
					      BTREE_WRITE_cache_reclaim);
		else
			__bch2_btree_node_write(c, b,
						BTREE_WRITE_cache_reclaim);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}
out:
	if (b->hash_val && !ret)
		trace_and_count(c, btree_cache_reap, c, b);
	return ret;
out_unlock:
	six_unlock_write(&b->c.lock);
out_unlock_intent:
	six_unlock_intent(&b->c.lock);
	ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
	goto out;
}

static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
{
	return __btree_node_reclaim(c, b, false, shrinker_counter);
}

static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, true, false);
}

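/*
 * Shrinker scan callback: each btree_cache_list (live[0] = unpinned, live[1] =
 * pinned) has its own shrinker. We first skim the freeable list, keeping a few
 * nodes around so a btree split doesn't have to hit the system allocator, then
 * walk the live list: nodes with the accessed bit get a second chance, clean
 * unlocked nodes are freed, and when the dirty count is high we
 * opportunistically kick off writes of dirty nodes so later scans can free
 * them.
 */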
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
					   struct shrink_control *sc)
{
	struct btree_cache_list *list = shrink->private_data;
	struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
	struct btree *b, *t;
	unsigned long nr = sc->nr_to_scan;
	unsigned long can_free = 0;
	unsigned long freed = 0;
	unsigned long touched = 0;
	unsigned i, flags;
	unsigned long ret = SHRINK_STOP;
	bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4;

	if (bch2_btree_shrinker_disabled)
		return SHRINK_STOP;

	mutex_lock(&bc->lock);
	flags = memalloc_nofs_save();

	/*
	 * It's _really_ critical that we don't free too many btree nodes - we
	 * have to always leave ourselves a reserve. The reserve is how we
	 * guarantee that allocating memory for a new btree node can always
	 * succeed, so that inserting keys into the btree can always succeed and
	 * IO can always make forward progress:
	 */
	can_free = btree_cache_can_free(list);
	nr = min_t(unsigned long, nr, can_free);

	i = 0;
	list_for_each_entry_safe(b, t, &bc->freeable, list) {
		/*
		 * Leave a few nodes on the freeable list, so that a btree split
		 * won't have to hit the system allocator:
		 */
		if (++i <= 3)
			continue;

		touched++;

		if (touched >= nr)
			goto out;

		if (!btree_node_reclaim(c, b, true)) {
			btree_node_data_free(bc, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);
			freed++;
			bc->nr_freed++;
		}
	}
restart:
	list_for_each_entry_safe(b, t, &list->list, list) {
		touched++;

		if (btree_node_accessed(b)) {
			clear_btree_node_accessed(b);
			bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++;
			--touched;
		} else if (!btree_node_reclaim(c, b, true)) {
			__bch2_btree_node_hash_remove(bc, b);
			__btree_node_data_free(bc, b);

			freed++;
			bc->nr_freed++;

			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);

			if (freed == nr)
				goto out_rotate;
		} else if (trigger_writes &&
			   btree_node_dirty(b) &&
			   !btree_node_will_make_reachable(b) &&
			   !btree_node_write_blocked(b) &&
			   six_trylock_read(&b->c.lock)) {
			list_move(&list->list, &b->list);
			mutex_unlock(&bc->lock);
			__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
			six_unlock_read(&b->c.lock);
			if (touched >= nr)
				goto out_nounlock;
			mutex_lock(&bc->lock);
			goto restart;
		}

		if (touched >= nr)
			break;
	}
out_rotate:
	if (&t->list != &list->list)
		list_move_tail(&list->list, &t->list);
out:
	mutex_unlock(&bc->lock);
out_nounlock:
	ret = freed;
	memalloc_nofs_restore(flags);
	trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
	return ret;
}

static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
					    struct shrink_control *sc)
{
	struct btree_cache_list *list = shrink->private_data;

	if (bch2_btree_shrinker_disabled)
		return 0;

	return btree_cache_can_free(list);
}

void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b, *t;
	unsigned long flags;

	shrinker_free(bc->live[1].shrink);
	shrinker_free(bc->live[0].shrink);

	/* vfree() can allocate memory: */
	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

	if (c->verify_data)
		list_move(&c->verify_data->list, &bc->live[0].list);

	kvfree(c->verify_ondisk);

	for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->b)
			list_add(&r->b->list, &bc->live[0].list);
	}

	list_for_each_entry_safe(b, t, &bc->live[1].list, list)
		bch2_btree_node_hash_remove(bc, b);
	list_for_each_entry_safe(b, t, &bc->live[0].list, list)
		bch2_btree_node_hash_remove(bc, b);

	list_for_each_entry_safe(b, t, &bc->freeable, list) {
		BUG_ON(btree_node_read_in_flight(b) ||
		       btree_node_write_in_flight(b));

		btree_node_data_free(bc, b);
	}

	BUG_ON(!bch2_journal_error(&c->journal) &&
	       atomic_long_read(&c->btree_cache.nr_dirty));

	list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);

	list_for_each_entry_safe(b, t, &bc->freed_nonpcpu, list) {
		list_del(&b->list);
		six_lock_exit(&b->c.lock);
		kfree(b);
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);

	for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
		BUG_ON(bc->nr_by_btree[i]);
	BUG_ON(bc->live[0].nr);
	BUG_ON(bc->live[1].nr);
	BUG_ON(bc->nr_freeable);

	if (bc->table_init_done)
		rhashtable_destroy(&bc->table);
}

int bch2_fs_btree_cache_init(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct shrinker *shrink;
	unsigned i;
	int ret = 0;

	ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
	if (ret)
		goto err;

	bc->table_init_done = true;

	bch2_recalc_btree_reserve(c);

	for (i = 0; i < bc->nr_reserve; i++)
		if (!__bch2_btree_node_mem_alloc(c))
			goto err;

	list_splice_init(&bc->live[0].list, &bc->freeable);

	mutex_init(&c->verify_lock);

	shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
	if (!shrink)
		goto err;
	bc->live[0].shrink	= shrink;
	shrink->count_objects	= bch2_btree_cache_count;
	shrink->scan_objects	= bch2_btree_cache_scan;
	shrink->seeks		= 2;
	shrink->private_data	= &bc->live[0];
	shrinker_register(shrink);

	shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name);
	if (!shrink)
		goto err;
	bc->live[1].shrink	= shrink;
	shrink->count_objects	= bch2_btree_cache_count;
	shrink->scan_objects	= bch2_btree_cache_scan;
	shrink->seeks		= 8;
	shrink->private_data	= &bc->live[1];
	shrinker_register(shrink);

	return 0;
err:
	return -BCH_ERR_ENOMEM_fs_btree_cache_init;
}

void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
	mutex_init(&bc->lock);
	for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) {
		bc->live[i].idx = i;
		INIT_LIST_HEAD(&bc->live[i].list);
	}
	INIT_LIST_HEAD(&bc->freeable);
	INIT_LIST_HEAD(&bc->freed_pcpu);
	INIT_LIST_HEAD(&bc->freed_nonpcpu);
}

/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which a
 * cannibalize_bucket() will take. This means every time we unlock the root of
 * the btree, we need to release this lock if we have it held.
 */
void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;

	if (bc->alloc_lock == current) {
		trace_and_count(c, btree_cache_cannibalize_unlock, trans);
		bc->alloc_lock = NULL;
		closure_wake_up(&bc->alloc_wait);
	}
}

int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct task_struct *old;

	old = NULL;
	if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current)
		goto success;

	if (!cl) {
		trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
		return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
	}

	closure_wait(&bc->alloc_wait, cl);

	/* Try again, after adding ourselves to waitlist */
	old = NULL;
	if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) {
		/* We raced */
		closure_wake_up(&bc->alloc_wait);
		goto success;
	}

	trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
	return -BCH_ERR_btree_cache_cannibalize_lock_blocked;

success:
	trace_and_count(c, btree_cache_cannibalize_lock, trans);
	return 0;
}

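/*
 * Expected calling pattern for the cannibalize lock (an illustrative sketch,
 * not lifted from a real caller):
 *
 *	ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
 *	if (ret)
 *		return ret;	(on _lock_blocked: wait on @cl, then retry)
 *
 *	b = bch2_btree_node_mem_alloc(trans, false);
 *	...
 *	bch2_btree_cache_cannibalize_unlock(trans);
 */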
static struct btree *btree_node_cannibalize(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
		list_for_each_entry_reverse(b, &bc->live[i].list, list)
			if (!btree_node_reclaim(c, b, false))
				return b;

	while (1) {
		for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
			list_for_each_entry_reverse(b, &bc->live[i].list, list)
				if (!btree_node_write_and_reclaim(c, b))
					return b;

		/*
		 * Rare case: all nodes were intent-locked.
		 * Just busy-wait.
		 */
		WARN_ONCE(1, "btree cache cannibalize failed\n");
		cond_resched();
	}
}

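/*
 * Allocate memory for a new btree node, falling back to progressively more
 * expensive sources: a previously freed struct btree, a fresh kzalloc, data
 * buffers swapped from the freeable list, a fresh data allocation, and
 * finally - if the caller holds the cannibalize lock - reclaiming another
 * cached node.
 */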
struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct list_head *freed = pcpu_read_locks
		? &bc->freed_pcpu
		: &bc->freed_nonpcpu;
	struct btree *b, *b2;
	u64 start_time = local_clock();

	mutex_lock(&bc->lock);

	/*
	 * We never free struct btree itself, just the memory that holds the on
	 * disk node. Check the freed list before allocating a new one:
	 */
	list_for_each_entry(b, freed, list)
		if (!btree_node_reclaim(c, b, false)) {
			list_del_init(&b->list);
			goto got_node;
		}

	b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
	if (!b) {
		mutex_unlock(&bc->lock);
		bch2_trans_unlock(trans);
		b = __btree_node_mem_alloc(c, GFP_KERNEL);
		if (!b)
			goto err;
		mutex_lock(&bc->lock);
	}

	bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);

	BUG_ON(!six_trylock_intent(&b->c.lock));
	BUG_ON(!six_trylock_write(&b->c.lock));

got_node:
	/*
	 * btree_free() doesn't free memory; it sticks the node on the end of
	 * the list. Check if there are any freed nodes there:
	 */
	list_for_each_entry(b2, &bc->freeable, list)
		if (!btree_node_reclaim(c, b2, false)) {
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);

			list_del_init(&b2->list);
			--bc->nr_freeable;
			btree_node_to_freedlist(bc, b2);
			mutex_unlock(&bc->lock);

			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
			goto got_mem;
		}

	mutex_unlock(&bc->lock);

	if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
		bch2_trans_unlock(trans);
		if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
			goto err;
	}

got_mem:
	BUG_ON(!list_empty(&b->list));
	BUG_ON(btree_node_hashed(b));
	BUG_ON(btree_node_dirty(b));
	BUG_ON(btree_node_write_in_flight(b));
out:
	b->flags		= 0;
	b->written		= 0;
	b->nsets		= 0;
	b->sib_u64s[0]		= 0;
	b->sib_u64s[1]		= 0;
	b->whiteout_u64s	= 0;
	bch2_btree_keys_init(b);
	set_btree_node_accessed(b);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
			       start_time);

	int ret = bch2_trans_relock(trans);
	if (unlikely(ret)) {
		bch2_btree_node_to_freelist(c, b);
		return ERR_PTR(ret);
	}

	return b;
err:
	mutex_lock(&bc->lock);

	/* Try to cannibalize another cached btree node: */
	if (bc->alloc_lock == current) {
		b2 = btree_node_cannibalize(c);
		clear_btree_node_just_written(b2);
		__bch2_btree_node_hash_remove(bc, b2);

		if (b) {
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);
			btree_node_to_freedlist(bc, b2);
			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
		} else {
			b = b2;
		}

		BUG_ON(!list_empty(&b->list));
		mutex_unlock(&bc->lock);

		trace_and_count(c, btree_cache_cannibalize, trans);
		goto out;
	}

	mutex_unlock(&bc->lock);
	return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
}

/* Slowpath, don't want it inlined into btree_iter_traverse() */
static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
						   struct btree_path *path,
						   const struct bkey_i *k,
						   enum btree_id btree_id,
						   unsigned level,
						   enum six_lock_type lock_type,
						   bool sync)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	if (unlikely(level >= BTREE_MAX_DEPTH)) {
		int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u",
						 level, BTREE_MAX_DEPTH);
		return ERR_PTR(ret);
	}

	if (unlikely(!bkey_is_btree_ptr(&k->k))) {
		struct printbuf buf = PRINTBUF;
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));

		int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf);
		printbuf_exit(&buf);
		return ERR_PTR(ret);
	}

	if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) {
		struct printbuf buf = PRINTBUF;
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));

		int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf);
		printbuf_exit(&buf);
		return ERR_PTR(ret);
	}

	/*
	 * Parent node must be locked, else we could read in a btree node that's
	 * been freed:
	 */
	if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
		trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
	}

	b = bch2_btree_node_mem_alloc(trans, level != 0);

	if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
		if (!path)
			return b;

		trans->memory_allocation_failure = true;
		trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
	}

	if (IS_ERR(b))
		return b;

	bkey_copy(&b->key, k);
	if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
		/* raced with another fill: */

		/* mark as unhashed... */
		b->hash_val = 0;

		mutex_lock(&bc->lock);
		__bch2_btree_node_to_freelist(bc, b);
		mutex_unlock(&bc->lock);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		return NULL;
	}

	set_btree_node_read_in_flight(b);
	six_unlock_write(&b->c.lock);

	if (path) {
		u32 seq = six_lock_seq(&b->c.lock);

		/* Unlock before doing IO: */
		six_unlock_intent(&b->c.lock);
		bch2_trans_unlock_noassert(trans);

		bch2_btree_node_read(trans, b, sync);

		int ret = bch2_trans_relock(trans);
		if (ret)
			return ERR_PTR(ret);

		if (!sync)
			return NULL;

		if (!six_relock_type(&b->c.lock, lock_type, seq))
			b = NULL;
	} else {
		bch2_btree_node_read(trans, b, sync);
		if (lock_type == SIX_LOCK_read)
			six_lock_downgrade(&b->c.lock);
	}

	return b;
}

static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
{
	struct printbuf buf = PRINTBUF;

	if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
		return;

	prt_printf(&buf,
		   "btree node header doesn't match ptr: ");
	bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
	prt_str(&buf, "\nptr: ");
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	prt_str(&buf, "\nheader: ");
	bch2_btree_id_level_to_text(&buf, BTREE_NODE_ID(b->data), BTREE_NODE_LEVEL(b->data));
	prt_str(&buf, "\nmin ");
	bch2_bpos_to_text(&buf, b->data->min_key);

	prt_printf(&buf, "\nmax ");
	bch2_bpos_to_text(&buf, b->data->max_key);

	bch2_fs_topology_error(c, "%s", buf.buf);

	printbuf_exit(&buf);
}

static inline void btree_check_header(struct bch_fs *c, struct btree *b)
{
	if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
	    b->c.level != BTREE_NODE_LEVEL(b->data) ||
	    !bpos_eq(b->data->max_key, b->key.k.p) ||
	    (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
	     !bpos_eq(b->data->min_key,
		      bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
		btree_bad_header(c, b);
}

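/*
 * Slow path of bch2_btree_node_get(): look the node up in the hash table,
 * filling it in from disk on a miss. Because we may drop locks to do IO or to
 * block on a six lock, we recheck hash_val/level after locking and retry (or
 * restart the transaction) if the node was evicted or reused underneath us.
 */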
static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
					   const struct bkey_i *k, unsigned level,
					   enum six_lock_type lock_type,
					   unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	bool need_relock = false;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		/*
		 * We must have the parent locked to call bch2_btree_node_fill(),
		 * else we could read in a btree node from disk that's been
		 * freed:
		 */
		b = bch2_btree_node_fill(trans, path, k, path->btree_id,
					 level, lock_type, true);
		need_relock = true;

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b))
			return b;
	} else {
		if (btree_node_read_locked(path, level + 1))
			btree_node_unlock(trans, path, level + 1);

		ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			return ERR_PTR(ret);

		BUG_ON(ret);

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.level != level ||
			     race_fault())) {
			six_unlock_type(&b->c.lock, lock_type);
			if (bch2_btree_node_relock(trans, path, level + 1))
				goto retry;

			trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
			return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
		}

		/* avoid atomic set bit if it's not needed: */
		if (!btree_node_accessed(b))
			set_btree_node_accessed(b);
	}

	if (unlikely(btree_node_read_in_flight(b))) {
		u32 seq = six_lock_seq(&b->c.lock);

		six_unlock_type(&b->c.lock, lock_type);
		bch2_trans_unlock(trans);
		need_relock = true;

		bch2_btree_node_wait_on_read(b);

		ret = bch2_trans_relock(trans);
		if (ret)
			return ERR_PTR(ret);

		/*
		 * should_be_locked is not set on this path yet, so we need to
		 * relock it specifically:
		 */
		if (!six_relock_type(&b->c.lock, lock_type, seq))
			goto retry;
	}

	if (unlikely(need_relock)) {
		ret = bch2_trans_relock(trans) ?:
			bch2_btree_path_relock_intent(trans, path);
		if (ret) {
			six_unlock_type(&b->c.lock, lock_type);
			return ERR_PTR(ret);
		}
	}

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
	}

	EBUG_ON(b->c.btree_id != path->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}

/**
 * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * @trans:	btree transaction object
 * @path:	btree_path being traversed
 * @k:		pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
 * @level:	level of btree node being looked up (0 == leaf node)
 * @lock_type:	SIX_LOCK_read or SIX_LOCK_intent
 * @trace_ip:	ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
 *
 * The btree node will have either a read or an intent lock held, depending on
 * @lock_type.
 *
 * Returns: btree node or ERR_PTR()
 */
struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
				  const struct bkey_i *k, unsigned level,
				  enum six_lock_type lock_type,
				  unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree *b;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_node_mem_ptr(k);

	/*
	 * Check b->hash_val _before_ calling btree_node_lock() - this might not
	 * be the node we want anymore, and trying to lock the wrong node could
	 * cause an unnecessary transaction restart:
	 */
	if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
		     !b ||
		     b->hash_val != btree_ptr_hash_val(k)))
		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

	if (btree_node_read_locked(path, level + 1))
		btree_node_unlock(trans, path, level + 1);

	ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		return ERR_PTR(ret);

	BUG_ON(ret);

	if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
		     b->c.level != level ||
		     race_fault())) {
		six_unlock_type(&b->c.lock, lock_type);
		if (bch2_btree_node_relock(trans, path, level + 1))
			return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

		trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
	}

	if (unlikely(btree_node_read_in_flight(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
	}

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
	}

	EBUG_ON(b->c.btree_id != path->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}

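/*
 * Variant of bch2_btree_node_get() for when we don't have a btree_path (e.g.
 * the update_key_early path above): takes only a read lock, and on allocation
 * failure retries with the cannibalize lock held rather than restarting a
 * transaction.
 */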
struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
					 const struct bkey_i *k,
					 enum btree_id btree_id,
					 unsigned level,
					 bool nofill)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	if (c->opts.btree_node_mem_ptr_optimization) {
		b = btree_node_mem_ptr(k);
		if (b)
			goto lock_node;
	}
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		if (nofill)
			goto out;

		b = bch2_btree_node_fill(trans, NULL, k, btree_id,
					 level, SIX_LOCK_read, true);

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b) &&
		    !bch2_btree_cache_cannibalize_lock(trans, NULL))
			goto retry;

		if (IS_ERR(b))
			goto out;
	} else {
lock_node:
		ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			return ERR_PTR(ret);

		BUG_ON(ret);

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.btree_id != btree_id ||
			     b->c.level != level)) {
			six_unlock_read(&b->c.lock);
			goto retry;
		}
	}

	/* XXX: waiting on IO with btree locks held: */
	__bch2_btree_node_wait_on_read(b);

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_read(&b->c.lock);
		b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
		goto out;
	}

	EBUG_ON(b->c.btree_id != btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);
out:
	bch2_btree_cache_cannibalize_unlock(trans);
	return b;
}

int bch2_btree_node_prefetch(struct btree_trans *trans,
			     struct btree_path *path,
			     const struct bkey_i *k,
			     enum btree_id btree_id, unsigned level)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;

	BUG_ON(path && !btree_node_locked(path, level + 1));
	BUG_ON(level >= BTREE_MAX_DEPTH);

	struct btree *b = btree_cache_find(bc, k);
	if (b)
		return 0;

	b = bch2_btree_node_fill(trans, path, k, btree_id,
				 level, SIX_LOCK_read, false);
	int ret = PTR_ERR_OR_ZERO(b);
	if (ret)
		return ret;
	if (b)
		six_unlock_read(&b->c.lock);
	return 0;
}

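/*
 * Drop a node from the cache entirely, writing it out first if it's dirty:
 * used (e.g. by btree_gc, per the comment below) when the key pointing at the
 * node is going away and the cached copy must not be found again.
 */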
void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	b = btree_cache_find(bc, k);
	if (!b)
		return;

	BUG_ON(b == btree_node_root(trans->c, b));
wait_on_io:
	/* not allowed to wait on io with btree locks held: */

	/* XXX we're called from btree_gc which will be holding other btree
	 * nodes locked
	 */
	__bch2_btree_node_wait_on_read(b);
	__bch2_btree_node_wait_on_write(b);

	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
	if (unlikely(b->hash_val != btree_ptr_hash_val(k)))
		goto out;

	if (btree_node_dirty(b)) {
		__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}

	BUG_ON(btree_node_dirty(b));

	mutex_lock(&bc->lock);
	bch2_btree_node_hash_remove(bc, b);
	btree_node_data_free(bc, b);
	mutex_unlock(&bc->lock);
out:
	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
}

const char *bch2_btree_id_str(enum btree_id btree)
{
	return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)";
}

void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree)
{
	if (btree < BTREE_ID_NR)
		prt_str(out, __bch2_btree_ids[btree]);
	else
		prt_printf(out, "(unknown btree %u)", btree);
}

void bch2_btree_id_level_to_text(struct printbuf *out, enum btree_id btree, unsigned level)
{
	prt_str(out, "btree=");
	bch2_btree_id_to_text(out, btree);
	prt_printf(out, " level=%u", level);
}

void __bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c,
			      enum btree_id btree, unsigned level, struct bkey_s_c k)
{
	bch2_btree_id_to_text(out, btree);
	prt_printf(out, " level %u/", level);
	struct btree_root *r = bch2_btree_id_root(c, btree);
	if (r)
		prt_printf(out, "%u", r->level);
	else
		prt_printf(out, "(unknown)");
	prt_printf(out, "\n  ");

	bch2_bkey_val_to_text(out, c, k);
}

void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
	__bch2_btree_pos_to_text(out, c, b->c.btree_id, b->c.level, bkey_i_to_s_c(&b->key));
}

void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
	struct bset_stats stats;

	memset(&stats, 0, sizeof(stats));

	bch2_btree_keys_stats(b, &stats);

	prt_printf(out, "l %u ", b->c.level);
	bch2_bpos_to_text(out, b->data->min_key);
	prt_printf(out, " - ");
	bch2_bpos_to_text(out, b->data->max_key);
	prt_printf(out, ":\n"
		   "    ptrs: ");
	bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
	prt_newline(out);

	prt_printf(out,
		   "    format: ");
	bch2_bkey_format_to_text(out, &b->format);

	prt_printf(out,
		   "    unpack fn len: %u\n"
		   "    bytes used %zu/%zu (%zu%% full)\n"
		   "    sib u64s: %u, %u (merge threshold %u)\n"
		   "    nr packed keys %u\n"
		   "    nr unpacked keys %u\n"
		   "    floats %zu\n"
		   "    failed unpacked %zu\n",
		   b->unpack_fn_len,
		   b->nr.live_u64s * sizeof(u64),
		   btree_buf_bytes(b) - sizeof(struct btree_node),
		   b->nr.live_u64s * 100 / btree_max_u64s(c),
		   b->sib_u64s[0],
		   b->sib_u64s[1],
		   c->btree_foreground_merge_threshold,
		   b->nr.packed_keys,
		   b->nr.unpacked_keys,
		   stats.floats,
		   stats.failed);
}

static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c,
				 const char *label, size_t nr)
{
	prt_printf(out, "%s\t", label);
	prt_human_readable_u64(out, nr * c->opts.btree_node_size);
	prt_printf(out, " (%zu)\n", nr);
}

static const char * const bch2_btree_cache_not_freed_reasons_strs[] = {
#define x(n) #n,
	BCH_BTREE_CACHE_NOT_FREED_REASONS()
#undef x
	NULL
};

void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);

	if (!out->nr_tabstops)
		printbuf_tabstop_push(out, 32);

	prt_btree_cache_line(out, c, "live:",		bc->live[0].nr);
	prt_btree_cache_line(out, c, "pinned:",		bc->live[1].nr);
	prt_btree_cache_line(out, c, "freeable:",	bc->nr_freeable);
	prt_btree_cache_line(out, c, "dirty:",		atomic_long_read(&bc->nr_dirty));
	prt_printf(out, "cannibalize lock:\t%p\n",	bc->alloc_lock);
	prt_newline(out);

	for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) {
		bch2_btree_id_to_text(out, i);
		prt_printf(out, "\t");
		prt_human_readable_u64(out, bc->nr_by_btree[i] * c->opts.btree_node_size);
		prt_printf(out, " (%zu)\n", bc->nr_by_btree[i]);
	}

	prt_newline(out);
	prt_printf(out, "freed:\t%zu\n", bc->nr_freed);
	prt_printf(out, "not freed:\n");

	for (unsigned i = 0; i < ARRAY_SIZE(bc->not_freed); i++)
		prt_printf(out, "  %s\t%llu\n",
			   bch2_btree_cache_not_freed_reasons_strs[i], bc->not_freed[i]);
}