// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bbpos.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
#include "trace.h"

#include <linux/prefetch.h>
#include <linux/sched/mm.h>
#include <linux/swap.h>

#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
do { \
	if (shrinker_counter) \
		bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_##counter]++; \
} while (0)

const char * const bch2_btree_node_flags[] = {
	"typebit",
	"typebit",
	"typebit",
#define x(f)	[BTREE_NODE_##f] = #f,
	BTREE_FLAGS()
#undef x
	NULL
};

void bch2_recalc_btree_reserve(struct bch_fs *c)
{
	unsigned reserve = 16;

	if (!c->btree_roots_known[0].b)
		reserve += 8;

	for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->b)
			reserve += min_t(unsigned, 1, r->b->c.level) * 8;
	}

	c->btree_cache.nr_reserve = reserve;
}

static inline size_t btree_cache_can_free(struct btree_cache_list *list)
{
	struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);

	size_t can_free = list->nr;
	if (!list->idx)
		can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve);
	return can_free;
}

static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(!list_empty(&b->list));

	if (b->c.lock.readers)
		list_add(&b->list, &bc->freed_pcpu);
	else
		list_add(&b->list, &bc->freed_nonpcpu);
}

static void __bch2_btree_node_to_freelist(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(!list_empty(&b->list));
	BUG_ON(!b->data);

	bc->nr_freeable++;
	list_add(&b->list, &bc->freeable);
}

void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
{
	struct btree_cache *bc = &c->btree_cache;

	mutex_lock(&bc->lock);
	__bch2_btree_node_to_freelist(bc, b);
	mutex_unlock(&bc->lock);

	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
}

static void __btree_node_data_free(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(!list_empty(&b->list));
	BUG_ON(btree_node_hashed(b));

	/*
	 * This should really be done in slub/vmalloc, but we're using the
	 * kmalloc_large() path, so we're working around a slub bug by doing
	 * this here:
	 */
	if (b->data)
		mm_account_reclaimed_pages(btree_buf_bytes(b) / PAGE_SIZE);
	if (b->aux_data)
		mm_account_reclaimed_pages(btree_aux_data_bytes(b) / PAGE_SIZE);

	EBUG_ON(btree_node_write_in_flight(b));

	clear_btree_node_just_written(b);

	kvfree(b->data);
	b->data = NULL;
#ifdef __KERNEL__
	kvfree(b->aux_data);
#else
	munmap(b->aux_data, btree_aux_data_bytes(b));
#endif
	b->aux_data = NULL;

	btree_node_to_freedlist(bc, b);
}

static void btree_node_data_free(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(list_empty(&b->list));
	list_del_init(&b->list);
	--bc->nr_freeable;
	__btree_node_data_free(bc, b);
}

static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
				   const void *obj)
{
	const struct btree *b = obj;
	const u64 *v = arg->key;

	return b->hash_val == *v ? 0 : 1;
}

static const struct rhashtable_params bch_btree_cache_params = {
	.head_offset		= offsetof(struct btree, hash),
	.key_offset		= offsetof(struct btree, hash_val),
	.key_len		= sizeof(u64),
	.obj_cmpfn		= bch2_btree_cache_cmp_fn,
	.automatic_shrinking	= true,
};

static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
	BUG_ON(b->data || b->aux_data);

	gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;

	b->data = kvmalloc(btree_buf_bytes(b), gfp);
	if (!b->data)
		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
#ifdef __KERNEL__
	b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
#else
	b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
			   PROT_READ|PROT_WRITE|PROT_EXEC,
			   MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
	if (b->aux_data == MAP_FAILED)
		b->aux_data = NULL;
#endif
	if (!b->aux_data) {
		kvfree(b->data);
		b->data = NULL;
		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
	}

	return 0;
}

static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
{
	struct btree *b;

	b = kzalloc(sizeof(struct btree), gfp);
	if (!b)
		return NULL;

	bkey_btree_ptr_init(&b->key);
	INIT_LIST_HEAD(&b->list);
	INIT_LIST_HEAD(&b->write_blocked);
	b->byte_order = ilog2(c->opts.btree_node_size);
	return b;
}

struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	b = __btree_node_mem_alloc(c, GFP_KERNEL);
	if (!b)
		return NULL;

	if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
		kfree(b);
		return NULL;
	}

	bch2_btree_lock_init(&b->c, 0, GFP_KERNEL);

	__bch2_btree_node_to_freelist(bc, b);
	return b;
}

static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b)
{
	struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);

	u64 mask = bc->pinned_nodes_mask[!!b->c.level];

	return ((mask & BIT_ULL(b->c.btree_id)) &&
		bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
		bbpos_cmp(bc->pinned_nodes_end, pos) >= 0);
}

void bch2_node_pin(struct bch_fs *c, struct btree *b)
{
	struct btree_cache *bc = &c->btree_cache;

	mutex_lock(&bc->lock);
	if (b != btree_node_root(c, b) && !btree_node_pinned(b)) {
		set_btree_node_pinned(b);
		list_move(&b->list, &bc->live[1].list);
		bc->live[0].nr--;
		bc->live[1].nr++;
	}
	mutex_unlock(&bc->lock);
}

void bch2_btree_cache_unpin(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b, *n;

	mutex_lock(&bc->lock);
	c->btree_cache.pinned_nodes_mask[0] = 0;
	c->btree_cache.pinned_nodes_mask[1] = 0;

	list_for_each_entry_safe(b, n, &bc->live[1].list, list) {
		clear_btree_node_pinned(b);
		list_move(&b->list, &bc->live[0].list);
		bc->live[0].nr++;
		bc->live[1].nr--;
	}

	mutex_unlock(&bc->lock);
}

/* Btree in memory cache - hash table */

void __bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
	lockdep_assert_held(&bc->lock);

	int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
	BUG_ON(ret);

	/* Cause future lookups for this node to fail: */
	b->hash_val = 0;

	if (b->c.btree_id < BTREE_ID_NR)
		--bc->nr_by_btree[b->c.btree_id];
	--bc->live[btree_node_pinned(b)].nr;
	list_del_init(&b->list);
}

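/*
 * Remove the node from the hash table and put it back on the freeable list;
 * its data buffers stay allocated. bc->lock must be held.
 */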
void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
	__bch2_btree_node_hash_remove(bc, b);
	__bch2_btree_node_to_freelist(bc, b);
}

int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(!list_empty(&b->list));
	BUG_ON(b->hash_val);

	b->hash_val = btree_ptr_hash_val(&b->key);
	int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash,
						bch_btree_cache_params);
	if (ret)
		return ret;

	if (b->c.btree_id < BTREE_ID_NR)
		bc->nr_by_btree[b->c.btree_id]++;

	bool p = __btree_node_pinned(bc, b);
	mod_bit(BTREE_NODE_pinned, &b->flags, p);

	list_add_tail(&b->list, &bc->live[p].list);
	bc->live[p].nr++;
	return 0;
}

int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
				unsigned level, enum btree_id id)
{
	b->c.level = level;
	b->c.btree_id = id;

	mutex_lock(&bc->lock);
	int ret = __bch2_btree_node_hash_insert(bc, b);
	mutex_unlock(&bc->lock);

	return ret;
}

void bch2_btree_node_update_key_early(struct btree_trans *trans,
				      enum btree_id btree, unsigned level,
				      struct bkey_s_c old, struct bkey_i *new)
{
	struct bch_fs *c = trans->c;
	struct btree *b;
	struct bkey_buf tmp;
	int ret;

	bch2_bkey_buf_init(&tmp);
	bch2_bkey_buf_reassemble(&tmp, c, old);

	b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
	if (!IS_ERR_OR_NULL(b)) {
		mutex_lock(&c->btree_cache.lock);

		__bch2_btree_node_hash_remove(&c->btree_cache, b);

		bkey_copy(&b->key, new);
		ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
		BUG_ON(ret);

		mutex_unlock(&c->btree_cache.lock);
		six_unlock_read(&b->c.lock);
	}

	bch2_bkey_buf_exit(&tmp, c);
}

__flatten
static inline struct btree *btree_cache_find(struct btree_cache *bc,
					     const struct bkey_i *k)
{
	u64 v = btree_ptr_hash_val(k);

	return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
}

/*
 * this version is for btree nodes that have already been freed (we're not
 * reaping a real btree node)
 */
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
{
	struct btree_cache *bc = &c->btree_cache;
	int ret = 0;

	lockdep_assert_held(&bc->lock);
wait_on_io:
	if (b->flags & ((1U << BTREE_NODE_dirty)|
			(1U << BTREE_NODE_read_in_flight)|
			(1U << BTREE_NODE_write_in_flight))) {
		if (!flush) {
			if (btree_node_dirty(b))
				BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
			else if (btree_node_read_in_flight(b))
				BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
			else if (btree_node_write_in_flight(b))
				BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
			return -BCH_ERR_ENOMEM_btree_node_reclaim;
		}

		/* XXX: waiting on IO with btree cache lock held */
		bch2_btree_node_wait_on_read(b);
		bch2_btree_node_wait_on_write(b);
	}

	if (!six_trylock_intent(&b->c.lock)) {
		BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
		return -BCH_ERR_ENOMEM_btree_node_reclaim;
	}

	if (!six_trylock_write(&b->c.lock)) {
		BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
		goto out_unlock_intent;
	}

	/* recheck under lock */
	if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
			(1U << BTREE_NODE_write_in_flight))) {
		if (!flush) {
			if (btree_node_read_in_flight(b))
				BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
			else if (btree_node_write_in_flight(b))
				BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
			goto out_unlock;
		}
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}

	if (btree_node_noevict(b)) {
		BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
		goto out_unlock;
	}
	if (btree_node_write_blocked(b)) {
		BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
		goto out_unlock;
	}
	if (btree_node_will_make_reachable(b)) {
		BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
		goto out_unlock;
	}

	if (btree_node_dirty(b)) {
		if (!flush) {
			BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
			goto out_unlock;
		}
		/*
		 * Using the underscore version because we don't want to compact
		 * bsets after the write, since this node is about to be evicted
		 * - unless btree verify mode is enabled, since it runs out of
		 * the post write cleanup:
		 */
		if (bch2_verify_btree_ondisk)
			bch2_btree_node_write(c, b, SIX_LOCK_intent,
					      BTREE_WRITE_cache_reclaim);
		else
			__bch2_btree_node_write(c, b,
						BTREE_WRITE_cache_reclaim);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}
out:
	if (b->hash_val && !ret)
		trace_and_count(c, btree_cache_reap, c, b);
	return ret;
out_unlock:
	six_unlock_write(&b->c.lock);
out_unlock_intent:
	six_unlock_intent(&b->c.lock);
	ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
	goto out;
}

static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
{
	return __btree_node_reclaim(c, b, false, shrinker_counter);
}

static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, true, false);
}

static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
					   struct shrink_control *sc)
{
	struct btree_cache_list *list = shrink->private_data;
	struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
	struct btree *b, *t;
	unsigned long nr = sc->nr_to_scan;
	unsigned long can_free = 0;
	unsigned long freed = 0;
	unsigned long touched = 0;
	unsigned i, flags;
	unsigned long ret = SHRINK_STOP;
	bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4;

	if (bch2_btree_shrinker_disabled)
		return SHRINK_STOP;

	mutex_lock(&bc->lock);
	flags = memalloc_nofs_save();

	/*
	 * It's _really_ critical that we don't free too many btree nodes - we
	 * have to always leave ourselves a reserve.  The reserve is how we
	 * guarantee that allocating memory for a new btree node can always
	 * succeed, so that inserting keys into the btree can always succeed and
	 * IO can always make forward progress:
	 */
	can_free = btree_cache_can_free(list);
	nr = min_t(unsigned long, nr, can_free);

	i = 0;
	list_for_each_entry_safe(b, t, &bc->freeable, list) {
		/*
		 * Leave a few nodes on the freeable list, so that a btree split
		 * won't have to hit the system allocator:
		 */
		if (++i <= 3)
			continue;

		touched++;

		if (touched >= nr)
			goto out;

		if (!btree_node_reclaim(c, b, true)) {
			btree_node_data_free(bc, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);
			freed++;
			bc->nr_freed++;
		}
	}
restart:
	list_for_each_entry_safe(b, t, &list->list, list) {
		touched++;

		if (btree_node_accessed(b)) {
			clear_btree_node_accessed(b);
			bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++;
			--touched;
		} else if (!btree_node_reclaim(c, b, true)) {
			__bch2_btree_node_hash_remove(bc, b);
			__btree_node_data_free(bc, b);

			freed++;
			bc->nr_freed++;

			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);

			if (freed == nr)
				goto out_rotate;
		} else if (trigger_writes &&
			   btree_node_dirty(b) &&
			   !btree_node_will_make_reachable(b) &&
			   !btree_node_write_blocked(b) &&
			   six_trylock_read(&b->c.lock)) {
			list_move(&list->list, &b->list);
			mutex_unlock(&bc->lock);
			__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
			six_unlock_read(&b->c.lock);
			if (touched >= nr)
				goto out_nounlock;
			mutex_lock(&bc->lock);
			goto restart;
		}

		if (touched >= nr)
			break;
	}
out_rotate:
	if (&t->list != &list->list)
		list_move_tail(&list->list, &t->list);
out:
	mutex_unlock(&bc->lock);
out_nounlock:
	ret = freed;
	memalloc_nofs_restore(flags);
	trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
	return ret;
}

static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
					    struct shrink_control *sc)
{
	struct btree_cache_list *list = shrink->private_data;

	if (bch2_btree_shrinker_disabled)
		return 0;

	return btree_cache_can_free(list);
}

void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b, *t;
	unsigned long flags;

	shrinker_free(bc->live[1].shrink);
	shrinker_free(bc->live[0].shrink);

	/* vfree() can allocate memory: */
	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

	if (c->verify_data)
		list_move(&c->verify_data->list, &bc->live[0].list);

	kvfree(c->verify_ondisk);

	for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->b)
			list_add(&r->b->list, &bc->live[0].list);
	}

	list_for_each_entry_safe(b, t, &bc->live[1].list, list)
		bch2_btree_node_hash_remove(bc, b);
	list_for_each_entry_safe(b, t, &bc->live[0].list, list)
		bch2_btree_node_hash_remove(bc, b);

	list_for_each_entry_safe(b, t, &bc->freeable, list) {
		BUG_ON(btree_node_read_in_flight(b) ||
		       btree_node_write_in_flight(b));

		btree_node_data_free(bc, b);
		cond_resched();
	}

	BUG_ON(!bch2_journal_error(&c->journal) &&
	       atomic_long_read(&c->btree_cache.nr_dirty));

	list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);

	list_for_each_entry_safe(b, t, &bc->freed_nonpcpu, list) {
		list_del(&b->list);
		six_lock_exit(&b->c.lock);
		kfree(b);
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);

	for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
		BUG_ON(bc->nr_by_btree[i]);
	BUG_ON(bc->live[0].nr);
	BUG_ON(bc->live[1].nr);
	BUG_ON(bc->nr_freeable);

	if (bc->table_init_done)
		rhashtable_destroy(&bc->table);
}

int bch2_fs_btree_cache_init(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct shrinker *shrink;
	unsigned i;
	int ret = 0;

	ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
	if (ret)
		goto err;

	bc->table_init_done = true;

	bch2_recalc_btree_reserve(c);

	for (i = 0; i < bc->nr_reserve; i++)
		if (!__bch2_btree_node_mem_alloc(c))
			goto err;

	list_splice_init(&bc->live[0].list, &bc->freeable);

	mutex_init(&c->verify_lock);

	shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
	if (!shrink)
		goto err;
	bc->live[0].shrink	= shrink;
	shrink->count_objects	= bch2_btree_cache_count;
	shrink->scan_objects	= bch2_btree_cache_scan;
	shrink->seeks		= 2;
	shrink->private_data	= &bc->live[0];
	shrinker_register(shrink);

	shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name);
	if (!shrink)
		goto err;
	bc->live[1].shrink	= shrink;
	shrink->count_objects	= bch2_btree_cache_count;
	shrink->scan_objects	= bch2_btree_cache_scan;
	shrink->seeks		= 8;
	shrink->private_data	= &bc->live[1];
	shrinker_register(shrink);

	return 0;
err:
	return -BCH_ERR_ENOMEM_fs_btree_cache_init;
}

void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
	mutex_init(&bc->lock);
	for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) {
		bc->live[i].idx = i;
		INIT_LIST_HEAD(&bc->live[i].list);
	}
	INIT_LIST_HEAD(&bc->freeable);
	INIT_LIST_HEAD(&bc->freed_pcpu);
	INIT_LIST_HEAD(&bc->freed_nonpcpu);
}

/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open-coded mutex to ensure that, taken by
 * bch2_btree_cache_cannibalize_lock(). This means every time we unlock the
 * root of the btree, we need to release this lock if we have it held.
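 *
 * The lock is bc->alloc_lock, a task_struct pointer taken with cmpxchg;
 * waiters sleep on the bc->alloc_wait closure waitlist until it is released.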
 */
void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;

	if (bc->alloc_lock == current) {
		trace_and_count(c, btree_cache_cannibalize_unlock, trans);
		bc->alloc_lock = NULL;
		closure_wake_up(&bc->alloc_wait);
	}
}

int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct task_struct *old;

	old = NULL;
	if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current)
		goto success;

	if (!cl) {
		trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
		return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
	}

	closure_wait(&bc->alloc_wait, cl);

	/* Try again, after adding ourselves to waitlist */
	old = NULL;
	if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) {
		/* We raced */
		closure_wake_up(&bc->alloc_wait);
		goto success;
	}

	trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
	return -BCH_ERR_btree_cache_cannibalize_lock_blocked;

success:
	trace_and_count(c, btree_cache_cannibalize_lock, trans);
	return 0;
}

static struct btree *btree_node_cannibalize(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
		list_for_each_entry_reverse(b, &bc->live[i].list, list)
			if (!btree_node_reclaim(c, b, false))
				return b;

	while (1) {
		for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
			list_for_each_entry_reverse(b, &bc->live[i].list, list)
				if (!btree_node_write_and_reclaim(c, b))
					return b;

		/*
		 * Rare case: all nodes were intent-locked.
		 * Just busy-wait.
		 */
		WARN_ONCE(1, "btree cache cannibalize failed\n");
		cond_resched();
	}
}

struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct list_head *freed = pcpu_read_locks
		? &bc->freed_pcpu
		: &bc->freed_nonpcpu;
	struct btree *b, *b2;
	u64 start_time = local_clock();

	mutex_lock(&bc->lock);

	/*
	 * We never free struct btree itself, just the memory that holds the on
	 * disk node. Check the freed list before allocating a new one:
	 */
	list_for_each_entry(b, freed, list)
		if (!btree_node_reclaim(c, b, false)) {
			list_del_init(&b->list);
			goto got_node;
		}

	b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
	if (b) {
		bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_NOWAIT);
	} else {
		mutex_unlock(&bc->lock);
		bch2_trans_unlock(trans);
		b = __btree_node_mem_alloc(c, GFP_KERNEL);
		if (!b)
			goto err;
		bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL);
		mutex_lock(&bc->lock);
	}

	BUG_ON(!six_trylock_intent(&b->c.lock));
	BUG_ON(!six_trylock_write(&b->c.lock));

got_node:
	/*
	 * btree_free() doesn't free memory; it sticks the node on the end of
	 * the list.  Check if there are any freed nodes there:
	 */
	list_for_each_entry(b2, &bc->freeable, list)
		if (!btree_node_reclaim(c, b2, false)) {
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);

			list_del_init(&b2->list);
			--bc->nr_freeable;
			btree_node_to_freedlist(bc, b2);
			mutex_unlock(&bc->lock);

			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
			goto got_mem;
		}

	mutex_unlock(&bc->lock);

	if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
		bch2_trans_unlock(trans);
		if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
			goto err;
	}

got_mem:
	BUG_ON(!list_empty(&b->list));
	BUG_ON(btree_node_hashed(b));
	BUG_ON(btree_node_dirty(b));
	BUG_ON(btree_node_write_in_flight(b));
out:
	b->flags = 0;
	b->written = 0;
	b->nsets = 0;
	b->sib_u64s[0] = 0;
	b->sib_u64s[1] = 0;
	b->whiteout_u64s = 0;
	bch2_btree_keys_init(b);
	set_btree_node_accessed(b);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
			       start_time);

	int ret = bch2_trans_relock(trans);
	if (unlikely(ret)) {
		bch2_btree_node_to_freelist(c, b);
		return ERR_PTR(ret);
	}

	return b;
err:
	mutex_lock(&bc->lock);

	/* Try to cannibalize another cached btree node: */
	if (bc->alloc_lock == current) {
		b2 = btree_node_cannibalize(c);
		clear_btree_node_just_written(b2);
		__bch2_btree_node_hash_remove(bc, b2);

		if (b) {
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);
			btree_node_to_freedlist(bc, b2);
			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
		} else {
			b = b2;
		}

		BUG_ON(!list_empty(&b->list));
		mutex_unlock(&bc->lock);

		trace_and_count(c, btree_cache_cannibalize, trans);
		goto out;
	}

	mutex_unlock(&bc->lock);
	return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
}

/* Slowpath, don't want it inlined into btree_iter_traverse() */
static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
						   struct btree_path *path,
						   const struct bkey_i *k,
						   enum btree_id btree_id,
						   unsigned level,
						   enum six_lock_type lock_type,
						   bool sync)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	if (unlikely(level >= BTREE_MAX_DEPTH)) {
		int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u",
						 level, BTREE_MAX_DEPTH);
		return ERR_PTR(ret);
	}

	if (unlikely(!bkey_is_btree_ptr(&k->k))) {
		struct printbuf buf = PRINTBUF;
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));

		int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf);
		printbuf_exit(&buf);
		return ERR_PTR(ret);
	}

	if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) {
		struct printbuf buf = PRINTBUF;
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));

		int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf);
		printbuf_exit(&buf);
		return ERR_PTR(ret);
	}

	/*
	 * Parent node must be locked, else we could read in a btree node that's
	 * been freed:
	 */
	if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
		trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
	}

	b = bch2_btree_node_mem_alloc(trans, level != 0);

	if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
		if (!path)
			return b;

		trans->memory_allocation_failure = true;
		trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
	}

	if (IS_ERR(b))
		return b;

	bkey_copy(&b->key, k);
	if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
		/* raced with another fill: */

		/* mark as unhashed... */
		b->hash_val = 0;

		mutex_lock(&bc->lock);
		__bch2_btree_node_to_freelist(bc, b);
		mutex_unlock(&bc->lock);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		return NULL;
	}

	set_btree_node_read_in_flight(b);
	six_unlock_write(&b->c.lock);

	if (path) {
		u32 seq = six_lock_seq(&b->c.lock);

		/* Unlock before doing IO: */
		six_unlock_intent(&b->c.lock);
		bch2_trans_unlock_noassert(trans);

		bch2_btree_node_read(trans, b, sync);

		int ret = bch2_trans_relock(trans);
		if (ret)
			return ERR_PTR(ret);

		if (!sync)
			return NULL;

		if (!six_relock_type(&b->c.lock, lock_type, seq))
			b = NULL;
	} else {
		bch2_btree_node_read(trans, b, sync);
		if (lock_type == SIX_LOCK_read)
			six_lock_downgrade(&b->c.lock);
	}

	return b;
}

static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
{
	struct printbuf buf = PRINTBUF;

	if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
		return;

	prt_printf(&buf,
		   "btree node header doesn't match ptr: ");
	bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
	prt_str(&buf, "\nptr: ");
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	prt_str(&buf, "\nheader: ");
	bch2_btree_id_level_to_text(&buf, BTREE_NODE_ID(b->data), BTREE_NODE_LEVEL(b->data));
	prt_str(&buf, "\nmin ");
	bch2_bpos_to_text(&buf, b->data->min_key);

	prt_printf(&buf, "\nmax ");
	bch2_bpos_to_text(&buf, b->data->max_key);

	bch2_fs_topology_error(c, "%s", buf.buf);

	printbuf_exit(&buf);
}

static inline void btree_check_header(struct bch_fs *c, struct btree *b)
{
	if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
	    b->c.level != BTREE_NODE_LEVEL(b->data) ||
	    !bpos_eq(b->data->max_key, b->key.k.p) ||
	    (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
	     !bpos_eq(b->data->min_key,
		      bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
		btree_bad_header(c, b);
}

static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
					   const struct bkey_i *k, unsigned level,
					   enum six_lock_type lock_type,
					   unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	bool need_relock = false;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		/*
		 * We must have the parent locked to call bch2_btree_node_fill(),
		 * else we could read in a btree node from disk that's been
		 * freed:
		 */
		b = bch2_btree_node_fill(trans, path, k, path->btree_id,
					 level, lock_type, true);
		need_relock = true;

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b))
			return b;
	} else {
		if (btree_node_read_locked(path, level + 1))
			btree_node_unlock(trans, path, level + 1);

		ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			return ERR_PTR(ret);

		BUG_ON(ret);

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.level != level ||
			     race_fault())) {
			six_unlock_type(&b->c.lock, lock_type);
			if (bch2_btree_node_relock(trans, path, level + 1))
				goto retry;

			trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
			return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
		}

		/* avoid atomic set bit if it's not needed: */
		if (!btree_node_accessed(b))
			set_btree_node_accessed(b);
	}

	if (unlikely(btree_node_read_in_flight(b))) {
		u32 seq = six_lock_seq(&b->c.lock);

		six_unlock_type(&b->c.lock, lock_type);
		bch2_trans_unlock(trans);
		need_relock = true;

		bch2_btree_node_wait_on_read(b);

		ret = bch2_trans_relock(trans);
		if (ret)
			return ERR_PTR(ret);

		/*
		 * should_be_locked is not set on this path yet, so we need to
		 * relock it specifically:
		 */
		if (!six_relock_type(&b->c.lock, lock_type, seq))
			goto retry;
	}

	if (unlikely(need_relock)) {
		ret = bch2_trans_relock(trans) ?:
			bch2_btree_path_relock_intent(trans, path);
		if (ret) {
			six_unlock_type(&b->c.lock, lock_type);
			return ERR_PTR(ret);
		}
	}

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
	}

	EBUG_ON(b->c.btree_id != path->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}

/**
 * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * @trans:	btree transaction object
 * @path:	btree_path being traversed
 * @k:		pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
 * @level:	level of btree node being looked up (0 == leaf node)
 * @lock_type:	SIX_LOCK_read or SIX_LOCK_intent
 * @trace_ip:	ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
 *
 * The btree node will be returned with either a read or an intent lock held,
 * depending on @lock_type.
 *
 * Returns: btree node or ERR_PTR()
 */
struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
				  const struct bkey_i *k, unsigned level,
				  enum six_lock_type lock_type,
				  unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree *b;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_node_mem_ptr(k);

	/*
	 * Check b->hash_val _before_ calling btree_node_lock() - this might not
	 * be the node we want anymore, and trying to lock the wrong node could
	 * cause an unnecessary transaction restart:
	 */
	if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
		     !b ||
		     b->hash_val != btree_ptr_hash_val(k)))
		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

	if (btree_node_read_locked(path, level + 1))
		btree_node_unlock(trans, path, level + 1);

	ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		return ERR_PTR(ret);

	BUG_ON(ret);

	if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
		     b->c.level != level ||
		     race_fault())) {
		six_unlock_type(&b->c.lock, lock_type);
		if (bch2_btree_node_relock(trans, path, level + 1))
			return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

		trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
	}

	if (unlikely(btree_node_read_in_flight(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
	}

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
	}

	EBUG_ON(b->c.btree_id != path->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}

struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
					 const struct bkey_i *k,
					 enum btree_id btree_id,
					 unsigned level,
					 bool nofill)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	if (c->opts.btree_node_mem_ptr_optimization) {
		b = btree_node_mem_ptr(k);
		if (b)
			goto lock_node;
	}
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		if (nofill)
			goto out;

		b = bch2_btree_node_fill(trans, NULL, k, btree_id,
					 level, SIX_LOCK_read, true);

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b) &&
		    !bch2_btree_cache_cannibalize_lock(trans, NULL))
			goto retry;

		if (IS_ERR(b))
			goto out;
	} else {
lock_node:
		ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			return ERR_PTR(ret);

		BUG_ON(ret);

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.btree_id != btree_id ||
			     b->c.level != level)) {
			six_unlock_read(&b->c.lock);
			goto retry;
		}
	}

	/* XXX: waiting on IO with btree locks held: */
	__bch2_btree_node_wait_on_read(b);

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_read(&b->c.lock);
		b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
		goto out;
	}

	EBUG_ON(b->c.btree_id != btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);
out:
	bch2_btree_cache_cannibalize_unlock(trans);
	return b;
}

int bch2_btree_node_prefetch(struct btree_trans *trans,
			     struct btree_path *path,
			     const struct bkey_i *k,
			     enum btree_id btree_id, unsigned level)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;

	BUG_ON(path && !btree_node_locked(path, level + 1));
	BUG_ON(level >= BTREE_MAX_DEPTH);

	struct btree *b = btree_cache_find(bc, k);
	if (b)
		return 0;

	b = bch2_btree_node_fill(trans, path, k, btree_id,
				 level, SIX_LOCK_read, false);
	int ret = PTR_ERR_OR_ZERO(b);
	if (ret)
		return ret;
	if (b)
		six_unlock_read(&b->c.lock);
	return 0;
}

void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	b = btree_cache_find(bc, k);
	if (!b)
		return;

	BUG_ON(b == btree_node_root(trans->c, b));
wait_on_io:
	/* not allowed to wait on io with btree locks held: */

	/* XXX we're called from btree_gc which will be holding other btree
	 * nodes locked
	 */
	__bch2_btree_node_wait_on_read(b);
	__bch2_btree_node_wait_on_write(b);

	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
	if (unlikely(b->hash_val != btree_ptr_hash_val(k)))
		goto out;

	if (btree_node_dirty(b)) {
		__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}

	BUG_ON(btree_node_dirty(b));

	mutex_lock(&bc->lock);
	bch2_btree_node_hash_remove(bc, b);
	btree_node_data_free(bc, b);
	mutex_unlock(&bc->lock);
out:
	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
}

const char *bch2_btree_id_str(enum btree_id btree)
{
	return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)";
}

void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree)
{
	if (btree < BTREE_ID_NR)
		prt_str(out, __bch2_btree_ids[btree]);
	else
		prt_printf(out, "(unknown btree %u)", btree);
}

void bch2_btree_id_level_to_text(struct printbuf *out, enum btree_id btree, unsigned level)
{
	prt_str(out, "btree=");
	bch2_btree_id_to_text(out, btree);
	prt_printf(out, " level=%u", level);
}

void __bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c,
			      enum btree_id btree, unsigned level, struct bkey_s_c k)
{
	bch2_btree_id_to_text(out, btree);
	prt_printf(out, " level %u/", level);
	struct btree_root *r = bch2_btree_id_root(c, btree);
	if (r)
		prt_printf(out, "%u", r->level);
	else
		prt_printf(out, "(unknown)");
	prt_newline(out);

	bch2_bkey_val_to_text(out, c, k);
}

void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
	__bch2_btree_pos_to_text(out, c, b->c.btree_id, b->c.level, bkey_i_to_s_c(&b->key));
}

void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
	struct bset_stats stats;

	memset(&stats, 0, sizeof(stats));

	bch2_btree_keys_stats(b, &stats);

	prt_printf(out, "l %u ", b->c.level);
	bch2_bpos_to_text(out, b->data->min_key);
	prt_printf(out, " - ");
	bch2_bpos_to_text(out, b->data->max_key);
	prt_printf(out, ":\n"
		   "    ptrs: ");
	bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
	prt_newline(out);

	prt_printf(out,
		   "    format: ");
	bch2_bkey_format_to_text(out, &b->format);

	prt_printf(out,
		   "    unpack fn len: %u\n"
		   "    bytes used %zu/%zu (%zu%% full)\n"
		   "    sib u64s: %u, %u (merge threshold %u)\n"
		   "    nr packed keys %u\n"
		   "    nr unpacked keys %u\n"
		   "    floats %zu\n"
		   "    failed unpacked %zu\n",
		   b->unpack_fn_len,
		   b->nr.live_u64s * sizeof(u64),
		   btree_buf_bytes(b) - sizeof(struct btree_node),
		   b->nr.live_u64s * 100 / btree_max_u64s(c),
		   b->sib_u64s[0],
		   b->sib_u64s[1],
		   c->btree_foreground_merge_threshold,
		   b->nr.packed_keys,
		   b->nr.unpacked_keys,
		   stats.floats,
		   stats.failed);
}

static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c,
				 const char *label, size_t nr)
{
	prt_printf(out, "%s\t", label);
	prt_human_readable_u64(out, nr * c->opts.btree_node_size);
	prt_printf(out, " (%zu)\n", nr);
}

static const char * const bch2_btree_cache_not_freed_reasons_strs[] = {
#define x(n) #n,
	BCH_BTREE_CACHE_NOT_FREED_REASONS()
#undef x
	NULL
};

void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);

	if (!out->nr_tabstops)
		printbuf_tabstop_push(out, 32);

	prt_btree_cache_line(out, c, "live:", bc->live[0].nr);
	prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr);
	prt_btree_cache_line(out, c, "freeable:", bc->nr_freeable);
	prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty));
	prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
	prt_newline(out);

	for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) {
		bch2_btree_id_to_text(out, i);
		prt_printf(out, "\t");
"\t"); 1504 prt_human_readable_u64(out, bc->nr_by_btree[i] * c->opts.btree_node_size); 1505 prt_printf(out, " (%zu)\n", bc->nr_by_btree[i]); 1506 } 1507 1508 prt_newline(out); 1509 prt_printf(out, "freed:\t%zu\n", bc->nr_freed); 1510 prt_printf(out, "not freed:\n"); 1511 1512 for (unsigned i = 0; i < ARRAY_SIZE(bc->not_freed); i++) 1513 prt_printf(out, " %s\t%llu\n", 1514 bch2_btree_cache_not_freed_reasons_strs[i], bc->not_freed[i]); 1515 } 1516