// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
#include "trace.h"

#include <linux/prefetch.h>
#include <linux/sched/mm.h>

const char * const bch2_btree_node_flags[] = {
#define x(f)	#f,
	BTREE_FLAGS()
#undef x
	NULL
};

void bch2_recalc_btree_reserve(struct bch_fs *c)
{
	unsigned i, reserve = 16;

	if (!c->btree_roots_known[0].b)
		reserve += 8;

	for (i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->b)
			reserve += min_t(unsigned, 1, r->b->c.level) * 8;
	}

	c->btree_cache.reserve = reserve;
}

static inline unsigned btree_cache_can_free(struct btree_cache *bc)
{
	return max_t(int, 0, bc->used - bc->reserve);
}

static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
{
	if (b->c.lock.readers)
		list_move(&b->list, &bc->freed_pcpu);
	else
		list_move(&b->list, &bc->freed_nonpcpu);
}

static void btree_node_data_free(struct bch_fs *c, struct btree *b)
{
	struct btree_cache *bc = &c->btree_cache;

	EBUG_ON(btree_node_write_in_flight(b));

	clear_btree_node_just_written(b);

	kvpfree(b->data, btree_buf_bytes(b));
	b->data = NULL;
#ifdef __KERNEL__
	kvfree(b->aux_data);
#else
	munmap(b->aux_data, btree_aux_data_bytes(b));
#endif
	b->aux_data = NULL;

	bc->used--;

	btree_node_to_freedlist(bc, b);
}

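/*
 * The btree node cache is indexed by an rhashtable keyed off
 * btree_ptr_hash_val() of the node's key; b->hash_val caches that hash so
 * that lookups and the compare function below only touch a single u64.
 */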
static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
				   const void *obj)
{
	const struct btree *b = obj;
	const u64 *v = arg->key;

	return b->hash_val == *v ? 0 : 1;
}

static const struct rhashtable_params bch_btree_cache_params = {
	.head_offset	= offsetof(struct btree, hash),
	.key_offset	= offsetof(struct btree, hash_val),
	.key_len	= sizeof(u64),
	.obj_cmpfn	= bch2_btree_cache_cmp_fn,
};

static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
	BUG_ON(b->data || b->aux_data);

	b->data = kvpmalloc(btree_buf_bytes(b), gfp);
	if (!b->data)
		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
#ifdef __KERNEL__
	b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
#else
	b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
			   PROT_READ|PROT_WRITE|PROT_EXEC,
			   MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
	if (b->aux_data == MAP_FAILED)
		b->aux_data = NULL;
#endif
	if (!b->aux_data) {
		kvpfree(b->data, btree_buf_bytes(b));
		b->data = NULL;
		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
	}

	return 0;
}

static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
{
	struct btree *b;

	b = kzalloc(sizeof(struct btree), gfp);
	if (!b)
		return NULL;

	bkey_btree_ptr_init(&b->key);
	INIT_LIST_HEAD(&b->list);
	INIT_LIST_HEAD(&b->write_blocked);
	b->byte_order = ilog2(c->opts.btree_node_size);
	return b;
}

struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	b = __btree_node_mem_alloc(c, GFP_KERNEL);
	if (!b)
		return NULL;

	if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
		kfree(b);
		return NULL;
	}

	bch2_btree_lock_init(&b->c, 0);

	bc->used++;
	list_add(&b->list, &bc->freeable);
	return b;
}

/* Btree in memory cache - hash table */

void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
	int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);

	BUG_ON(ret);

	/* Cause future lookups for this node to fail: */
	b->hash_val = 0;
}

int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(b->hash_val);
	b->hash_val = btree_ptr_hash_val(&b->key);

	return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
					     bch_btree_cache_params);
}

int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
				unsigned level, enum btree_id id)
{
	int ret;

	b->c.level	= level;
	b->c.btree_id	= id;

	mutex_lock(&bc->lock);
	ret = __bch2_btree_node_hash_insert(bc, b);
	if (!ret)
		list_add_tail(&b->list, &bc->live);
	mutex_unlock(&bc->lock);

	return ret;
}

__flatten
static inline struct btree *btree_cache_find(struct btree_cache *bc,
					     const struct bkey_i *k)
{
	u64 v = btree_ptr_hash_val(k);

	return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
}

/*
 * this version is for btree nodes that have already been freed (we're not
 * reaping a real btree node)
 */
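/*
 * Attempt to make a cached btree node reclaimable: trylock intent + write,
 * and bail out if the node is dirty or has IO in flight - unless @flush is
 * set, in which case the node is written out and we wait on the IO instead.
 * Nodes marked noevict, write_blocked or will_make_reachable are never
 * reclaimed. Returns 0 with the node intent- and write-locked on success.
 */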
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
{
	struct btree_cache *bc = &c->btree_cache;
	int ret = 0;

	lockdep_assert_held(&bc->lock);
wait_on_io:
	if (b->flags & ((1U << BTREE_NODE_dirty)|
			(1U << BTREE_NODE_read_in_flight)|
			(1U << BTREE_NODE_write_in_flight))) {
		if (!flush)
			return -BCH_ERR_ENOMEM_btree_node_reclaim;

		/* XXX: waiting on IO with btree cache lock held */
		bch2_btree_node_wait_on_read(b);
		bch2_btree_node_wait_on_write(b);
	}

	if (!six_trylock_intent(&b->c.lock))
		return -BCH_ERR_ENOMEM_btree_node_reclaim;

	if (!six_trylock_write(&b->c.lock))
		goto out_unlock_intent;

	/* recheck under lock */
	if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
			(1U << BTREE_NODE_write_in_flight))) {
		if (!flush)
			goto out_unlock;
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}

	if (btree_node_noevict(b) ||
	    btree_node_write_blocked(b) ||
	    btree_node_will_make_reachable(b))
		goto out_unlock;

	if (btree_node_dirty(b)) {
		if (!flush)
			goto out_unlock;
		/*
		 * Using the underscore version because we don't want to compact
		 * bsets after the write, since this node is about to be evicted
		 * - unless btree verify mode is enabled, since it runs out of
		 * the post write cleanup:
		 */
		if (bch2_verify_btree_ondisk)
			bch2_btree_node_write(c, b, SIX_LOCK_intent,
					      BTREE_WRITE_cache_reclaim);
		else
			__bch2_btree_node_write(c, b,
						BTREE_WRITE_cache_reclaim);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}
out:
	if (b->hash_val && !ret)
		trace_and_count(c, btree_cache_reap, c, b);
	return ret;
out_unlock:
	six_unlock_write(&b->c.lock);
out_unlock_intent:
	six_unlock_intent(&b->c.lock);
	ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
	goto out;
}

static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, false);
}

static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, true);
}

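/*
 * Shrinker scan callback: walk the freeable list first (nodes whose buffers
 * can be dropped without touching the hash table), then the live list,
 * evicting nodes that haven't been accessed since the previous scan. When
 * most of the cache is dirty, also kick off writes so that future scans will
 * have something to reclaim.
 */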
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
					   struct shrink_control *sc)
{
	struct bch_fs *c = shrink->private_data;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b, *t;
	unsigned long nr = sc->nr_to_scan;
	unsigned long can_free = 0;
	unsigned long freed = 0;
	unsigned long touched = 0;
	unsigned i, flags;
	unsigned long ret = SHRINK_STOP;
	bool trigger_writes = atomic_read(&bc->dirty) + nr >=
		bc->used * 3 / 4;

	if (bch2_btree_shrinker_disabled)
		return SHRINK_STOP;

	mutex_lock(&bc->lock);
	flags = memalloc_nofs_save();

	/*
	 * It's _really_ critical that we don't free too many btree nodes - we
	 * have to always leave ourselves a reserve. The reserve is how we
	 * guarantee that allocating memory for a new btree node can always
	 * succeed, so that inserting keys into the btree can always succeed and
	 * IO can always make forward progress:
	 */
	can_free = btree_cache_can_free(bc);
	nr = min_t(unsigned long, nr, can_free);

	i = 0;
	list_for_each_entry_safe(b, t, &bc->freeable, list) {
		/*
		 * Leave a few nodes on the freeable list, so that a btree split
		 * won't have to hit the system allocator:
		 */
		if (++i <= 3)
			continue;

		touched++;

		if (touched >= nr)
			goto out;

		if (!btree_node_reclaim(c, b)) {
			btree_node_data_free(c, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);
			freed++;
		}
	}
restart:
	list_for_each_entry_safe(b, t, &bc->live, list) {
		touched++;

		if (btree_node_accessed(b)) {
			clear_btree_node_accessed(b);
		} else if (!btree_node_reclaim(c, b)) {
			freed++;
			btree_node_data_free(c, b);

			bch2_btree_node_hash_remove(bc, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);

			if (freed == nr)
				goto out_rotate;
		} else if (trigger_writes &&
			   btree_node_dirty(b) &&
			   !btree_node_will_make_reachable(b) &&
			   !btree_node_write_blocked(b) &&
			   six_trylock_read(&b->c.lock)) {
			list_move(&bc->live, &b->list);
			mutex_unlock(&bc->lock);
			__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
			six_unlock_read(&b->c.lock);
			if (touched >= nr)
				goto out_nounlock;
			mutex_lock(&bc->lock);
			goto restart;
		}

		if (touched >= nr)
			break;
	}
out_rotate:
	if (&t->list != &bc->live)
		list_move_tail(&bc->live, &t->list);
out:
	mutex_unlock(&bc->lock);
out_nounlock:
	ret = freed;
	memalloc_nofs_restore(flags);
	trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
	return ret;
}

static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
					    struct shrink_control *sc)
{
	struct bch_fs *c = shrink->private_data;
	struct btree_cache *bc = &c->btree_cache;

	if (bch2_btree_shrinker_disabled)
		return 0;

	return btree_cache_can_free(bc);
}

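/*
 * Shutdown path: move every cached node (including the btree roots and the
 * debug verify_data node) back onto the live list, free each node's buffers,
 * then tear down the struct btree allocations themselves off the freed
 * lists.
 */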
void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	unsigned i, flags;

	shrinker_free(bc->shrink);

	/* vfree() can allocate memory: */
	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

	if (c->verify_data)
		list_move(&c->verify_data->list, &bc->live);

	kvpfree(c->verify_ondisk, c->opts.btree_node_size);

	for (i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->b)
			list_add(&r->b->list, &bc->live);
	}

	list_splice(&bc->freeable, &bc->live);

	while (!list_empty(&bc->live)) {
		b = list_first_entry(&bc->live, struct btree, list);

		BUG_ON(btree_node_read_in_flight(b) ||
		       btree_node_write_in_flight(b));

		btree_node_data_free(c, b);
	}

	BUG_ON(!bch2_journal_error(&c->journal) &&
	       atomic_read(&c->btree_cache.dirty));

	list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);

	while (!list_empty(&bc->freed_nonpcpu)) {
		b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
		list_del(&b->list);
		six_lock_exit(&b->c.lock);
		kfree(b);
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);

	if (bc->table_init_done)
		rhashtable_destroy(&bc->table);
}

int bch2_fs_btree_cache_init(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct shrinker *shrink;
	unsigned i;
	int ret = 0;

	ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
	if (ret)
		goto err;

	bc->table_init_done = true;

	bch2_recalc_btree_reserve(c);

	for (i = 0; i < bc->reserve; i++)
		if (!__bch2_btree_node_mem_alloc(c))
			goto err;

	list_splice_init(&bc->live, &bc->freeable);

	mutex_init(&c->verify_lock);

	shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
	if (!shrink)
		goto err;
	bc->shrink = shrink;
	shrink->count_objects	= bch2_btree_cache_count;
	shrink->scan_objects	= bch2_btree_cache_scan;
	shrink->seeks		= 4;
	shrink->private_data	= c;
	shrinker_register(shrink);

	return 0;
err:
	return -BCH_ERR_ENOMEM_fs_btree_cache_init;
}

void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
	mutex_init(&bc->lock);
	INIT_LIST_HEAD(&bc->live);
	INIT_LIST_HEAD(&bc->freeable);
	INIT_LIST_HEAD(&bc->freed_pcpu);
	INIT_LIST_HEAD(&bc->freed_nonpcpu);
}

/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which a
 * cannibalize_bucket() will take. This means every time we unlock the root of
 * the btree, we need to release this lock if we have it held.
 */
void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;

	if (bc->alloc_lock == current) {
		trace_and_count(c, btree_cache_cannibalize_unlock, trans);
		bc->alloc_lock = NULL;
		closure_wake_up(&bc->alloc_wait);
	}
}

int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct task_struct *old;

	old = cmpxchg(&bc->alloc_lock, NULL, current);
	if (old == NULL || old == current)
		goto success;

	if (!cl) {
		trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
		return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
	}

	closure_wait(&bc->alloc_wait, cl);

	/* Try again, after adding ourselves to waitlist */
	old = cmpxchg(&bc->alloc_lock, NULL, current);
	if (old == NULL || old == current) {
		/* We raced */
		closure_wake_up(&bc->alloc_wait);
		goto success;
	}

	trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
	return -BCH_ERR_btree_cache_cannibalize_lock_blocked;

success:
	trace_and_count(c, btree_cache_cannibalize_lock, trans);
	return 0;
}

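/*
 * Typical calling pattern for the cannibalize lock (an illustrative sketch,
 * not a verbatim caller; error handling elided): take the lock before
 * allocating new nodes so that bch2_btree_node_mem_alloc() is guaranteed to
 * succeed, then drop it as soon as the allocations are done:
 *
 *	bch2_btree_cache_cannibalize_lock(trans, &cl);
 *	b = bch2_btree_node_mem_alloc(trans, false);
 *	bch2_btree_cache_cannibalize_unlock(trans);
 */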
static struct btree *btree_node_cannibalize(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	list_for_each_entry_reverse(b, &bc->live, list)
		if (!btree_node_reclaim(c, b))
			return b;

	while (1) {
		list_for_each_entry_reverse(b, &bc->live, list)
			if (!btree_node_write_and_reclaim(c, b))
				return b;

		/*
		 * Rare case: all nodes were intent-locked.
		 * Just busy-wait.
		 */
		WARN_ONCE(1, "btree cache cannibalize failed\n");
		cond_resched();
	}
}

struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct list_head *freed = pcpu_read_locks
		? &bc->freed_pcpu
		: &bc->freed_nonpcpu;
	struct btree *b, *b2;
	u64 start_time = local_clock();
	unsigned flags;

	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

	/*
	 * We never free struct btree itself, just the memory that holds the on
	 * disk node. Check the freed list before allocating a new one:
	 */
	list_for_each_entry(b, freed, list)
		if (!btree_node_reclaim(c, b)) {
			list_del_init(&b->list);
			goto got_node;
		}

	b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
	if (!b) {
		mutex_unlock(&bc->lock);
		bch2_trans_unlock(trans);
		b = __btree_node_mem_alloc(c, GFP_KERNEL);
		if (!b)
			goto err;
		mutex_lock(&bc->lock);
	}

	bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);

	BUG_ON(!six_trylock_intent(&b->c.lock));
	BUG_ON(!six_trylock_write(&b->c.lock));
got_node:

	/*
	 * btree_free() doesn't free memory; it sticks the node on the end of
	 * the list. Check if there's any freed nodes there:
	 */
	list_for_each_entry(b2, &bc->freeable, list)
		if (!btree_node_reclaim(c, b2)) {
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);
			btree_node_to_freedlist(bc, b2);
			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
			goto got_mem;
		}

	mutex_unlock(&bc->lock);

	if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
		bch2_trans_unlock(trans);
		if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
			goto err;
	}

	mutex_lock(&bc->lock);
	bc->used++;
got_mem:
	mutex_unlock(&bc->lock);

	BUG_ON(btree_node_hashed(b));
	BUG_ON(btree_node_dirty(b));
	BUG_ON(btree_node_write_in_flight(b));
out:
	b->flags		= 0;
	b->written		= 0;
	b->nsets		= 0;
	b->sib_u64s[0]		= 0;
	b->sib_u64s[1]		= 0;
	b->whiteout_u64s	= 0;
	bch2_btree_keys_init(b);
	set_btree_node_accessed(b);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
			       start_time);

	memalloc_nofs_restore(flags);
	return b;
err:
	mutex_lock(&bc->lock);

	/* Try to cannibalize another cached btree node: */
	if (bc->alloc_lock == current) {
		b2 = btree_node_cannibalize(c);
		clear_btree_node_just_written(b2);
		bch2_btree_node_hash_remove(bc, b2);

		if (b) {
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);
			btree_node_to_freedlist(bc, b2);
			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
		} else {
			b = b2;
			list_del_init(&b->list);
		}

		mutex_unlock(&bc->lock);

		trace_and_count(c, btree_cache_cannibalize, trans);
		goto out;
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);
	return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
}

/* Slowpath, don't want it inlined into btree_iter_traverse() */
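/*
 * Allocate a new cache entry for @k and read the node in from disk: with
 * @sync set we wait for the read to complete (dropping transaction locks
 * around the IO when called with a @path), otherwise the read is issued
 * asynchronously. Returns NULL rather than a node both in the async case and
 * when we raced with another thread filling in the same node.
 */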
static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
						   struct btree_path *path,
						   const struct bkey_i *k,
						   enum btree_id btree_id,
						   unsigned level,
						   enum six_lock_type lock_type,
						   bool sync)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	u32 seq;

	BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
	/*
	 * Parent node must be locked, else we could read in a btree node that's
	 * been freed:
	 */
	if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
		trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
	}

	b = bch2_btree_node_mem_alloc(trans, level != 0);

	if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
		trans->memory_allocation_failure = true;
		trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
	}

	if (IS_ERR(b))
		return b;

	bkey_copy(&b->key, k);
	if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
		/* raced with another fill: */

		/* mark as unhashed... */
		b->hash_val = 0;

		mutex_lock(&bc->lock);
		list_add(&b->list, &bc->freeable);
		mutex_unlock(&bc->lock);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		return NULL;
	}

	set_btree_node_read_in_flight(b);

	six_unlock_write(&b->c.lock);
	seq = six_lock_seq(&b->c.lock);
	six_unlock_intent(&b->c.lock);

	/* Unlock before doing IO: */
	if (path && sync)
		bch2_trans_unlock_noassert(trans);

	bch2_btree_node_read(trans, b, sync);

	if (!sync)
		return NULL;

	if (path) {
		int ret = bch2_trans_relock(trans) ?:
			bch2_btree_path_relock_intent(trans, path);
		if (ret) {
			BUG_ON(!trans->restarted);
			return ERR_PTR(ret);
		}
	}

	if (!six_relock_type(&b->c.lock, lock_type, seq)) {
		if (path)
			trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
	}

	return b;
}

static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
{
	struct printbuf buf = PRINTBUF;

	if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
		return;

	prt_printf(&buf,
		   "btree node header doesn't match ptr\n"
		   "btree %s level %u\n"
		   "ptr: ",
		   bch2_btree_id_str(b->c.btree_id), b->c.level);
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	prt_printf(&buf, "\nheader: btree %s level %llu\n"
		   "min ",
		   bch2_btree_id_str(BTREE_NODE_ID(b->data)),
		   BTREE_NODE_LEVEL(b->data));
	bch2_bpos_to_text(&buf, b->data->min_key);

	prt_printf(&buf, "\nmax ");
	bch2_bpos_to_text(&buf, b->data->max_key);

	bch2_fs_inconsistent(c, "%s", buf.buf);
	printbuf_exit(&buf);
}

static inline void btree_check_header(struct bch_fs *c, struct btree *b)
{
	if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
	    b->c.level != BTREE_NODE_LEVEL(b->data) ||
	    !bpos_eq(b->data->max_key, b->key.k.p) ||
	    (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
	     !bpos_eq(b->data->min_key,
		      bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
		btree_bad_header(c, b);
}

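/*
 * Slow path of bch2_btree_node_get(): look the node up in the hash table,
 * filling it in from disk if it isn't present, and handle waiting on
 * in-flight reads and relocking after transaction locks were dropped.
 */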
static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
					   const struct bkey_i *k, unsigned level,
					   enum six_lock_type lock_type,
					   unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	struct bset_tree *t;
	bool need_relock = false;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		/*
		 * We must have the parent locked to call bch2_btree_node_fill(),
		 * else we could read in a btree node from disk that's been
		 * freed:
		 */
		b = bch2_btree_node_fill(trans, path, k, path->btree_id,
					 level, lock_type, true);
		need_relock = true;

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b))
			return b;
	} else {
		if (btree_node_read_locked(path, level + 1))
			btree_node_unlock(trans, path, level + 1);

		ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			return ERR_PTR(ret);

		BUG_ON(ret);

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.level != level ||
			     race_fault())) {
			six_unlock_type(&b->c.lock, lock_type);
			if (bch2_btree_node_relock(trans, path, level + 1))
				goto retry;

			trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
			return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
		}

		/* avoid atomic set bit if it's not needed: */
		if (!btree_node_accessed(b))
			set_btree_node_accessed(b);
	}

	if (unlikely(btree_node_read_in_flight(b))) {
		u32 seq = six_lock_seq(&b->c.lock);

		six_unlock_type(&b->c.lock, lock_type);
		bch2_trans_unlock(trans);
		need_relock = true;

		bch2_btree_node_wait_on_read(b);

		/*
		 * should_be_locked is not set on this path yet, so we need to
		 * relock it specifically:
		 */
		if (!six_relock_type(&b->c.lock, lock_type, seq))
			goto retry;
	}

	if (unlikely(need_relock)) {
		ret = bch2_trans_relock(trans) ?:
			bch2_btree_path_relock_intent(trans, path);
		if (ret) {
			six_unlock_type(&b->c.lock, lock_type);
			return ERR_PTR(ret);
		}
	}

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-EIO);
	}

	EBUG_ON(b->c.btree_id != path->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}

/**
 * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * @trans: btree transaction object
 * @path: btree_path being traversed
 * @k: pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
 * @level: level of btree node being looked up (0 == leaf node)
 * @lock_type: SIX_LOCK_read or SIX_LOCK_intent
 * @trace_ip: ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
 *
 * The btree node will have either a read or an intent lock held, depending on
 * the @lock_type parameter.
 *
 * Returns: btree node or ERR_PTR()
 */
struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
				  const struct bkey_i *k, unsigned level,
				  enum six_lock_type lock_type,
				  unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree *b;
	struct bset_tree *t;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_node_mem_ptr(k);

	/*
	 * Check b->hash_val _before_ calling btree_node_lock() - this might not
	 * be the node we want anymore, and trying to lock the wrong node could
	 * cause an unnecessary transaction restart:
	 */
	if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
		     !b ||
		     b->hash_val != btree_ptr_hash_val(k)))
		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

	if (btree_node_read_locked(path, level + 1))
		btree_node_unlock(trans, path, level + 1);

	ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		return ERR_PTR(ret);

	BUG_ON(ret);

	if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
		     b->c.level != level ||
		     race_fault())) {
		six_unlock_type(&b->c.lock, lock_type);
		if (bch2_btree_node_relock(trans, path, level + 1))
			return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

		trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
	}

	if (unlikely(btree_node_read_in_flight(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
	}

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-EIO);
	}

	EBUG_ON(b->c.btree_id != path->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}

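/*
 * Variant of bch2_btree_node_get() for callers that don't have a btree_path:
 * the node is locked with btree_node_lock_nopath() (read lock only), and
 * with @nofill set we only return nodes already in the cache instead of
 * reading them in from disk.
 */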
struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
					 const struct bkey_i *k,
					 enum btree_id btree_id,
					 unsigned level,
					 bool nofill)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	struct bset_tree *t;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	if (c->opts.btree_node_mem_ptr_optimization) {
		b = btree_node_mem_ptr(k);
		if (b)
			goto lock_node;
	}
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		if (nofill)
			goto out;

		b = bch2_btree_node_fill(trans, NULL, k, btree_id,
					 level, SIX_LOCK_read, true);

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b) &&
		    !bch2_btree_cache_cannibalize_lock(trans, NULL))
			goto retry;

		if (IS_ERR(b))
			goto out;
	} else {
lock_node:
		ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			return ERR_PTR(ret);

		BUG_ON(ret);

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.btree_id != btree_id ||
			     b->c.level != level)) {
			six_unlock_read(&b->c.lock);
			goto retry;
		}
	}

	/* XXX: waiting on IO with btree locks held: */
	__bch2_btree_node_wait_on_read(b);

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_read(&b->c.lock);
		b = ERR_PTR(-EIO);
		goto out;
	}

	EBUG_ON(b->c.btree_id != btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);
out:
	bch2_btree_cache_cannibalize_unlock(trans);
	return b;
}

int bch2_btree_node_prefetch(struct btree_trans *trans,
			     struct btree_path *path,
			     const struct bkey_i *k,
			     enum btree_id btree_id, unsigned level)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	BUG_ON(trans && !btree_node_locked(path, level + 1));
	BUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_cache_find(bc, k);
	if (b)
		return 0;

	b = bch2_btree_node_fill(trans, path, k, btree_id,
				 level, SIX_LOCK_read, false);
	return PTR_ERR_OR_ZERO(b);
}

void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	b = btree_cache_find(bc, k);
	if (!b)
		return;
wait_on_io:
	/* not allowed to wait on io with btree locks held: */

	/* XXX we're called from btree_gc which will be holding other btree
	 * nodes locked
	 */
	__bch2_btree_node_wait_on_read(b);
	__bch2_btree_node_wait_on_write(b);

	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);

	if (btree_node_dirty(b)) {
		__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}

	BUG_ON(btree_node_dirty(b));

	mutex_lock(&bc->lock);
	btree_node_data_free(c, b);
	bch2_btree_node_hash_remove(bc, b);
	mutex_unlock(&bc->lock);

	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
}

const char *bch2_btree_id_str(enum btree_id btree)
{
	return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)";
}

void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
	prt_printf(out, "%s level %u/%u\n ",
		   bch2_btree_id_str(b->c.btree_id),
		   b->c.level,
		   bch2_btree_id_root(c, b->c.btree_id)->level);
	bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
}

void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
	struct bset_stats stats;

	memset(&stats, 0, sizeof(stats));

	bch2_btree_keys_stats(b, &stats);

	prt_printf(out, "l %u ", b->c.level);
	bch2_bpos_to_text(out, b->data->min_key);
	prt_printf(out, " - ");
	bch2_bpos_to_text(out, b->data->max_key);
	prt_printf(out, ":\n"
		   " ptrs: ");
	bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
	prt_newline(out);

	prt_printf(out,
		   " format: ");
	bch2_bkey_format_to_text(out, &b->format);

	prt_printf(out,
		   " unpack fn len: %u\n"
		   " bytes used %zu/%zu (%zu%% full)\n"
		   " sib u64s: %u, %u (merge threshold %u)\n"
		   " nr packed keys %u\n"
		   " nr unpacked keys %u\n"
		   " floats %zu\n"
		   " failed unpacked %zu\n",
		   b->unpack_fn_len,
		   b->nr.live_u64s * sizeof(u64),
		   btree_buf_bytes(b) - sizeof(struct btree_node),
		   b->nr.live_u64s * 100 / btree_max_u64s(c),
		   b->sib_u64s[0],
		   b->sib_u64s[1],
		   c->btree_foreground_merge_threshold,
		   b->nr.packed_keys,
		   b->nr.unpacked_keys,
		   stats.floats,
		   stats.failed);
}

void bch2_btree_cache_to_text(struct printbuf *out, const struct bch_fs *c)
{
	prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used);
	prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty));
	prt_printf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock);
}