1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "bcachefs.h" 4 #include "btree_cache.h" 5 #include "btree_iter.h" 6 #include "btree_key_cache.h" 7 #include "btree_locking.h" 8 #include "btree_update.h" 9 #include "errcode.h" 10 #include "error.h" 11 #include "journal.h" 12 #include "journal_reclaim.h" 13 #include "trace.h" 14 15 #include <linux/sched/mm.h> 16 17 static inline bool btree_uses_pcpu_readers(enum btree_id id) 18 { 19 return id == BTREE_ID_subvolumes; 20 } 21 22 static struct kmem_cache *bch2_key_cache; 23 24 static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, 25 const void *obj) 26 { 27 const struct bkey_cached *ck = obj; 28 const struct bkey_cached_key *key = arg->key; 29 30 return ck->key.btree_id != key->btree_id || 31 !bpos_eq(ck->key.pos, key->pos); 32 } 33 34 static const struct rhashtable_params bch2_btree_key_cache_params = { 35 .head_offset = offsetof(struct bkey_cached, hash), 36 .key_offset = offsetof(struct bkey_cached, key), 37 .key_len = sizeof(struct bkey_cached_key), 38 .obj_cmpfn = bch2_btree_key_cache_cmp_fn, 39 .automatic_shrinking = true, 40 }; 41 42 static inline void btree_path_cached_set(struct btree_trans *trans, struct btree_path *path, 43 struct bkey_cached *ck, 44 enum btree_node_locked_type lock_held) 45 { 46 path->l[0].lock_seq = six_lock_seq(&ck->c.lock); 47 path->l[0].b = (void *) ck; 48 mark_btree_node_locked(trans, path, 0, lock_held); 49 } 50 51 __flatten 52 inline struct bkey_cached * 53 bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) 54 { 55 struct bkey_cached_key key = { 56 .btree_id = btree_id, 57 .pos = pos, 58 }; 59 60 return rhashtable_lookup_fast(&c->btree_key_cache.table, &key, 61 bch2_btree_key_cache_params); 62 } 63 64 static bool bkey_cached_lock_for_evict(struct bkey_cached *ck) 65 { 66 if (!six_trylock_intent(&ck->c.lock)) 67 return false; 68 69 if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 70 six_unlock_intent(&ck->c.lock); 71 return false; 72 } 73 74 if (!six_trylock_write(&ck->c.lock)) { 75 six_unlock_intent(&ck->c.lock); 76 return false; 77 } 78 79 return true; 80 } 81 82 static bool bkey_cached_evict(struct btree_key_cache *c, 83 struct bkey_cached *ck) 84 { 85 bool ret = !rhashtable_remove_fast(&c->table, &ck->hash, 86 bch2_btree_key_cache_params); 87 if (ret) { 88 memset(&ck->key, ~0, sizeof(ck->key)); 89 atomic_long_dec(&c->nr_keys); 90 } 91 92 return ret; 93 } 94 95 static void __bkey_cached_free(struct rcu_pending *pending, struct rcu_head *rcu) 96 { 97 struct bch_fs *c = container_of(pending->srcu, struct bch_fs, btree_trans_barrier); 98 struct bkey_cached *ck = container_of(rcu, struct bkey_cached, rcu); 99 100 this_cpu_dec(*c->btree_key_cache.nr_pending); 101 kmem_cache_free(bch2_key_cache, ck); 102 } 103 104 static void bkey_cached_free(struct btree_key_cache *bc, 105 struct bkey_cached *ck) 106 { 107 kfree(ck->k); 108 ck->k = NULL; 109 ck->u64s = 0; 110 111 six_unlock_write(&ck->c.lock); 112 six_unlock_intent(&ck->c.lock); 113 114 bool pcpu_readers = ck->c.lock.readers != NULL; 115 rcu_pending_enqueue(&bc->pending[pcpu_readers], &ck->rcu); 116 this_cpu_inc(*bc->nr_pending); 117 } 118 119 static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp) 120 { 121 gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE; 122 123 struct bkey_cached *ck = kmem_cache_zalloc(bch2_key_cache, gfp); 124 if (unlikely(!ck)) 125 return NULL; 126 ck->k = kmalloc(key_u64s * sizeof(u64), gfp); 127 if (unlikely(!ck->k)) { 128 kmem_cache_free(bch2_key_cache, ck); 129 return NULL; 130 } 131 ck->u64s = key_u64s; 132 return ck; 133 } 134 135 static struct bkey_cached * 136 bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned key_u64s) 137 { 138 struct bch_fs *c = trans->c; 139 struct btree_key_cache *bc = &c->btree_key_cache; 140 bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id); 141 int ret; 142 143 struct bkey_cached *ck = container_of_or_null( 144 rcu_pending_dequeue(&bc->pending[pcpu_readers]), 145 struct bkey_cached, rcu); 146 if (ck) 147 goto lock; 148 149 ck = allocate_dropping_locks(trans, ret, 150 __bkey_cached_alloc(key_u64s, _gfp)); 151 if (ret) { 152 if (ck) 153 kfree(ck->k); 154 kmem_cache_free(bch2_key_cache, ck); 155 return ERR_PTR(ret); 156 } 157 158 if (ck) { 159 bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0); 160 ck->c.cached = true; 161 goto lock; 162 } 163 164 ck = container_of_or_null(rcu_pending_dequeue_from_all(&bc->pending[pcpu_readers]), 165 struct bkey_cached, rcu); 166 if (ck) 167 goto lock; 168 lock: 169 six_lock_intent(&ck->c.lock, NULL, NULL); 170 six_lock_write(&ck->c.lock, NULL, NULL); 171 return ck; 172 } 173 174 static struct bkey_cached * 175 bkey_cached_reuse(struct btree_key_cache *c) 176 { 177 struct bucket_table *tbl; 178 struct rhash_head *pos; 179 struct bkey_cached *ck; 180 unsigned i; 181 182 rcu_read_lock(); 183 tbl = rht_dereference_rcu(c->table.tbl, &c->table); 184 for (i = 0; i < tbl->size; i++) 185 rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { 186 if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && 187 bkey_cached_lock_for_evict(ck)) { 188 if (bkey_cached_evict(c, ck)) 189 goto out; 190 six_unlock_write(&ck->c.lock); 191 six_unlock_intent(&ck->c.lock); 192 } 193 } 194 ck = NULL; 195 out: 196 rcu_read_unlock(); 197 return ck; 198 } 199 200 static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *path, 201 struct bkey_s_c k) 202 { 203 struct bch_fs *c = trans->c; 204 struct btree_key_cache *bc = &c->btree_key_cache; 205 206 /* 207 * bch2_varint_decode can read past the end of the buffer by at 208 * most 7 bytes (it won't be used): 209 */ 210 unsigned key_u64s = k.k->u64s + 1; 211 212 /* 213 * Allocate some extra space so that the transaction commit path is less 214 * likely to have to reallocate, since that requires a transaction 215 * restart: 216 */ 217 key_u64s = min(256U, (key_u64s * 3) / 2); 218 key_u64s = roundup_pow_of_two(key_u64s); 219 220 struct bkey_cached *ck = bkey_cached_alloc(trans, path, key_u64s); 221 int ret = PTR_ERR_OR_ZERO(ck); 222 if (ret) 223 return ret; 224 225 if (unlikely(!ck)) { 226 ck = bkey_cached_reuse(bc); 227 if (unlikely(!ck)) { 228 bch_err(c, "error allocating memory for key cache item, btree %s", 229 bch2_btree_id_str(path->btree_id)); 230 return -BCH_ERR_ENOMEM_btree_key_cache_create; 231 } 232 } 233 234 ck->c.level = 0; 235 ck->c.btree_id = path->btree_id; 236 ck->key.btree_id = path->btree_id; 237 ck->key.pos = path->pos; 238 ck->flags = 1U << BKEY_CACHED_ACCESSED; 239 240 if (unlikely(key_u64s > ck->u64s)) { 241 mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); 242 243 struct bkey_i *new_k = allocate_dropping_locks(trans, ret, 244 kmalloc(key_u64s * sizeof(u64), _gfp)); 245 if (unlikely(!new_k)) { 246 bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", 247 bch2_btree_id_str(ck->key.btree_id), key_u64s); 248 ret = -BCH_ERR_ENOMEM_btree_key_cache_fill; 249 } else if (ret) { 250 kfree(new_k); 251 goto err; 252 } 253 254 kfree(ck->k); 255 ck->k = new_k; 256 ck->u64s = key_u64s; 257 } 258 259 bkey_reassemble(ck->k, k); 260 261 ret = rhashtable_lookup_insert_fast(&bc->table, &ck->hash, bch2_btree_key_cache_params); 262 if (unlikely(ret)) /* raced with another fill? */ 263 goto err; 264 265 atomic_long_inc(&bc->nr_keys); 266 six_unlock_write(&ck->c.lock); 267 268 enum six_lock_type lock_want = __btree_lock_want(path, 0); 269 if (lock_want == SIX_LOCK_read) 270 six_lock_downgrade(&ck->c.lock); 271 btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want); 272 path->uptodate = BTREE_ITER_UPTODATE; 273 return 0; 274 err: 275 bkey_cached_free(bc, ck); 276 mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); 277 278 return ret; 279 } 280 281 static noinline int btree_key_cache_fill(struct btree_trans *trans, 282 struct btree_path *ck_path, 283 unsigned flags) 284 { 285 if (flags & BTREE_ITER_cached_nofill) { 286 ck_path->uptodate = BTREE_ITER_UPTODATE; 287 return 0; 288 } 289 290 struct bch_fs *c = trans->c; 291 struct btree_iter iter; 292 struct bkey_s_c k; 293 int ret; 294 295 bch2_trans_iter_init(trans, &iter, ck_path->btree_id, ck_path->pos, 296 BTREE_ITER_key_cache_fill| 297 BTREE_ITER_cached_nofill); 298 iter.flags &= ~BTREE_ITER_with_journal; 299 k = bch2_btree_iter_peek_slot(&iter); 300 ret = bkey_err(k); 301 if (ret) 302 goto err; 303 304 /* Recheck after btree lookup, before allocating: */ 305 ret = bch2_btree_key_cache_find(c, ck_path->btree_id, ck_path->pos) ? -EEXIST : 0; 306 if (unlikely(ret)) 307 goto out; 308 309 ret = btree_key_cache_create(trans, ck_path, k); 310 if (ret) 311 goto err; 312 out: 313 /* We're not likely to need this iterator again: */ 314 bch2_set_btree_iter_dontneed(&iter); 315 err: 316 bch2_trans_iter_exit(trans, &iter); 317 return ret; 318 } 319 320 static inline int btree_path_traverse_cached_fast(struct btree_trans *trans, 321 struct btree_path *path) 322 { 323 struct bch_fs *c = trans->c; 324 struct bkey_cached *ck; 325 retry: 326 ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); 327 if (!ck) 328 return -ENOENT; 329 330 enum six_lock_type lock_want = __btree_lock_want(path, 0); 331 332 int ret = btree_node_lock(trans, path, (void *) ck, 0, lock_want, _THIS_IP_); 333 if (ret) 334 return ret; 335 336 if (ck->key.btree_id != path->btree_id || 337 !bpos_eq(ck->key.pos, path->pos)) { 338 six_unlock_type(&ck->c.lock, lock_want); 339 goto retry; 340 } 341 342 if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) 343 set_bit(BKEY_CACHED_ACCESSED, &ck->flags); 344 345 btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want); 346 path->uptodate = BTREE_ITER_UPTODATE; 347 return 0; 348 } 349 350 int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path, 351 unsigned flags) 352 { 353 EBUG_ON(path->level); 354 355 path->l[1].b = NULL; 356 357 int ret; 358 do { 359 ret = btree_path_traverse_cached_fast(trans, path); 360 if (unlikely(ret == -ENOENT)) 361 ret = btree_key_cache_fill(trans, path, flags); 362 } while (ret == -EEXIST); 363 364 if (unlikely(ret)) { 365 path->uptodate = BTREE_ITER_NEED_TRAVERSE; 366 if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) { 367 btree_node_unlock(trans, path, 0); 368 path->l[0].b = ERR_PTR(ret); 369 } 370 } 371 return ret; 372 } 373 374 static int btree_key_cache_flush_pos(struct btree_trans *trans, 375 struct bkey_cached_key key, 376 u64 journal_seq, 377 unsigned commit_flags, 378 bool evict) 379 { 380 struct bch_fs *c = trans->c; 381 struct journal *j = &c->journal; 382 struct btree_iter c_iter, b_iter; 383 struct bkey_cached *ck = NULL; 384 int ret; 385 386 bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos, 387 BTREE_ITER_slots| 388 BTREE_ITER_intent| 389 BTREE_ITER_all_snapshots); 390 bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos, 391 BTREE_ITER_cached| 392 BTREE_ITER_intent); 393 b_iter.flags &= ~BTREE_ITER_with_key_cache; 394 395 ret = bch2_btree_iter_traverse(&c_iter); 396 if (ret) 397 goto out; 398 399 ck = (void *) btree_iter_path(trans, &c_iter)->l[0].b; 400 if (!ck) 401 goto out; 402 403 if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 404 if (evict) 405 goto evict; 406 goto out; 407 } 408 409 if (journal_seq && ck->journal.seq != journal_seq) 410 goto out; 411 412 trans->journal_res.seq = ck->journal.seq; 413 414 /* 415 * If we're at the end of the journal, we really want to free up space 416 * in the journal right away - we don't want to pin that old journal 417 * sequence number with a new btree node write, we want to re-journal 418 * the update 419 */ 420 if (ck->journal.seq == journal_last_seq(j)) 421 commit_flags |= BCH_WATERMARK_reclaim; 422 423 if (ck->journal.seq != journal_last_seq(j) || 424 !test_bit(JOURNAL_space_low, &c->journal.flags)) 425 commit_flags |= BCH_TRANS_COMMIT_no_journal_res; 426 427 ret = bch2_btree_iter_traverse(&b_iter) ?: 428 bch2_trans_update(trans, &b_iter, ck->k, 429 BTREE_UPDATE_key_cache_reclaim| 430 BTREE_UPDATE_internal_snapshot_node| 431 BTREE_TRIGGER_norun) ?: 432 bch2_trans_commit(trans, NULL, NULL, 433 BCH_TRANS_COMMIT_no_check_rw| 434 BCH_TRANS_COMMIT_no_enospc| 435 commit_flags); 436 437 bch2_fs_fatal_err_on(ret && 438 !bch2_err_matches(ret, BCH_ERR_transaction_restart) && 439 !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) && 440 !bch2_journal_error(j), c, 441 "flushing key cache: %s", bch2_err_str(ret)); 442 if (ret) 443 goto out; 444 445 bch2_journal_pin_drop(j, &ck->journal); 446 447 struct btree_path *path = btree_iter_path(trans, &c_iter); 448 BUG_ON(!btree_node_locked(path, 0)); 449 450 if (!evict) { 451 if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 452 clear_bit(BKEY_CACHED_DIRTY, &ck->flags); 453 atomic_long_dec(&c->btree_key_cache.nr_dirty); 454 } 455 } else { 456 struct btree_path *path2; 457 unsigned i; 458 evict: 459 trans_for_each_path(trans, path2, i) 460 if (path2 != path) 461 __bch2_btree_path_unlock(trans, path2); 462 463 bch2_btree_node_lock_write_nofail(trans, path, &ck->c); 464 465 if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 466 clear_bit(BKEY_CACHED_DIRTY, &ck->flags); 467 atomic_long_dec(&c->btree_key_cache.nr_dirty); 468 } 469 470 mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); 471 if (bkey_cached_evict(&c->btree_key_cache, ck)) { 472 bkey_cached_free(&c->btree_key_cache, ck); 473 } else { 474 six_unlock_write(&ck->c.lock); 475 six_unlock_intent(&ck->c.lock); 476 } 477 } 478 out: 479 bch2_trans_iter_exit(trans, &b_iter); 480 bch2_trans_iter_exit(trans, &c_iter); 481 return ret; 482 } 483 484 int bch2_btree_key_cache_journal_flush(struct journal *j, 485 struct journal_entry_pin *pin, u64 seq) 486 { 487 struct bch_fs *c = container_of(j, struct bch_fs, journal); 488 struct bkey_cached *ck = 489 container_of(pin, struct bkey_cached, journal); 490 struct bkey_cached_key key; 491 struct btree_trans *trans = bch2_trans_get(c); 492 int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); 493 int ret = 0; 494 495 btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read); 496 key = ck->key; 497 498 if (ck->journal.seq != seq || 499 !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 500 six_unlock_read(&ck->c.lock); 501 goto unlock; 502 } 503 504 if (ck->seq != seq) { 505 bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal, 506 bch2_btree_key_cache_journal_flush); 507 six_unlock_read(&ck->c.lock); 508 goto unlock; 509 } 510 six_unlock_read(&ck->c.lock); 511 512 ret = lockrestart_do(trans, 513 btree_key_cache_flush_pos(trans, key, seq, 514 BCH_TRANS_COMMIT_journal_reclaim, false)); 515 unlock: 516 srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); 517 518 bch2_trans_put(trans); 519 return ret; 520 } 521 522 bool bch2_btree_insert_key_cached(struct btree_trans *trans, 523 unsigned flags, 524 struct btree_insert_entry *insert_entry) 525 { 526 struct bch_fs *c = trans->c; 527 struct bkey_cached *ck = (void *) (trans->paths + insert_entry->path)->l[0].b; 528 struct bkey_i *insert = insert_entry->k; 529 bool kick_reclaim = false; 530 531 BUG_ON(insert->k.u64s > ck->u64s); 532 533 bkey_copy(ck->k, insert); 534 535 if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 536 EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); 537 set_bit(BKEY_CACHED_DIRTY, &ck->flags); 538 atomic_long_inc(&c->btree_key_cache.nr_dirty); 539 540 if (bch2_nr_btree_keys_need_flush(c)) 541 kick_reclaim = true; 542 } 543 544 /* 545 * To minimize lock contention, we only add the journal pin here and 546 * defer pin updates to the flush callback via ->seq. Be careful not to 547 * update ->seq on nojournal commits because we don't want to update the 548 * pin to a seq that doesn't include journal updates on disk. Otherwise 549 * we risk losing the update after a crash. 550 * 551 * The only exception is if the pin is not active in the first place. We 552 * have to add the pin because journal reclaim drives key cache 553 * flushing. The flush callback will not proceed unless ->seq matches 554 * the latest pin, so make sure it starts with a consistent value. 555 */ 556 if (!(insert_entry->flags & BTREE_UPDATE_nojournal) || 557 !journal_pin_active(&ck->journal)) { 558 ck->seq = trans->journal_res.seq; 559 } 560 bch2_journal_pin_add(&c->journal, trans->journal_res.seq, 561 &ck->journal, bch2_btree_key_cache_journal_flush); 562 563 if (kick_reclaim) 564 journal_reclaim_kick(&c->journal); 565 return true; 566 } 567 568 void bch2_btree_key_cache_drop(struct btree_trans *trans, 569 struct btree_path *path) 570 { 571 struct bch_fs *c = trans->c; 572 struct btree_key_cache *bc = &c->btree_key_cache; 573 struct bkey_cached *ck = (void *) path->l[0].b; 574 575 /* 576 * We just did an update to the btree, bypassing the key cache: the key 577 * cache key is now stale and must be dropped, even if dirty: 578 */ 579 if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 580 clear_bit(BKEY_CACHED_DIRTY, &ck->flags); 581 atomic_long_dec(&c->btree_key_cache.nr_dirty); 582 bch2_journal_pin_drop(&c->journal, &ck->journal); 583 } 584 585 bkey_cached_evict(bc, ck); 586 bkey_cached_free(bc, ck); 587 588 mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); 589 btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); 590 path->should_be_locked = false; 591 } 592 593 static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, 594 struct shrink_control *sc) 595 { 596 struct bch_fs *c = shrink->private_data; 597 struct btree_key_cache *bc = &c->btree_key_cache; 598 struct bucket_table *tbl; 599 struct bkey_cached *ck; 600 size_t scanned = 0, freed = 0, nr = sc->nr_to_scan; 601 unsigned iter, start; 602 int srcu_idx; 603 604 srcu_idx = srcu_read_lock(&c->btree_trans_barrier); 605 rcu_read_lock(); 606 607 tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); 608 609 /* 610 * Scanning is expensive while a rehash is in progress - most elements 611 * will be on the new hashtable, if it's in progress 612 * 613 * A rehash could still start while we're scanning - that's ok, we'll 614 * still see most elements. 615 */ 616 if (unlikely(tbl->nest)) { 617 rcu_read_unlock(); 618 srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); 619 return SHRINK_STOP; 620 } 621 622 iter = bc->shrink_iter; 623 if (iter >= tbl->size) 624 iter = 0; 625 start = iter; 626 627 do { 628 struct rhash_head *pos, *next; 629 630 pos = rht_ptr_rcu(&tbl->buckets[iter]); 631 632 while (!rht_is_a_nulls(pos)) { 633 next = rht_dereference_bucket_rcu(pos->next, tbl, iter); 634 ck = container_of(pos, struct bkey_cached, hash); 635 636 if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 637 bc->skipped_dirty++; 638 } else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) { 639 clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); 640 bc->skipped_accessed++; 641 } else if (!bkey_cached_lock_for_evict(ck)) { 642 bc->skipped_lock_fail++; 643 } else if (bkey_cached_evict(bc, ck)) { 644 bkey_cached_free(bc, ck); 645 bc->freed++; 646 freed++; 647 } else { 648 six_unlock_write(&ck->c.lock); 649 six_unlock_intent(&ck->c.lock); 650 } 651 652 scanned++; 653 if (scanned >= nr) 654 goto out; 655 656 pos = next; 657 } 658 659 iter++; 660 if (iter >= tbl->size) 661 iter = 0; 662 } while (scanned < nr && iter != start); 663 out: 664 bc->shrink_iter = iter; 665 666 rcu_read_unlock(); 667 srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); 668 669 return freed; 670 } 671 672 static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink, 673 struct shrink_control *sc) 674 { 675 struct bch_fs *c = shrink->private_data; 676 struct btree_key_cache *bc = &c->btree_key_cache; 677 long nr = atomic_long_read(&bc->nr_keys) - 678 atomic_long_read(&bc->nr_dirty); 679 680 /* 681 * Avoid hammering our shrinker too much if it's nearly empty - the 682 * shrinker code doesn't take into account how big our cache is, if it's 683 * mostly empty but the system is under memory pressure it causes nasty 684 * lock contention: 685 */ 686 nr -= 128; 687 688 return max(0L, nr); 689 } 690 691 void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) 692 { 693 struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); 694 struct bucket_table *tbl; 695 struct bkey_cached *ck; 696 struct rhash_head *pos; 697 LIST_HEAD(items); 698 unsigned i; 699 700 shrinker_free(bc->shrink); 701 702 /* 703 * The loop is needed to guard against racing with rehash: 704 */ 705 while (atomic_long_read(&bc->nr_keys)) { 706 rcu_read_lock(); 707 tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); 708 if (tbl) { 709 if (tbl->nest) { 710 /* wait for in progress rehash */ 711 rcu_read_unlock(); 712 mutex_lock(&bc->table.mutex); 713 mutex_unlock(&bc->table.mutex); 714 rcu_read_lock(); 715 continue; 716 } 717 for (i = 0; i < tbl->size; i++) 718 while (pos = rht_ptr_rcu(&tbl->buckets[i]), !rht_is_a_nulls(pos)) { 719 ck = container_of(pos, struct bkey_cached, hash); 720 BUG_ON(!bkey_cached_evict(bc, ck)); 721 kfree(ck->k); 722 kmem_cache_free(bch2_key_cache, ck); 723 } 724 } 725 rcu_read_unlock(); 726 } 727 728 if (atomic_long_read(&bc->nr_dirty) && 729 !bch2_journal_error(&c->journal) && 730 test_bit(BCH_FS_was_rw, &c->flags)) 731 panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n", 732 atomic_long_read(&bc->nr_dirty)); 733 734 if (atomic_long_read(&bc->nr_keys)) 735 panic("btree key cache shutdown error: nr_keys nonzero (%li)\n", 736 atomic_long_read(&bc->nr_keys)); 737 738 if (bc->table_init_done) 739 rhashtable_destroy(&bc->table); 740 741 rcu_pending_exit(&bc->pending[0]); 742 rcu_pending_exit(&bc->pending[1]); 743 744 free_percpu(bc->nr_pending); 745 } 746 747 void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) 748 { 749 } 750 751 int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) 752 { 753 struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); 754 struct shrinker *shrink; 755 756 bc->nr_pending = alloc_percpu(size_t); 757 if (!bc->nr_pending) 758 return -BCH_ERR_ENOMEM_fs_btree_cache_init; 759 760 if (rcu_pending_init(&bc->pending[0], &c->btree_trans_barrier, __bkey_cached_free) || 761 rcu_pending_init(&bc->pending[1], &c->btree_trans_barrier, __bkey_cached_free)) 762 return -BCH_ERR_ENOMEM_fs_btree_cache_init; 763 764 if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params)) 765 return -BCH_ERR_ENOMEM_fs_btree_cache_init; 766 767 bc->table_init_done = true; 768 769 shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name); 770 if (!shrink) 771 return -BCH_ERR_ENOMEM_fs_btree_cache_init; 772 bc->shrink = shrink; 773 shrink->count_objects = bch2_btree_key_cache_count; 774 shrink->scan_objects = bch2_btree_key_cache_scan; 775 shrink->batch = 1 << 14; 776 shrink->seeks = 0; 777 shrink->private_data = c; 778 shrinker_register(shrink); 779 return 0; 780 } 781 782 void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc) 783 { 784 printbuf_tabstop_push(out, 24); 785 printbuf_tabstop_push(out, 12); 786 787 prt_printf(out, "keys:\t%lu\r\n", atomic_long_read(&bc->nr_keys)); 788 prt_printf(out, "dirty:\t%lu\r\n", atomic_long_read(&bc->nr_dirty)); 789 prt_printf(out, "table size:\t%u\r\n", bc->table.tbl->size); 790 prt_newline(out); 791 prt_printf(out, "shrinker:\n"); 792 prt_printf(out, "requested_to_free:\t%lu\r\n", bc->requested_to_free); 793 prt_printf(out, "freed:\t%lu\r\n", bc->freed); 794 prt_printf(out, "skipped_dirty:\t%lu\r\n", bc->skipped_dirty); 795 prt_printf(out, "skipped_accessed:\t%lu\r\n", bc->skipped_accessed); 796 prt_printf(out, "skipped_lock_fail:\t%lu\r\n", bc->skipped_lock_fail); 797 prt_newline(out); 798 prt_printf(out, "pending:\t%zu\r\n", per_cpu_sum(bc->nr_pending)); 799 } 800 801 void bch2_btree_key_cache_exit(void) 802 { 803 kmem_cache_destroy(bch2_key_cache); 804 } 805 806 int __init bch2_btree_key_cache_init(void) 807 { 808 bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT); 809 if (!bch2_key_cache) 810 return -ENOMEM; 811 812 return 0; 813 } 814