// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "trace.h"

#include <linux/sched/mm.h>

static inline bool btree_uses_pcpu_readers(enum btree_id id)
{
	return id == BTREE_ID_subvolumes;
}

static struct kmem_cache *bch2_key_cache;

static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
				       const void *obj)
{
	const struct bkey_cached *ck = obj;
	const struct bkey_cached_key *key = arg->key;

	return ck->key.btree_id != key->btree_id ||
		!bpos_eq(ck->key.pos, key->pos);
}

static const struct rhashtable_params bch2_btree_key_cache_params = {
	.head_offset		= offsetof(struct bkey_cached, hash),
	.key_offset		= offsetof(struct bkey_cached, key),
	.key_len		= sizeof(struct bkey_cached_key),
	.obj_cmpfn		= bch2_btree_key_cache_cmp_fn,
	.automatic_shrinking	= true,
};

static inline void btree_path_cached_set(struct btree_trans *trans, struct btree_path *path,
					 struct bkey_cached *ck,
					 enum btree_node_locked_type lock_held)
{
	path->l[0].lock_seq	= six_lock_seq(&ck->c.lock);
	path->l[0].b		= (void *) ck;
	mark_btree_node_locked(trans, path, 0, lock_held);
}

__flatten
inline struct bkey_cached *
bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
{
	struct bkey_cached_key key = {
		.btree_id	= btree_id,
		.pos		= pos,
	};

	return rhashtable_lookup_fast(&c->btree_key_cache.table, &key,
				      bch2_btree_key_cache_params);
}

static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
{
	if (!six_trylock_intent(&ck->c.lock))
		return false;

	if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		six_unlock_intent(&ck->c.lock);
		return false;
	}

	if (!six_trylock_write(&ck->c.lock)) {
		six_unlock_intent(&ck->c.lock);
		return false;
	}

	return true;
}

static bool bkey_cached_evict(struct btree_key_cache *c,
			      struct bkey_cached *ck)
{
	bool ret = !rhashtable_remove_fast(&c->table, &ck->hash,
					   bch2_btree_key_cache_params);
	if (ret) {
		memset(&ck->key, ~0, sizeof(ck->key));
		atomic_long_dec(&c->nr_keys);
	}

	return ret;
}

static void __bkey_cached_free(struct rcu_pending *pending, struct rcu_head *rcu)
{
	struct bch_fs *c = container_of(pending->srcu, struct bch_fs, btree_trans_barrier);
	struct bkey_cached *ck = container_of(rcu, struct bkey_cached, rcu);

	this_cpu_dec(*c->btree_key_cache.nr_pending);
	kmem_cache_free(bch2_key_cache, ck);
}

static void bkey_cached_free(struct btree_key_cache *bc,
			     struct bkey_cached *ck)
{
	kfree(ck->k);
	ck->k		= NULL;
	ck->u64s	= 0;

	six_unlock_write(&ck->c.lock);
	six_unlock_intent(&ck->c.lock);

	bool pcpu_readers = ck->c.lock.readers != NULL;
	rcu_pending_enqueue(&bc->pending[pcpu_readers], &ck->rcu);
	this_cpu_inc(*bc->nr_pending);
}

static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp)
{
	gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;

	struct bkey_cached *ck = kmem_cache_zalloc(bch2_key_cache, gfp);
	if (unlikely(!ck))
		return NULL;
	ck->k = kmalloc(key_u64s * sizeof(u64), gfp);
	if (unlikely(!ck->k)) {
		kmem_cache_free(bch2_key_cache, ck);
		return NULL;
	}
	ck->u64s = key_u64s;
	return ck;
}

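/*
 * Allocate a bkey_cached, in order of preference: from the rcu_pending free
 * list for this lock class, from the slab allocator (dropping btree locks if
 * the allocation would block), or by stealing from another class's pending
 * list.
 *
 * Returns the new entry locked for intent and write, an ERR_PTR() on
 * allocation error or transaction restart, or NULL if the caller should fall
 * back to bkey_cached_reuse().
 */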
static struct bkey_cached *
bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned key_u64s)
{
	struct bch_fs *c = trans->c;
	struct btree_key_cache *bc = &c->btree_key_cache;
	bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
	int ret;

	struct bkey_cached *ck = container_of_or_null(
				rcu_pending_dequeue(&bc->pending[pcpu_readers]),
				struct bkey_cached, rcu);
	if (ck)
		goto lock;

	ck = allocate_dropping_locks(trans, ret,
				     __bkey_cached_alloc(key_u64s, _gfp));
	if (ret) {
		if (ck)
			kfree(ck->k);
		kmem_cache_free(bch2_key_cache, ck);
		return ERR_PTR(ret);
	}

	if (ck) {
		bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);
		ck->c.cached = true;
		goto lock;
	}

	ck = container_of_or_null(rcu_pending_dequeue_from_all(&bc->pending[pcpu_readers]),
				  struct bkey_cached, rcu);
	if (!ck)
		return NULL;
lock:
	six_lock_intent(&ck->c.lock, NULL, NULL);
	six_lock_write(&ck->c.lock, NULL, NULL);
	return ck;
}

static struct bkey_cached *
bkey_cached_reuse(struct btree_key_cache *c)
{
	struct bucket_table *tbl;
	struct rhash_head *pos;
	struct bkey_cached *ck;
	unsigned i;

	rcu_read_lock();
	tbl = rht_dereference_rcu(c->table.tbl, &c->table);
	for (i = 0; i < tbl->size; i++)
		rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
			if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
			    bkey_cached_lock_for_evict(ck)) {
				if (bkey_cached_evict(c, ck))
					goto out;
				six_unlock_write(&ck->c.lock);
				six_unlock_intent(&ck->c.lock);
			}
		}
	ck = NULL;
out:
	rcu_read_unlock();
	return ck;
}

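/*
 * Create a new key cache entry for @ck_path, initialized from the btree key
 * @k that @path points at, and insert it into the key cache hash table.
 * On success, @ck_path holds the new entry locked and is marked uptodate.
 */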
static int btree_key_cache_create(struct btree_trans *trans,
				  struct btree_path *path,
				  struct btree_path *ck_path,
				  struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct btree_key_cache *bc = &c->btree_key_cache;

	/*
	 * bch2_varint_decode can read past the end of the buffer by at
	 * most 7 bytes (it won't be used):
	 */
	unsigned key_u64s = k.k->u64s + 1;

	/*
	 * Allocate some extra space so that the transaction commit path is less
	 * likely to have to reallocate, since that requires a transaction
	 * restart:
	 */
	key_u64s = min(256U, (key_u64s * 3) / 2);
	key_u64s = roundup_pow_of_two(key_u64s);

	struct bkey_cached *ck = bkey_cached_alloc(trans, ck_path, key_u64s);
	int ret = PTR_ERR_OR_ZERO(ck);
	if (ret)
		return ret;

	if (unlikely(!ck)) {
		ck = bkey_cached_reuse(bc);
		if (unlikely(!ck)) {
			bch_err(c, "error allocating memory for key cache item, btree %s",
				bch2_btree_id_str(ck_path->btree_id));
			return -BCH_ERR_ENOMEM_btree_key_cache_create;
		}
	}

	ck->c.level		= 0;
	ck->c.btree_id		= ck_path->btree_id;
	ck->key.btree_id	= ck_path->btree_id;
	ck->key.pos		= ck_path->pos;
	ck->flags		= 1U << BKEY_CACHED_ACCESSED;

	if (unlikely(key_u64s > ck->u64s)) {
		mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED);

		struct bkey_i *new_k = allocate_dropping_locks(trans, ret,
				kmalloc(key_u64s * sizeof(u64), _gfp));
		if (unlikely(!new_k)) {
			bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
				bch2_btree_id_str(ck->key.btree_id), key_u64s);
			ret = -BCH_ERR_ENOMEM_btree_key_cache_fill;
			goto err;
		} else if (ret) {
			kfree(new_k);
			goto err;
		}

		kfree(ck->k);
		ck->k = new_k;
		ck->u64s = key_u64s;
	}

	bkey_reassemble(ck->k, k);

	ret = bch2_btree_node_lock_write(trans, path, &path_l(path)->b->c);
	if (unlikely(ret))
		goto err;

	ret = rhashtable_lookup_insert_fast(&bc->table, &ck->hash, bch2_btree_key_cache_params);

	bch2_btree_node_unlock_write(trans, path, path_l(path)->b);

	if (unlikely(ret)) /* raced with another fill? */
		goto err;

	atomic_long_inc(&bc->nr_keys);
	six_unlock_write(&ck->c.lock);

	enum six_lock_type lock_want = __btree_lock_want(ck_path, 0);
	if (lock_want == SIX_LOCK_read)
		six_lock_downgrade(&ck->c.lock);
	btree_path_cached_set(trans, ck_path, ck, (enum btree_node_locked_type) lock_want);
	ck_path->uptodate = BTREE_ITER_UPTODATE;
	return 0;
err:
	bkey_cached_free(bc, ck);
	mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED);

	return ret;
}

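/*
 * Slow path: the key wasn't present in the key cache, so look it up in the
 * btree and create a new cache entry. We recheck the hash table after the
 * btree lookup in case another thread raced with us and filled it first;
 * -EEXIST makes the caller retry the fast path.
 */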
static noinline int btree_key_cache_fill(struct btree_trans *trans,
					 struct btree_path *ck_path,
					 unsigned flags)
{
	if (flags & BTREE_ITER_cached_nofill)
		return 0;

	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	bch2_trans_iter_init(trans, &iter, ck_path->btree_id, ck_path->pos,
			     BTREE_ITER_intent|
			     BTREE_ITER_key_cache_fill|
			     BTREE_ITER_cached_nofill);
	iter.flags &= ~BTREE_ITER_with_journal;
	k = bch2_btree_iter_peek_slot(&iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	/* Recheck after btree lookup, before allocating: */
	ret = bch2_btree_key_cache_find(c, ck_path->btree_id, ck_path->pos) ? -EEXIST : 0;
	if (unlikely(ret))
		goto out;

	ret = btree_key_cache_create(trans, btree_iter_path(trans, &iter), ck_path, k);
	if (ret)
		goto err;

	if (trace_key_cache_fill_enabled()) {
		struct printbuf buf = PRINTBUF;

		bch2_bpos_to_text(&buf, ck_path->pos);
		prt_char(&buf, ' ');
		bch2_bkey_val_to_text(&buf, trans->c, k);
		trace_key_cache_fill(trans, buf.buf);
		printbuf_exit(&buf);
	}
out:
	/* We're not likely to need this iterator again: */
	bch2_set_btree_iter_dontneed(&iter);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static inline int btree_path_traverse_cached_fast(struct btree_trans *trans,
						  struct btree_path *path)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck;
retry:
	ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
	if (!ck)
		return -ENOENT;

	enum six_lock_type lock_want = __btree_lock_want(path, 0);

	int ret = btree_node_lock(trans, path, (void *) ck, 0, lock_want, _THIS_IP_);
	if (ret)
		return ret;

	if (ck->key.btree_id != path->btree_id ||
	    !bpos_eq(ck->key.pos, path->pos)) {
		six_unlock_type(&ck->c.lock, lock_want);
		goto retry;
	}

	if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
		set_bit(BKEY_CACHED_ACCESSED, &ck->flags);

	btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want);
	path->uptodate = BTREE_ITER_UPTODATE;
	return 0;
}

int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
				    unsigned flags)
{
	EBUG_ON(path->level);

	path->l[1].b = NULL;

	int ret;
	do {
		ret = btree_path_traverse_cached_fast(trans, path);
		if (unlikely(ret == -ENOENT))
			ret = btree_key_cache_fill(trans, path, flags);
	} while (ret == -EEXIST);

	if (unlikely(ret)) {
		path->uptodate = BTREE_ITER_NEED_TRAVERSE;
		if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
			btree_node_unlock(trans, path, 0);
			path->l[0].b = ERR_PTR(ret);
		}
	}
	return ret;
}

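/*
 * Flush a single dirty key cache entry back to the btree: commit the cached
 * key as an update to the underlying btree position, drop the journal pin,
 * and optionally evict the entry from the key cache afterwards.
 */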
static int btree_key_cache_flush_pos(struct btree_trans *trans,
				     struct bkey_cached_key key,
				     u64 journal_seq,
				     unsigned commit_flags,
				     bool evict)
{
	struct bch_fs *c = trans->c;
	struct journal *j = &c->journal;
	struct btree_iter c_iter, b_iter;
	struct bkey_cached *ck = NULL;
	int ret;

	bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
			     BTREE_ITER_slots|
			     BTREE_ITER_intent|
			     BTREE_ITER_all_snapshots);
	bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
			     BTREE_ITER_cached|
			     BTREE_ITER_intent);
	b_iter.flags &= ~BTREE_ITER_with_key_cache;

	ret = bch2_btree_iter_traverse(&c_iter);
	if (ret)
		goto out;

	ck = (void *) btree_iter_path(trans, &c_iter)->l[0].b;
	if (!ck)
		goto out;

	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		if (evict)
			goto evict;
		goto out;
	}

	if (journal_seq && ck->journal.seq != journal_seq)
		goto out;

	trans->journal_res.seq = ck->journal.seq;

	/*
	 * If we're at the end of the journal, we really want to free up space
	 * in the journal right away - we don't want to pin that old journal
	 * sequence number with a new btree node write, we want to re-journal
	 * the update
	 */
	if (ck->journal.seq == journal_last_seq(j))
		commit_flags |= BCH_WATERMARK_reclaim;

	if (ck->journal.seq != journal_last_seq(j) ||
	    !test_bit(JOURNAL_space_low, &c->journal.flags))
		commit_flags |= BCH_TRANS_COMMIT_no_journal_res;

	struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(&b_iter);
	ret = bkey_err(btree_k);
	if (ret)
		goto err;

	/* Check that we're not violating cache coherency rules: */
	BUG_ON(bkey_deleted(btree_k.k));

	ret = bch2_trans_update(trans, &b_iter, ck->k,
				BTREE_UPDATE_key_cache_reclaim|
				BTREE_UPDATE_internal_snapshot_node|
				BTREE_TRIGGER_norun) ?:
		bch2_trans_commit(trans, NULL, NULL,
				  BCH_TRANS_COMMIT_no_check_rw|
				  BCH_TRANS_COMMIT_no_enospc|
				  commit_flags);
err:
	bch2_fs_fatal_err_on(ret &&
			     !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
			     !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
			     !bch2_journal_error(j), c,
			     "flushing key cache: %s", bch2_err_str(ret));
	if (ret)
		goto out;

	bch2_journal_pin_drop(j, &ck->journal);

	struct btree_path *path = btree_iter_path(trans, &c_iter);
	BUG_ON(!btree_node_locked(path, 0));

	if (!evict) {
		if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
			clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
			atomic_long_dec(&c->btree_key_cache.nr_dirty);
		}
	} else {
		struct btree_path *path2;
		unsigned i;
evict:
		trans_for_each_path(trans, path2, i)
			if (path2 != path)
				__bch2_btree_path_unlock(trans, path2);

		bch2_btree_node_lock_write_nofail(trans, path, &ck->c);

		if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
			clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
			atomic_long_dec(&c->btree_key_cache.nr_dirty);
		}

		mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
		if (bkey_cached_evict(&c->btree_key_cache, ck)) {
			bkey_cached_free(&c->btree_key_cache, ck);
		} else {
			six_unlock_write(&ck->c.lock);
			six_unlock_intent(&ck->c.lock);
		}
	}
out:
	bch2_trans_iter_exit(trans, &b_iter);
	bch2_trans_iter_exit(trans, &c_iter);
	return ret;
}

int bch2_btree_key_cache_journal_flush(struct journal *j,
				       struct journal_entry_pin *pin, u64 seq)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bkey_cached *ck =
		container_of(pin, struct bkey_cached, journal);
	struct bkey_cached_key key;
	struct btree_trans *trans = bch2_trans_get(c);
	int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
	int ret = 0;

	btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read);
	key = ck->key;

	if (ck->journal.seq != seq ||
	    !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		six_unlock_read(&ck->c.lock);
		goto unlock;
	}

	if (ck->seq != seq) {
		bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal,
					bch2_btree_key_cache_journal_flush);
		six_unlock_read(&ck->c.lock);
		goto unlock;
	}
	six_unlock_read(&ck->c.lock);

	ret = lockrestart_do(trans,
		btree_key_cache_flush_pos(trans, key, seq,
				BCH_TRANS_COMMIT_journal_reclaim, false));
unlock:
	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);

	bch2_trans_put(trans);
	return ret;
}

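/*
 * Update a key cache entry as part of a transaction commit: copy the new key
 * into the cached entry, mark it dirty, and pin the journal sequence number
 * so that journal reclaim will eventually flush it back to the btree.
 */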
bool bch2_btree_insert_key_cached(struct btree_trans *trans,
				  unsigned flags,
				  struct btree_insert_entry *insert_entry)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck = (void *) (trans->paths + insert_entry->path)->l[0].b;
	struct bkey_i *insert = insert_entry->k;
	bool kick_reclaim = false;

	BUG_ON(insert->k.u64s > ck->u64s);

	bkey_copy(ck->k, insert);

	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
		set_bit(BKEY_CACHED_DIRTY, &ck->flags);
		atomic_long_inc(&c->btree_key_cache.nr_dirty);

		if (bch2_nr_btree_keys_need_flush(c))
			kick_reclaim = true;
	}

	/*
	 * To minimize lock contention, we only add the journal pin here and
	 * defer pin updates to the flush callback via ->seq. Be careful not to
	 * update ->seq on nojournal commits because we don't want to update the
	 * pin to a seq that doesn't include journal updates on disk. Otherwise
	 * we risk losing the update after a crash.
	 *
	 * The only exception is if the pin is not active in the first place. We
	 * have to add the pin because journal reclaim drives key cache
	 * flushing. The flush callback will not proceed unless ->seq matches
	 * the latest pin, so make sure it starts with a consistent value.
	 */
	if (!(insert_entry->flags & BTREE_UPDATE_nojournal) ||
	    !journal_pin_active(&ck->journal)) {
		ck->seq = trans->journal_res.seq;
	}
	bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
			     &ck->journal, bch2_btree_key_cache_journal_flush);

	if (kick_reclaim)
		journal_reclaim_kick(&c->journal);
	return true;
}

void bch2_btree_key_cache_drop(struct btree_trans *trans,
			       struct btree_path *path)
{
	struct bch_fs *c = trans->c;
	struct btree_key_cache *bc = &c->btree_key_cache;
	struct bkey_cached *ck = (void *) path->l[0].b;

	/*
	 * We just did an update to the btree, bypassing the key cache: the key
	 * cache key is now stale and must be dropped, even if dirty:
	 */
	if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
		atomic_long_dec(&c->btree_key_cache.nr_dirty);
		bch2_journal_pin_drop(&c->journal, &ck->journal);
	}

	bkey_cached_evict(bc, ck);
	bkey_cached_free(bc, ck);

	mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);

	struct btree_path *path2;
	unsigned i;
	trans_for_each_path(trans, path2, i)
		if (path2->l[0].b == (void *) ck) {
			__bch2_btree_path_unlock(trans, path2);
			path2->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_drop);
			path2->should_be_locked = false;
			btree_path_set_dirty(path2, BTREE_ITER_NEED_TRAVERSE);
		}

	bch2_trans_verify_locks(trans);
}

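/*
 * Shrinker scan callback: walk the hash table starting from where the last
 * scan left off, skipping dirty entries and giving recently used entries a
 * second chance (by clearing the accessed bit), and free whatever else we can
 * lock and evict.
 */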
static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
					       struct shrink_control *sc)
{
	struct bch_fs *c = shrink->private_data;
	struct btree_key_cache *bc = &c->btree_key_cache;
	struct bucket_table *tbl;
	struct bkey_cached *ck;
	size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
	unsigned iter, start;
	int srcu_idx;

	srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
	rcu_read_lock();

	tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);

	/*
	 * Scanning is expensive while a rehash is in progress - most elements
	 * will be on the new hashtable, if it's in progress
	 *
	 * A rehash could still start while we're scanning - that's ok, we'll
	 * still see most elements.
	 */
	if (unlikely(tbl->nest)) {
		rcu_read_unlock();
		srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
		return SHRINK_STOP;
	}

	iter = bc->shrink_iter;
	if (iter >= tbl->size)
		iter = 0;
	start = iter;

	do {
		struct rhash_head *pos, *next;

		pos = rht_ptr_rcu(&tbl->buckets[iter]);

		while (!rht_is_a_nulls(pos)) {
			next = rht_dereference_bucket_rcu(pos->next, tbl, iter);
			ck = container_of(pos, struct bkey_cached, hash);

			if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
				bc->skipped_dirty++;
			} else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) {
				clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
				bc->skipped_accessed++;
			} else if (!bkey_cached_lock_for_evict(ck)) {
				bc->skipped_lock_fail++;
			} else if (bkey_cached_evict(bc, ck)) {
				bkey_cached_free(bc, ck);
				bc->freed++;
				freed++;
			} else {
				six_unlock_write(&ck->c.lock);
				six_unlock_intent(&ck->c.lock);
			}

			scanned++;
			if (scanned >= nr)
				goto out;

			pos = next;
		}

		iter++;
		if (iter >= tbl->size)
			iter = 0;
	} while (scanned < nr && iter != start);
out:
	bc->shrink_iter = iter;

	rcu_read_unlock();
	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);

	return freed;
}

static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
						struct shrink_control *sc)
{
	struct bch_fs *c = shrink->private_data;
	struct btree_key_cache *bc = &c->btree_key_cache;
	long nr = atomic_long_read(&bc->nr_keys) -
		  atomic_long_read(&bc->nr_dirty);

	/*
	 * Avoid hammering our shrinker too much if it's nearly empty - the
	 * shrinker code doesn't take into account how big our cache is, if it's
	 * mostly empty but the system is under memory pressure it causes nasty
	 * lock contention:
	 */
	nr -= 128;

	return max(0L, nr);
}

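/*
 * Filesystem shutdown: evict and free every remaining cached key. At this
 * point nr_dirty should be zero - dirty keys should already have been flushed
 * back to the btree - which the panic checks below verify.
 */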
void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
	struct bucket_table *tbl;
	struct bkey_cached *ck;
	struct rhash_head *pos;
	LIST_HEAD(items);
	unsigned i;

	shrinker_free(bc->shrink);

	/*
	 * The loop is needed to guard against racing with rehash:
	 */
	while (atomic_long_read(&bc->nr_keys)) {
		rcu_read_lock();
		tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
		if (tbl) {
			if (tbl->nest) {
				/* wait for in progress rehash */
				rcu_read_unlock();
				mutex_lock(&bc->table.mutex);
				mutex_unlock(&bc->table.mutex);
				rcu_read_lock();
				continue;
			}
			for (i = 0; i < tbl->size; i++)
				while (pos = rht_ptr_rcu(&tbl->buckets[i]), !rht_is_a_nulls(pos)) {
					ck = container_of(pos, struct bkey_cached, hash);
					BUG_ON(!bkey_cached_evict(bc, ck));
					kfree(ck->k);
					kmem_cache_free(bch2_key_cache, ck);
				}
		}
		rcu_read_unlock();
	}

	if (atomic_long_read(&bc->nr_dirty) &&
	    !bch2_journal_error(&c->journal) &&
	    test_bit(BCH_FS_was_rw, &c->flags))
		panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n",
		      atomic_long_read(&bc->nr_dirty));

	if (atomic_long_read(&bc->nr_keys))
		panic("btree key cache shutdown error: nr_keys nonzero (%li)\n",
		      atomic_long_read(&bc->nr_keys));

	if (bc->table_init_done)
		rhashtable_destroy(&bc->table);

	rcu_pending_exit(&bc->pending[0]);
	rcu_pending_exit(&bc->pending[1]);

	free_percpu(bc->nr_pending);
}

void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
{
}

int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
	struct shrinker *shrink;

	bc->nr_pending = alloc_percpu(size_t);
	if (!bc->nr_pending)
		return -BCH_ERR_ENOMEM_fs_btree_cache_init;

	if (rcu_pending_init(&bc->pending[0], &c->btree_trans_barrier, __bkey_cached_free) ||
	    rcu_pending_init(&bc->pending[1], &c->btree_trans_barrier, __bkey_cached_free))
		return -BCH_ERR_ENOMEM_fs_btree_cache_init;

	if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
		return -BCH_ERR_ENOMEM_fs_btree_cache_init;

	bc->table_init_done = true;

	shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name);
	if (!shrink)
		return -BCH_ERR_ENOMEM_fs_btree_cache_init;
	bc->shrink = shrink;
	shrink->count_objects	= bch2_btree_key_cache_count;
	shrink->scan_objects	= bch2_btree_key_cache_scan;
	shrink->batch		= 1 << 14;
	shrink->seeks		= 0;
	shrink->private_data	= c;
	shrinker_register(shrink);
	return 0;
}

void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc)
{
	printbuf_tabstop_push(out, 24);
	printbuf_tabstop_push(out, 12);

	prt_printf(out, "keys:\t%lu\r\n",		atomic_long_read(&bc->nr_keys));
	prt_printf(out, "dirty:\t%lu\r\n",		atomic_long_read(&bc->nr_dirty));
	prt_printf(out, "table size:\t%u\r\n",		bc->table.tbl->size);
	prt_newline(out);
	prt_printf(out, "shrinker:\n");
	prt_printf(out, "requested_to_free:\t%lu\r\n",	bc->requested_to_free);
	prt_printf(out, "freed:\t%lu\r\n",		bc->freed);
	prt_printf(out, "skipped_dirty:\t%lu\r\n",	bc->skipped_dirty);
	prt_printf(out, "skipped_accessed:\t%lu\r\n",	bc->skipped_accessed);
	prt_printf(out, "skipped_lock_fail:\t%lu\r\n",	bc->skipped_lock_fail);
	prt_newline(out);
	prt_printf(out, "pending:\t%zu\r\n",		per_cpu_sum(bc->nr_pending));
}

void bch2_btree_key_cache_exit(void)
{
	kmem_cache_destroy(bch2_key_cache);
}

int __init bch2_btree_key_cache_init(void)
{
	bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT);
	if (!bch2_key_cache)
		return -ENOMEM;

	return 0;
}