1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "bcachefs.h" 4 #include "btree_cache.h" 5 #include "btree_iter.h" 6 #include "btree_key_cache.h" 7 #include "btree_locking.h" 8 #include "btree_update.h" 9 #include "errcode.h" 10 #include "error.h" 11 #include "journal.h" 12 #include "journal_reclaim.h" 13 #include "trace.h" 14 15 #include <linux/sched/mm.h> 16 17 static inline bool btree_uses_pcpu_readers(enum btree_id id) 18 { 19 return id == BTREE_ID_subvolumes; 20 } 21 22 static struct kmem_cache *bch2_key_cache; 23 24 static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, 25 const void *obj) 26 { 27 const struct bkey_cached *ck = obj; 28 const struct bkey_cached_key *key = arg->key; 29 30 return ck->key.btree_id != key->btree_id || 31 !bpos_eq(ck->key.pos, key->pos); 32 } 33 34 static const struct rhashtable_params bch2_btree_key_cache_params = { 35 .head_offset = offsetof(struct bkey_cached, hash), 36 .key_offset = offsetof(struct bkey_cached, key), 37 .key_len = sizeof(struct bkey_cached_key), 38 .obj_cmpfn = bch2_btree_key_cache_cmp_fn, 39 .automatic_shrinking = true, 40 }; 41 42 static inline void btree_path_cached_set(struct btree_trans *trans, struct btree_path *path, 43 struct bkey_cached *ck, 44 enum btree_node_locked_type lock_held) 45 { 46 path->l[0].lock_seq = six_lock_seq(&ck->c.lock); 47 path->l[0].b = (void *) ck; 48 mark_btree_node_locked(trans, path, 0, lock_held); 49 } 50 51 __flatten 52 inline struct bkey_cached * 53 bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) 54 { 55 struct bkey_cached_key key = { 56 .btree_id = btree_id, 57 .pos = pos, 58 }; 59 60 return rhashtable_lookup_fast(&c->btree_key_cache.table, &key, 61 bch2_btree_key_cache_params); 62 } 63 64 static bool bkey_cached_lock_for_evict(struct bkey_cached *ck) 65 { 66 if (!six_trylock_intent(&ck->c.lock)) 67 return false; 68 69 if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 70 six_unlock_intent(&ck->c.lock); 71 return false; 72 } 73 74 if (!six_trylock_write(&ck->c.lock)) { 75 six_unlock_intent(&ck->c.lock); 76 return false; 77 } 78 79 return true; 80 } 81 82 static void bkey_cached_evict(struct btree_key_cache *c, 83 struct bkey_cached *ck) 84 { 85 BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash, 86 bch2_btree_key_cache_params)); 87 memset(&ck->key, ~0, sizeof(ck->key)); 88 89 atomic_long_dec(&c->nr_keys); 90 } 91 92 static void bkey_cached_free(struct btree_key_cache *bc, 93 struct bkey_cached *ck) 94 { 95 struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); 96 97 BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); 98 99 ck->btree_trans_barrier_seq = 100 start_poll_synchronize_srcu(&c->btree_trans_barrier); 101 102 if (ck->c.lock.readers) { 103 list_move_tail(&ck->list, &bc->freed_pcpu); 104 bc->nr_freed_pcpu++; 105 } else { 106 list_move_tail(&ck->list, &bc->freed_nonpcpu); 107 bc->nr_freed_nonpcpu++; 108 } 109 atomic_long_inc(&bc->nr_freed); 110 111 kfree(ck->k); 112 ck->k = NULL; 113 ck->u64s = 0; 114 115 six_unlock_write(&ck->c.lock); 116 six_unlock_intent(&ck->c.lock); 117 } 118 119 #ifdef __KERNEL__ 120 static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc, 121 struct bkey_cached *ck) 122 { 123 struct bkey_cached *pos; 124 125 bc->nr_freed_nonpcpu++; 126 127 list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) { 128 if (ULONG_CMP_GE(ck->btree_trans_barrier_seq, 129 pos->btree_trans_barrier_seq)) { 130 list_move(&ck->list, &pos->list); 131 return; 132 } 133 } 134 135 list_move(&ck->list, &bc->freed_nonpcpu); 136 } 137 #endif 138 139 static void bkey_cached_move_to_freelist(struct btree_key_cache *bc, 140 struct bkey_cached *ck) 141 { 142 BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); 143 144 if (!ck->c.lock.readers) { 145 #ifdef __KERNEL__ 146 struct btree_key_cache_freelist *f; 147 bool freed = false; 148 149 preempt_disable(); 150 f = this_cpu_ptr(bc->pcpu_freed); 151 152 if (f->nr < ARRAY_SIZE(f->objs)) { 153 f->objs[f->nr++] = ck; 154 freed = true; 155 } 156 preempt_enable(); 157 158 if (!freed) { 159 mutex_lock(&bc->lock); 160 preempt_disable(); 161 f = this_cpu_ptr(bc->pcpu_freed); 162 163 while (f->nr > ARRAY_SIZE(f->objs) / 2) { 164 struct bkey_cached *ck2 = f->objs[--f->nr]; 165 166 __bkey_cached_move_to_freelist_ordered(bc, ck2); 167 } 168 preempt_enable(); 169 170 __bkey_cached_move_to_freelist_ordered(bc, ck); 171 mutex_unlock(&bc->lock); 172 } 173 #else 174 mutex_lock(&bc->lock); 175 list_move_tail(&ck->list, &bc->freed_nonpcpu); 176 bc->nr_freed_nonpcpu++; 177 mutex_unlock(&bc->lock); 178 #endif 179 } else { 180 mutex_lock(&bc->lock); 181 list_move_tail(&ck->list, &bc->freed_pcpu); 182 bc->nr_freed_pcpu++; 183 mutex_unlock(&bc->lock); 184 } 185 } 186 187 static void bkey_cached_free_fast(struct btree_key_cache *bc, 188 struct bkey_cached *ck) 189 { 190 struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); 191 192 ck->btree_trans_barrier_seq = 193 start_poll_synchronize_srcu(&c->btree_trans_barrier); 194 195 list_del_init(&ck->list); 196 atomic_long_inc(&bc->nr_freed); 197 198 kfree(ck->k); 199 ck->k = NULL; 200 ck->u64s = 0; 201 202 bkey_cached_move_to_freelist(bc, ck); 203 204 six_unlock_write(&ck->c.lock); 205 six_unlock_intent(&ck->c.lock); 206 } 207 208 static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp) 209 { 210 struct bkey_cached *ck = kmem_cache_zalloc(bch2_key_cache, gfp); 211 if (unlikely(!ck)) 212 return NULL; 213 ck->k = kmalloc(key_u64s * sizeof(u64), gfp); 214 if (unlikely(!ck->k)) { 215 kmem_cache_free(bch2_key_cache, ck); 216 return NULL; 217 } 218 ck->u64s = key_u64s; 219 return ck; 220 } 221 222 static struct bkey_cached * 223 bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned key_u64s) 224 { 225 struct bch_fs *c = trans->c; 226 struct btree_key_cache *bc = &c->btree_key_cache; 227 struct bkey_cached *ck = NULL; 228 bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id); 229 int ret; 230 231 if (!pcpu_readers) { 232 #ifdef __KERNEL__ 233 struct btree_key_cache_freelist *f; 234 235 preempt_disable(); 236 f = this_cpu_ptr(bc->pcpu_freed); 237 if (f->nr) 238 ck = f->objs[--f->nr]; 239 preempt_enable(); 240 241 if (!ck) { 242 mutex_lock(&bc->lock); 243 preempt_disable(); 244 f = this_cpu_ptr(bc->pcpu_freed); 245 246 while (!list_empty(&bc->freed_nonpcpu) && 247 f->nr < ARRAY_SIZE(f->objs) / 2) { 248 ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); 249 list_del_init(&ck->list); 250 bc->nr_freed_nonpcpu--; 251 f->objs[f->nr++] = ck; 252 } 253 254 ck = f->nr ? f->objs[--f->nr] : NULL; 255 preempt_enable(); 256 mutex_unlock(&bc->lock); 257 } 258 #else 259 mutex_lock(&bc->lock); 260 if (!list_empty(&bc->freed_nonpcpu)) { 261 ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); 262 list_del_init(&ck->list); 263 bc->nr_freed_nonpcpu--; 264 } 265 mutex_unlock(&bc->lock); 266 #endif 267 } else { 268 mutex_lock(&bc->lock); 269 if (!list_empty(&bc->freed_pcpu)) { 270 ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list); 271 list_del_init(&ck->list); 272 bc->nr_freed_pcpu--; 273 } 274 mutex_unlock(&bc->lock); 275 } 276 277 if (ck) { 278 ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_); 279 if (unlikely(ret)) { 280 bkey_cached_move_to_freelist(bc, ck); 281 return ERR_PTR(ret); 282 } 283 284 btree_path_cached_set(trans, path, ck, BTREE_NODE_INTENT_LOCKED); 285 286 ret = bch2_btree_node_lock_write(trans, path, &ck->c); 287 if (unlikely(ret)) { 288 btree_node_unlock(trans, path, 0); 289 bkey_cached_move_to_freelist(bc, ck); 290 return ERR_PTR(ret); 291 } 292 293 return ck; 294 } 295 296 ck = allocate_dropping_locks(trans, ret, 297 __bkey_cached_alloc(key_u64s, _gfp)); 298 if (ret) { 299 if (ck) 300 kfree(ck->k); 301 kmem_cache_free(bch2_key_cache, ck); 302 return ERR_PTR(ret); 303 } 304 305 if (!ck) 306 return NULL; 307 308 INIT_LIST_HEAD(&ck->list); 309 bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0); 310 311 ck->c.cached = true; 312 BUG_ON(!six_trylock_intent(&ck->c.lock)); 313 BUG_ON(!six_trylock_write(&ck->c.lock)); 314 return ck; 315 } 316 317 static struct bkey_cached * 318 bkey_cached_reuse(struct btree_key_cache *c) 319 { 320 struct bucket_table *tbl; 321 struct rhash_head *pos; 322 struct bkey_cached *ck; 323 unsigned i; 324 325 mutex_lock(&c->lock); 326 rcu_read_lock(); 327 tbl = rht_dereference_rcu(c->table.tbl, &c->table); 328 for (i = 0; i < tbl->size; i++) 329 rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { 330 if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && 331 bkey_cached_lock_for_evict(ck)) { 332 bkey_cached_evict(c, ck); 333 goto out; 334 } 335 } 336 ck = NULL; 337 out: 338 rcu_read_unlock(); 339 mutex_unlock(&c->lock); 340 return ck; 341 } 342 343 static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *path, 344 struct bkey_s_c k) 345 { 346 struct bch_fs *c = trans->c; 347 struct btree_key_cache *bc = &c->btree_key_cache; 348 349 /* 350 * bch2_varint_decode can read past the end of the buffer by at 351 * most 7 bytes (it won't be used): 352 */ 353 unsigned key_u64s = k.k->u64s + 1; 354 355 /* 356 * Allocate some extra space so that the transaction commit path is less 357 * likely to have to reallocate, since that requires a transaction 358 * restart: 359 */ 360 key_u64s = min(256U, (key_u64s * 3) / 2); 361 key_u64s = roundup_pow_of_two(key_u64s); 362 363 struct bkey_cached *ck = bkey_cached_alloc(trans, path, key_u64s); 364 int ret = PTR_ERR_OR_ZERO(ck); 365 if (ret) 366 return ret; 367 368 if (unlikely(!ck)) { 369 ck = bkey_cached_reuse(bc); 370 if (unlikely(!ck)) { 371 bch_err(c, "error allocating memory for key cache item, btree %s", 372 bch2_btree_id_str(path->btree_id)); 373 return -BCH_ERR_ENOMEM_btree_key_cache_create; 374 } 375 } 376 377 ck->c.level = 0; 378 ck->c.btree_id = path->btree_id; 379 ck->key.btree_id = path->btree_id; 380 ck->key.pos = path->pos; 381 ck->flags = 1U << BKEY_CACHED_ACCESSED; 382 383 if (unlikely(key_u64s > ck->u64s)) { 384 mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); 385 386 struct bkey_i *new_k = allocate_dropping_locks(trans, ret, 387 kmalloc(key_u64s * sizeof(u64), _gfp)); 388 if (unlikely(!new_k)) { 389 bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", 390 bch2_btree_id_str(ck->key.btree_id), key_u64s); 391 ret = -BCH_ERR_ENOMEM_btree_key_cache_fill; 392 } else if (ret) { 393 kfree(new_k); 394 goto err; 395 } 396 397 kfree(ck->k); 398 ck->k = new_k; 399 ck->u64s = key_u64s; 400 } 401 402 bkey_reassemble(ck->k, k); 403 404 ret = rhashtable_lookup_insert_fast(&bc->table, &ck->hash, bch2_btree_key_cache_params); 405 if (unlikely(ret)) /* raced with another fill? */ 406 goto err; 407 408 atomic_long_inc(&bc->nr_keys); 409 six_unlock_write(&ck->c.lock); 410 411 enum six_lock_type lock_want = __btree_lock_want(path, 0); 412 if (lock_want == SIX_LOCK_read) 413 six_lock_downgrade(&ck->c.lock); 414 btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want); 415 path->uptodate = BTREE_ITER_UPTODATE; 416 return 0; 417 err: 418 bkey_cached_free_fast(bc, ck); 419 mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); 420 421 return ret; 422 } 423 424 static noinline int btree_key_cache_fill(struct btree_trans *trans, 425 struct btree_path *ck_path, 426 unsigned flags) 427 { 428 if (flags & BTREE_ITER_cached_nofill) { 429 ck_path->uptodate = BTREE_ITER_UPTODATE; 430 return 0; 431 } 432 433 struct bch_fs *c = trans->c; 434 struct btree_iter iter; 435 struct bkey_s_c k; 436 int ret; 437 438 bch2_trans_iter_init(trans, &iter, ck_path->btree_id, ck_path->pos, 439 BTREE_ITER_key_cache_fill| 440 BTREE_ITER_cached_nofill); 441 iter.flags &= ~BTREE_ITER_with_journal; 442 k = bch2_btree_iter_peek_slot(&iter); 443 ret = bkey_err(k); 444 if (ret) 445 goto err; 446 447 /* Recheck after btree lookup, before allocating: */ 448 ret = bch2_btree_key_cache_find(c, ck_path->btree_id, ck_path->pos) ? -EEXIST : 0; 449 if (unlikely(ret)) 450 goto out; 451 452 ret = btree_key_cache_create(trans, ck_path, k); 453 if (ret) 454 goto err; 455 out: 456 /* We're not likely to need this iterator again: */ 457 bch2_set_btree_iter_dontneed(&iter); 458 err: 459 bch2_trans_iter_exit(trans, &iter); 460 return ret; 461 } 462 463 static inline int btree_path_traverse_cached_fast(struct btree_trans *trans, 464 struct btree_path *path) 465 { 466 struct bch_fs *c = trans->c; 467 struct bkey_cached *ck; 468 retry: 469 ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); 470 if (!ck) 471 return -ENOENT; 472 473 enum six_lock_type lock_want = __btree_lock_want(path, 0); 474 475 int ret = btree_node_lock(trans, path, (void *) ck, 0, lock_want, _THIS_IP_); 476 if (ret) 477 return ret; 478 479 if (ck->key.btree_id != path->btree_id || 480 !bpos_eq(ck->key.pos, path->pos)) { 481 six_unlock_type(&ck->c.lock, lock_want); 482 goto retry; 483 } 484 485 if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) 486 set_bit(BKEY_CACHED_ACCESSED, &ck->flags); 487 488 btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want); 489 path->uptodate = BTREE_ITER_UPTODATE; 490 return 0; 491 } 492 493 int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path, 494 unsigned flags) 495 { 496 EBUG_ON(path->level); 497 498 path->l[1].b = NULL; 499 500 if (bch2_btree_node_relock_notrace(trans, path, 0)) { 501 path->uptodate = BTREE_ITER_UPTODATE; 502 return 0; 503 } 504 505 int ret; 506 do { 507 ret = btree_path_traverse_cached_fast(trans, path); 508 if (unlikely(ret == -ENOENT)) 509 ret = btree_key_cache_fill(trans, path, flags); 510 } while (ret == -EEXIST); 511 512 if (unlikely(ret)) { 513 path->uptodate = BTREE_ITER_NEED_TRAVERSE; 514 if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) { 515 btree_node_unlock(trans, path, 0); 516 path->l[0].b = ERR_PTR(ret); 517 } 518 } 519 return ret; 520 } 521 522 static int btree_key_cache_flush_pos(struct btree_trans *trans, 523 struct bkey_cached_key key, 524 u64 journal_seq, 525 unsigned commit_flags, 526 bool evict) 527 { 528 struct bch_fs *c = trans->c; 529 struct journal *j = &c->journal; 530 struct btree_iter c_iter, b_iter; 531 struct bkey_cached *ck = NULL; 532 int ret; 533 534 bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos, 535 BTREE_ITER_slots| 536 BTREE_ITER_intent| 537 BTREE_ITER_all_snapshots); 538 bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos, 539 BTREE_ITER_cached| 540 BTREE_ITER_intent); 541 b_iter.flags &= ~BTREE_ITER_with_key_cache; 542 543 ret = bch2_btree_iter_traverse(&c_iter); 544 if (ret) 545 goto out; 546 547 ck = (void *) btree_iter_path(trans, &c_iter)->l[0].b; 548 if (!ck) 549 goto out; 550 551 if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 552 if (evict) 553 goto evict; 554 goto out; 555 } 556 557 if (journal_seq && ck->journal.seq != journal_seq) 558 goto out; 559 560 trans->journal_res.seq = ck->journal.seq; 561 562 /* 563 * If we're at the end of the journal, we really want to free up space 564 * in the journal right away - we don't want to pin that old journal 565 * sequence number with a new btree node write, we want to re-journal 566 * the update 567 */ 568 if (ck->journal.seq == journal_last_seq(j)) 569 commit_flags |= BCH_WATERMARK_reclaim; 570 571 if (ck->journal.seq != journal_last_seq(j) || 572 !test_bit(JOURNAL_space_low, &c->journal.flags)) 573 commit_flags |= BCH_TRANS_COMMIT_no_journal_res; 574 575 ret = bch2_btree_iter_traverse(&b_iter) ?: 576 bch2_trans_update(trans, &b_iter, ck->k, 577 BTREE_UPDATE_key_cache_reclaim| 578 BTREE_UPDATE_internal_snapshot_node| 579 BTREE_TRIGGER_norun) ?: 580 bch2_trans_commit(trans, NULL, NULL, 581 BCH_TRANS_COMMIT_no_check_rw| 582 BCH_TRANS_COMMIT_no_enospc| 583 commit_flags); 584 585 bch2_fs_fatal_err_on(ret && 586 !bch2_err_matches(ret, BCH_ERR_transaction_restart) && 587 !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) && 588 !bch2_journal_error(j), c, 589 "flushing key cache: %s", bch2_err_str(ret)); 590 if (ret) 591 goto out; 592 593 bch2_journal_pin_drop(j, &ck->journal); 594 595 struct btree_path *path = btree_iter_path(trans, &c_iter); 596 BUG_ON(!btree_node_locked(path, 0)); 597 598 if (!evict) { 599 if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 600 clear_bit(BKEY_CACHED_DIRTY, &ck->flags); 601 atomic_long_dec(&c->btree_key_cache.nr_dirty); 602 } 603 } else { 604 struct btree_path *path2; 605 unsigned i; 606 evict: 607 trans_for_each_path(trans, path2, i) 608 if (path2 != path) 609 __bch2_btree_path_unlock(trans, path2); 610 611 bch2_btree_node_lock_write_nofail(trans, path, &ck->c); 612 613 if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 614 clear_bit(BKEY_CACHED_DIRTY, &ck->flags); 615 atomic_long_dec(&c->btree_key_cache.nr_dirty); 616 } 617 618 mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); 619 bkey_cached_evict(&c->btree_key_cache, ck); 620 bkey_cached_free_fast(&c->btree_key_cache, ck); 621 } 622 out: 623 bch2_trans_iter_exit(trans, &b_iter); 624 bch2_trans_iter_exit(trans, &c_iter); 625 return ret; 626 } 627 628 int bch2_btree_key_cache_journal_flush(struct journal *j, 629 struct journal_entry_pin *pin, u64 seq) 630 { 631 struct bch_fs *c = container_of(j, struct bch_fs, journal); 632 struct bkey_cached *ck = 633 container_of(pin, struct bkey_cached, journal); 634 struct bkey_cached_key key; 635 struct btree_trans *trans = bch2_trans_get(c); 636 int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); 637 int ret = 0; 638 639 btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read); 640 key = ck->key; 641 642 if (ck->journal.seq != seq || 643 !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 644 six_unlock_read(&ck->c.lock); 645 goto unlock; 646 } 647 648 if (ck->seq != seq) { 649 bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal, 650 bch2_btree_key_cache_journal_flush); 651 six_unlock_read(&ck->c.lock); 652 goto unlock; 653 } 654 six_unlock_read(&ck->c.lock); 655 656 ret = lockrestart_do(trans, 657 btree_key_cache_flush_pos(trans, key, seq, 658 BCH_TRANS_COMMIT_journal_reclaim, false)); 659 unlock: 660 srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); 661 662 bch2_trans_put(trans); 663 return ret; 664 } 665 666 bool bch2_btree_insert_key_cached(struct btree_trans *trans, 667 unsigned flags, 668 struct btree_insert_entry *insert_entry) 669 { 670 struct bch_fs *c = trans->c; 671 struct bkey_cached *ck = (void *) (trans->paths + insert_entry->path)->l[0].b; 672 struct bkey_i *insert = insert_entry->k; 673 bool kick_reclaim = false; 674 675 BUG_ON(insert->k.u64s > ck->u64s); 676 677 bkey_copy(ck->k, insert); 678 679 if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 680 EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); 681 set_bit(BKEY_CACHED_DIRTY, &ck->flags); 682 atomic_long_inc(&c->btree_key_cache.nr_dirty); 683 684 if (bch2_nr_btree_keys_need_flush(c)) 685 kick_reclaim = true; 686 } 687 688 /* 689 * To minimize lock contention, we only add the journal pin here and 690 * defer pin updates to the flush callback via ->seq. Be careful not to 691 * update ->seq on nojournal commits because we don't want to update the 692 * pin to a seq that doesn't include journal updates on disk. Otherwise 693 * we risk losing the update after a crash. 694 * 695 * The only exception is if the pin is not active in the first place. We 696 * have to add the pin because journal reclaim drives key cache 697 * flushing. The flush callback will not proceed unless ->seq matches 698 * the latest pin, so make sure it starts with a consistent value. 699 */ 700 if (!(insert_entry->flags & BTREE_UPDATE_nojournal) || 701 !journal_pin_active(&ck->journal)) { 702 ck->seq = trans->journal_res.seq; 703 } 704 bch2_journal_pin_add(&c->journal, trans->journal_res.seq, 705 &ck->journal, bch2_btree_key_cache_journal_flush); 706 707 if (kick_reclaim) 708 journal_reclaim_kick(&c->journal); 709 return true; 710 } 711 712 void bch2_btree_key_cache_drop(struct btree_trans *trans, 713 struct btree_path *path) 714 { 715 struct bch_fs *c = trans->c; 716 struct btree_key_cache *bc = &c->btree_key_cache; 717 struct bkey_cached *ck = (void *) path->l[0].b; 718 719 /* 720 * We just did an update to the btree, bypassing the key cache: the key 721 * cache key is now stale and must be dropped, even if dirty: 722 */ 723 if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 724 clear_bit(BKEY_CACHED_DIRTY, &ck->flags); 725 atomic_long_dec(&c->btree_key_cache.nr_dirty); 726 bch2_journal_pin_drop(&c->journal, &ck->journal); 727 } 728 729 bkey_cached_evict(bc, ck); 730 bkey_cached_free_fast(bc, ck); 731 732 mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); 733 btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); 734 } 735 736 static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, 737 struct shrink_control *sc) 738 { 739 struct bch_fs *c = shrink->private_data; 740 struct btree_key_cache *bc = &c->btree_key_cache; 741 struct bucket_table *tbl; 742 struct bkey_cached *ck, *t; 743 size_t scanned = 0, freed = 0, nr = sc->nr_to_scan; 744 unsigned start, flags; 745 int srcu_idx; 746 747 mutex_lock(&bc->lock); 748 bc->requested_to_free += sc->nr_to_scan; 749 750 srcu_idx = srcu_read_lock(&c->btree_trans_barrier); 751 flags = memalloc_nofs_save(); 752 753 /* 754 * Newest freed entries are at the end of the list - once we hit one 755 * that's too new to be freed, we can bail out: 756 */ 757 list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) { 758 if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, 759 ck->btree_trans_barrier_seq)) 760 break; 761 762 list_del(&ck->list); 763 six_lock_exit(&ck->c.lock); 764 kmem_cache_free(bch2_key_cache, ck); 765 atomic_long_dec(&bc->nr_freed); 766 bc->nr_freed_nonpcpu--; 767 bc->freed++; 768 } 769 770 list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) { 771 if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, 772 ck->btree_trans_barrier_seq)) 773 break; 774 775 list_del(&ck->list); 776 six_lock_exit(&ck->c.lock); 777 kmem_cache_free(bch2_key_cache, ck); 778 atomic_long_dec(&bc->nr_freed); 779 bc->nr_freed_pcpu--; 780 bc->freed++; 781 } 782 783 rcu_read_lock(); 784 tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); 785 if (bc->shrink_iter >= tbl->size) 786 bc->shrink_iter = 0; 787 start = bc->shrink_iter; 788 789 do { 790 struct rhash_head *pos, *next; 791 792 pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter)); 793 794 while (!rht_is_a_nulls(pos)) { 795 next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter); 796 ck = container_of(pos, struct bkey_cached, hash); 797 798 if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 799 bc->skipped_dirty++; 800 } else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) { 801 clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); 802 bc->skipped_accessed++; 803 } else if (!bkey_cached_lock_for_evict(ck)) { 804 bc->skipped_lock_fail++; 805 } else { 806 bkey_cached_evict(bc, ck); 807 bkey_cached_free(bc, ck); 808 bc->moved_to_freelist++; 809 freed++; 810 } 811 812 scanned++; 813 if (scanned >= nr) 814 break; 815 816 pos = next; 817 } 818 819 bc->shrink_iter++; 820 if (bc->shrink_iter >= tbl->size) 821 bc->shrink_iter = 0; 822 } while (scanned < nr && bc->shrink_iter != start); 823 824 rcu_read_unlock(); 825 memalloc_nofs_restore(flags); 826 srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); 827 mutex_unlock(&bc->lock); 828 829 return freed; 830 } 831 832 static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink, 833 struct shrink_control *sc) 834 { 835 struct bch_fs *c = shrink->private_data; 836 struct btree_key_cache *bc = &c->btree_key_cache; 837 long nr = atomic_long_read(&bc->nr_keys) - 838 atomic_long_read(&bc->nr_dirty); 839 840 /* 841 * Avoid hammering our shrinker too much if it's nearly empty - the 842 * shrinker code doesn't take into account how big our cache is, if it's 843 * mostly empty but the system is under memory pressure it causes nasty 844 * lock contention: 845 */ 846 nr -= 128; 847 848 return max(0L, nr); 849 } 850 851 void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) 852 { 853 struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); 854 struct bucket_table *tbl; 855 struct bkey_cached *ck, *n; 856 struct rhash_head *pos; 857 LIST_HEAD(items); 858 unsigned i; 859 #ifdef __KERNEL__ 860 int cpu; 861 #endif 862 863 shrinker_free(bc->shrink); 864 865 mutex_lock(&bc->lock); 866 867 /* 868 * The loop is needed to guard against racing with rehash: 869 */ 870 while (atomic_long_read(&bc->nr_keys)) { 871 rcu_read_lock(); 872 tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); 873 if (tbl) 874 for (i = 0; i < tbl->size; i++) 875 rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { 876 bkey_cached_evict(bc, ck); 877 list_add(&ck->list, &items); 878 } 879 rcu_read_unlock(); 880 } 881 882 #ifdef __KERNEL__ 883 if (bc->pcpu_freed) { 884 for_each_possible_cpu(cpu) { 885 struct btree_key_cache_freelist *f = 886 per_cpu_ptr(bc->pcpu_freed, cpu); 887 888 for (i = 0; i < f->nr; i++) { 889 ck = f->objs[i]; 890 list_add(&ck->list, &items); 891 } 892 } 893 } 894 #endif 895 896 BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu); 897 BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu); 898 899 list_splice(&bc->freed_pcpu, &items); 900 list_splice(&bc->freed_nonpcpu, &items); 901 902 mutex_unlock(&bc->lock); 903 904 list_for_each_entry_safe(ck, n, &items, list) { 905 cond_resched(); 906 907 list_del(&ck->list); 908 kfree(ck->k); 909 six_lock_exit(&ck->c.lock); 910 kmem_cache_free(bch2_key_cache, ck); 911 } 912 913 if (atomic_long_read(&bc->nr_dirty) && 914 !bch2_journal_error(&c->journal) && 915 test_bit(BCH_FS_was_rw, &c->flags)) 916 panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n", 917 atomic_long_read(&bc->nr_dirty)); 918 919 if (atomic_long_read(&bc->nr_keys)) 920 panic("btree key cache shutdown error: nr_keys nonzero (%li)\n", 921 atomic_long_read(&bc->nr_keys)); 922 923 if (bc->table_init_done) 924 rhashtable_destroy(&bc->table); 925 926 free_percpu(bc->pcpu_freed); 927 } 928 929 void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) 930 { 931 mutex_init(&c->lock); 932 INIT_LIST_HEAD(&c->freed_pcpu); 933 INIT_LIST_HEAD(&c->freed_nonpcpu); 934 } 935 936 int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) 937 { 938 struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); 939 struct shrinker *shrink; 940 941 #ifdef __KERNEL__ 942 bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist); 943 if (!bc->pcpu_freed) 944 return -BCH_ERR_ENOMEM_fs_btree_cache_init; 945 #endif 946 947 if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params)) 948 return -BCH_ERR_ENOMEM_fs_btree_cache_init; 949 950 bc->table_init_done = true; 951 952 shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name); 953 if (!shrink) 954 return -BCH_ERR_ENOMEM_fs_btree_cache_init; 955 bc->shrink = shrink; 956 shrink->count_objects = bch2_btree_key_cache_count; 957 shrink->scan_objects = bch2_btree_key_cache_scan; 958 shrink->batch = 1 << 14; 959 shrink->seeks = 0; 960 shrink->private_data = c; 961 shrinker_register(shrink); 962 return 0; 963 } 964 965 void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc) 966 { 967 struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); 968 969 printbuf_tabstop_push(out, 24); 970 printbuf_tabstop_push(out, 12); 971 972 unsigned flags = memalloc_nofs_save(); 973 mutex_lock(&bc->lock); 974 prt_printf(out, "keys:\t%lu\r\n", atomic_long_read(&bc->nr_keys)); 975 prt_printf(out, "dirty:\t%lu\r\n", atomic_long_read(&bc->nr_dirty)); 976 prt_printf(out, "freelist:\t%lu\r\n", atomic_long_read(&bc->nr_freed)); 977 prt_printf(out, "nonpcpu freelist:\t%zu\r\n", bc->nr_freed_nonpcpu); 978 prt_printf(out, "pcpu freelist:\t%zu\r\n", bc->nr_freed_pcpu); 979 980 prt_printf(out, "\nshrinker:\n"); 981 prt_printf(out, "requested_to_free:\t%lu\r\n", bc->requested_to_free); 982 prt_printf(out, "freed:\t%lu\r\n", bc->freed); 983 prt_printf(out, "moved_to_freelist:\t%lu\r\n", bc->moved_to_freelist); 984 prt_printf(out, "skipped_dirty:\t%lu\r\n", bc->skipped_dirty); 985 prt_printf(out, "skipped_accessed:\t%lu\r\n", bc->skipped_accessed); 986 prt_printf(out, "skipped_lock_fail:\t%lu\r\n", bc->skipped_lock_fail); 987 988 prt_printf(out, "srcu seq:\t%lu\r\n", get_state_synchronize_srcu(&c->btree_trans_barrier)); 989 990 struct bkey_cached *ck; 991 unsigned iter = 0; 992 list_for_each_entry(ck, &bc->freed_nonpcpu, list) { 993 prt_printf(out, "freed_nonpcpu:\t%lu\r\n", ck->btree_trans_barrier_seq); 994 if (++iter > 10) 995 break; 996 } 997 998 iter = 0; 999 list_for_each_entry(ck, &bc->freed_pcpu, list) { 1000 prt_printf(out, "freed_pcpu:\t%lu\r\n", ck->btree_trans_barrier_seq); 1001 if (++iter > 10) 1002 break; 1003 } 1004 mutex_unlock(&bc->lock); 1005 memalloc_flags_restore(flags); 1006 } 1007 1008 void bch2_btree_key_cache_exit(void) 1009 { 1010 kmem_cache_destroy(bch2_key_cache); 1011 } 1012 1013 int __init bch2_btree_key_cache_init(void) 1014 { 1015 bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT); 1016 if (!bch2_key_cache) 1017 return -ENOMEM; 1018 1019 return 0; 1020 } 1021