1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "bcachefs.h" 4 #include "btree_cache.h" 5 #include "btree_iter.h" 6 #include "btree_key_cache.h" 7 #include "btree_locking.h" 8 #include "btree_update.h" 9 #include "errcode.h" 10 #include "error.h" 11 #include "journal.h" 12 #include "journal_reclaim.h" 13 #include "trace.h" 14 15 #include <linux/sched/mm.h> 16 17 static inline bool btree_uses_pcpu_readers(enum btree_id id) 18 { 19 return id == BTREE_ID_subvolumes; 20 } 21 22 static struct kmem_cache *bch2_key_cache; 23 24 static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, 25 const void *obj) 26 { 27 const struct bkey_cached *ck = obj; 28 const struct bkey_cached_key *key = arg->key; 29 30 return ck->key.btree_id != key->btree_id || 31 !bpos_eq(ck->key.pos, key->pos); 32 } 33 34 static const struct rhashtable_params bch2_btree_key_cache_params = { 35 .head_offset = offsetof(struct bkey_cached, hash), 36 .key_offset = offsetof(struct bkey_cached, key), 37 .key_len = sizeof(struct bkey_cached_key), 38 .obj_cmpfn = bch2_btree_key_cache_cmp_fn, 39 .automatic_shrinking = true, 40 }; 41 42 static inline void btree_path_cached_set(struct btree_trans *trans, struct btree_path *path, 43 struct bkey_cached *ck, 44 enum btree_node_locked_type lock_held) 45 { 46 path->l[0].lock_seq = six_lock_seq(&ck->c.lock); 47 path->l[0].b = (void *) ck; 48 mark_btree_node_locked(trans, path, 0, lock_held); 49 } 50 51 __flatten 52 inline struct bkey_cached * 53 bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) 54 { 55 struct bkey_cached_key key = { 56 .btree_id = btree_id, 57 .pos = pos, 58 }; 59 60 return rhashtable_lookup_fast(&c->btree_key_cache.table, &key, 61 bch2_btree_key_cache_params); 62 } 63 64 static bool bkey_cached_lock_for_evict(struct bkey_cached *ck) 65 { 66 if (!six_trylock_intent(&ck->c.lock)) 67 return false; 68 69 if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 70 six_unlock_intent(&ck->c.lock); 71 return false; 72 } 73 74 if (!six_trylock_write(&ck->c.lock)) { 75 six_unlock_intent(&ck->c.lock); 76 return false; 77 } 78 79 return true; 80 } 81 82 static void bkey_cached_evict(struct btree_key_cache *c, 83 struct bkey_cached *ck) 84 { 85 BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash, 86 bch2_btree_key_cache_params)); 87 memset(&ck->key, ~0, sizeof(ck->key)); 88 89 atomic_long_dec(&c->nr_keys); 90 } 91 92 static void bkey_cached_free(struct btree_key_cache *bc, 93 struct bkey_cached *ck) 94 { 95 struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); 96 97 BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); 98 99 ck->btree_trans_barrier_seq = 100 start_poll_synchronize_srcu(&c->btree_trans_barrier); 101 102 if (ck->c.lock.readers) { 103 list_move_tail(&ck->list, &bc->freed_pcpu); 104 bc->nr_freed_pcpu++; 105 } else { 106 list_move_tail(&ck->list, &bc->freed_nonpcpu); 107 bc->nr_freed_nonpcpu++; 108 } 109 atomic_long_inc(&bc->nr_freed); 110 111 kfree(ck->k); 112 ck->k = NULL; 113 ck->u64s = 0; 114 115 six_unlock_write(&ck->c.lock); 116 six_unlock_intent(&ck->c.lock); 117 } 118 119 #ifdef __KERNEL__ 120 static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc, 121 struct bkey_cached *ck) 122 { 123 struct bkey_cached *pos; 124 125 bc->nr_freed_nonpcpu++; 126 127 list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) { 128 if (ULONG_CMP_GE(ck->btree_trans_barrier_seq, 129 pos->btree_trans_barrier_seq)) { 130 list_move(&ck->list, &pos->list); 131 return; 132 } 133 } 134 135 list_move(&ck->list, &bc->freed_nonpcpu); 136 } 137 #endif 138 139 static void bkey_cached_move_to_freelist(struct btree_key_cache *bc, 140 struct bkey_cached *ck) 141 { 142 BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); 143 144 if (!ck->c.lock.readers) { 145 #ifdef __KERNEL__ 146 struct btree_key_cache_freelist *f; 147 bool freed = false; 148 149 preempt_disable(); 150 f = this_cpu_ptr(bc->pcpu_freed); 151 152 if (f->nr < ARRAY_SIZE(f->objs)) { 153 f->objs[f->nr++] = ck; 154 freed = true; 155 } 156 preempt_enable(); 157 158 if (!freed) { 159 mutex_lock(&bc->lock); 160 preempt_disable(); 161 f = this_cpu_ptr(bc->pcpu_freed); 162 163 while (f->nr > ARRAY_SIZE(f->objs) / 2) { 164 struct bkey_cached *ck2 = f->objs[--f->nr]; 165 166 __bkey_cached_move_to_freelist_ordered(bc, ck2); 167 } 168 preempt_enable(); 169 170 __bkey_cached_move_to_freelist_ordered(bc, ck); 171 mutex_unlock(&bc->lock); 172 } 173 #else 174 mutex_lock(&bc->lock); 175 list_move_tail(&ck->list, &bc->freed_nonpcpu); 176 bc->nr_freed_nonpcpu++; 177 mutex_unlock(&bc->lock); 178 #endif 179 } else { 180 mutex_lock(&bc->lock); 181 list_move_tail(&ck->list, &bc->freed_pcpu); 182 bc->nr_freed_pcpu++; 183 mutex_unlock(&bc->lock); 184 } 185 } 186 187 static void bkey_cached_free_fast(struct btree_key_cache *bc, 188 struct bkey_cached *ck) 189 { 190 struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); 191 192 ck->btree_trans_barrier_seq = 193 start_poll_synchronize_srcu(&c->btree_trans_barrier); 194 195 list_del_init(&ck->list); 196 atomic_long_inc(&bc->nr_freed); 197 198 kfree(ck->k); 199 ck->k = NULL; 200 ck->u64s = 0; 201 202 bkey_cached_move_to_freelist(bc, ck); 203 204 six_unlock_write(&ck->c.lock); 205 six_unlock_intent(&ck->c.lock); 206 } 207 208 static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp) 209 { 210 struct bkey_cached *ck = kmem_cache_zalloc(bch2_key_cache, gfp); 211 if (unlikely(!ck)) 212 return NULL; 213 ck->k = kmalloc(key_u64s * sizeof(u64), gfp); 214 if (unlikely(!ck->k)) { 215 kmem_cache_free(bch2_key_cache, ck); 216 return NULL; 217 } 218 ck->u64s = key_u64s; 219 return ck; 220 } 221 222 static struct bkey_cached * 223 bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned key_u64s) 224 { 225 struct bch_fs *c = trans->c; 226 struct btree_key_cache *bc = &c->btree_key_cache; 227 struct bkey_cached *ck = NULL; 228 bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id); 229 int ret; 230 231 if (!pcpu_readers) { 232 #ifdef __KERNEL__ 233 struct btree_key_cache_freelist *f; 234 235 preempt_disable(); 236 f = this_cpu_ptr(bc->pcpu_freed); 237 if (f->nr) 238 ck = f->objs[--f->nr]; 239 preempt_enable(); 240 241 if (!ck) { 242 mutex_lock(&bc->lock); 243 preempt_disable(); 244 f = this_cpu_ptr(bc->pcpu_freed); 245 246 while (!list_empty(&bc->freed_nonpcpu) && 247 f->nr < ARRAY_SIZE(f->objs) / 2) { 248 ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); 249 list_del_init(&ck->list); 250 bc->nr_freed_nonpcpu--; 251 f->objs[f->nr++] = ck; 252 } 253 254 ck = f->nr ? f->objs[--f->nr] : NULL; 255 preempt_enable(); 256 mutex_unlock(&bc->lock); 257 } 258 #else 259 mutex_lock(&bc->lock); 260 if (!list_empty(&bc->freed_nonpcpu)) { 261 ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); 262 list_del_init(&ck->list); 263 bc->nr_freed_nonpcpu--; 264 } 265 mutex_unlock(&bc->lock); 266 #endif 267 } else { 268 mutex_lock(&bc->lock); 269 if (!list_empty(&bc->freed_pcpu)) { 270 ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list); 271 list_del_init(&ck->list); 272 bc->nr_freed_pcpu--; 273 } 274 mutex_unlock(&bc->lock); 275 } 276 277 if (ck) { 278 ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_); 279 if (unlikely(ret)) { 280 bkey_cached_move_to_freelist(bc, ck); 281 return ERR_PTR(ret); 282 } 283 284 btree_path_cached_set(trans, path, ck, BTREE_NODE_INTENT_LOCKED); 285 286 ret = bch2_btree_node_lock_write(trans, path, &ck->c); 287 if (unlikely(ret)) { 288 btree_node_unlock(trans, path, 0); 289 bkey_cached_move_to_freelist(bc, ck); 290 return ERR_PTR(ret); 291 } 292 293 return ck; 294 } 295 296 ck = allocate_dropping_locks(trans, ret, 297 __bkey_cached_alloc(key_u64s, _gfp)); 298 if (ret) { 299 if (ck) 300 kfree(ck->k); 301 kmem_cache_free(bch2_key_cache, ck); 302 return ERR_PTR(ret); 303 } 304 305 if (!ck) 306 return NULL; 307 308 INIT_LIST_HEAD(&ck->list); 309 bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0); 310 311 ck->c.cached = true; 312 BUG_ON(!six_trylock_intent(&ck->c.lock)); 313 BUG_ON(!six_trylock_write(&ck->c.lock)); 314 return ck; 315 } 316 317 static struct bkey_cached * 318 bkey_cached_reuse(struct btree_key_cache *c) 319 { 320 struct bucket_table *tbl; 321 struct rhash_head *pos; 322 struct bkey_cached *ck; 323 unsigned i; 324 325 mutex_lock(&c->lock); 326 rcu_read_lock(); 327 tbl = rht_dereference_rcu(c->table.tbl, &c->table); 328 for (i = 0; i < tbl->size; i++) 329 rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { 330 if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && 331 bkey_cached_lock_for_evict(ck)) { 332 bkey_cached_evict(c, ck); 333 goto out; 334 } 335 } 336 ck = NULL; 337 out: 338 rcu_read_unlock(); 339 mutex_unlock(&c->lock); 340 return ck; 341 } 342 343 static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *path, 344 struct bkey_s_c k) 345 { 346 struct bch_fs *c = trans->c; 347 struct btree_key_cache *bc = &c->btree_key_cache; 348 349 /* 350 * bch2_varint_decode can read past the end of the buffer by at 351 * most 7 bytes (it won't be used): 352 */ 353 unsigned key_u64s = k.k->u64s + 1; 354 355 /* 356 * Allocate some extra space so that the transaction commit path is less 357 * likely to have to reallocate, since that requires a transaction 358 * restart: 359 */ 360 key_u64s = min(256U, (key_u64s * 3) / 2); 361 key_u64s = roundup_pow_of_two(key_u64s); 362 363 struct bkey_cached *ck = bkey_cached_alloc(trans, path, key_u64s); 364 int ret = PTR_ERR_OR_ZERO(ck); 365 if (ret) 366 return ret; 367 368 if (unlikely(!ck)) { 369 ck = bkey_cached_reuse(bc); 370 if (unlikely(!ck)) { 371 bch_err(c, "error allocating memory for key cache item, btree %s", 372 bch2_btree_id_str(path->btree_id)); 373 return -BCH_ERR_ENOMEM_btree_key_cache_create; 374 } 375 } 376 377 ck->c.level = 0; 378 ck->c.btree_id = path->btree_id; 379 ck->key.btree_id = path->btree_id; 380 ck->key.pos = path->pos; 381 ck->flags = 1U << BKEY_CACHED_ACCESSED; 382 383 if (unlikely(key_u64s > ck->u64s)) { 384 mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); 385 386 struct bkey_i *new_k = allocate_dropping_locks(trans, ret, 387 kmalloc(key_u64s * sizeof(u64), _gfp)); 388 if (unlikely(!new_k)) { 389 bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", 390 bch2_btree_id_str(ck->key.btree_id), key_u64s); 391 ret = -BCH_ERR_ENOMEM_btree_key_cache_fill; 392 } else if (ret) { 393 kfree(new_k); 394 goto err; 395 } 396 397 kfree(ck->k); 398 ck->k = new_k; 399 ck->u64s = key_u64s; 400 } 401 402 bkey_reassemble(ck->k, k); 403 404 ret = rhashtable_lookup_insert_fast(&bc->table, &ck->hash, bch2_btree_key_cache_params); 405 if (unlikely(ret)) /* raced with another fill? */ 406 goto err; 407 408 atomic_long_inc(&bc->nr_keys); 409 six_unlock_write(&ck->c.lock); 410 411 enum six_lock_type lock_want = __btree_lock_want(path, 0); 412 if (lock_want == SIX_LOCK_read) 413 six_lock_downgrade(&ck->c.lock); 414 btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want); 415 path->uptodate = BTREE_ITER_UPTODATE; 416 return 0; 417 err: 418 bkey_cached_free_fast(bc, ck); 419 mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); 420 421 return ret; 422 } 423 424 static noinline int btree_key_cache_fill(struct btree_trans *trans, 425 struct btree_path *ck_path, 426 unsigned flags) 427 { 428 if (flags & BTREE_ITER_cached_nofill) { 429 ck_path->uptodate = BTREE_ITER_UPTODATE; 430 return 0; 431 } 432 433 struct bch_fs *c = trans->c; 434 struct btree_iter iter; 435 struct bkey_s_c k; 436 int ret; 437 438 bch2_trans_iter_init(trans, &iter, ck_path->btree_id, ck_path->pos, 439 BTREE_ITER_key_cache_fill| 440 BTREE_ITER_cached_nofill); 441 iter.flags &= ~BTREE_ITER_with_journal; 442 k = bch2_btree_iter_peek_slot(&iter); 443 ret = bkey_err(k); 444 if (ret) 445 goto err; 446 447 /* Recheck after btree lookup, before allocating: */ 448 ret = bch2_btree_key_cache_find(c, ck_path->btree_id, ck_path->pos) ? -EEXIST : 0; 449 if (unlikely(ret)) 450 goto out; 451 452 ret = btree_key_cache_create(trans, ck_path, k); 453 if (ret) 454 goto err; 455 out: 456 /* We're not likely to need this iterator again: */ 457 bch2_set_btree_iter_dontneed(&iter); 458 err: 459 bch2_trans_iter_exit(trans, &iter); 460 return ret; 461 } 462 463 static inline int btree_path_traverse_cached_fast(struct btree_trans *trans, 464 struct btree_path *path) 465 { 466 struct bch_fs *c = trans->c; 467 struct bkey_cached *ck; 468 retry: 469 ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); 470 if (!ck) 471 return -ENOENT; 472 473 enum six_lock_type lock_want = __btree_lock_want(path, 0); 474 475 int ret = btree_node_lock(trans, path, (void *) ck, 0, lock_want, _THIS_IP_); 476 if (ret) 477 return ret; 478 479 if (ck->key.btree_id != path->btree_id || 480 !bpos_eq(ck->key.pos, path->pos)) { 481 six_unlock_type(&ck->c.lock, lock_want); 482 goto retry; 483 } 484 485 if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) 486 set_bit(BKEY_CACHED_ACCESSED, &ck->flags); 487 488 btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want); 489 path->uptodate = BTREE_ITER_UPTODATE; 490 return 0; 491 } 492 493 int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path, 494 unsigned flags) 495 { 496 EBUG_ON(path->level); 497 498 path->l[1].b = NULL; 499 500 int ret; 501 do { 502 ret = btree_path_traverse_cached_fast(trans, path); 503 if (unlikely(ret == -ENOENT)) 504 ret = btree_key_cache_fill(trans, path, flags); 505 } while (ret == -EEXIST); 506 507 if (unlikely(ret)) { 508 path->uptodate = BTREE_ITER_NEED_TRAVERSE; 509 if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) { 510 btree_node_unlock(trans, path, 0); 511 path->l[0].b = ERR_PTR(ret); 512 } 513 } 514 return ret; 515 } 516 517 static int btree_key_cache_flush_pos(struct btree_trans *trans, 518 struct bkey_cached_key key, 519 u64 journal_seq, 520 unsigned commit_flags, 521 bool evict) 522 { 523 struct bch_fs *c = trans->c; 524 struct journal *j = &c->journal; 525 struct btree_iter c_iter, b_iter; 526 struct bkey_cached *ck = NULL; 527 int ret; 528 529 bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos, 530 BTREE_ITER_slots| 531 BTREE_ITER_intent| 532 BTREE_ITER_all_snapshots); 533 bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos, 534 BTREE_ITER_cached| 535 BTREE_ITER_intent); 536 b_iter.flags &= ~BTREE_ITER_with_key_cache; 537 538 ret = bch2_btree_iter_traverse(&c_iter); 539 if (ret) 540 goto out; 541 542 ck = (void *) btree_iter_path(trans, &c_iter)->l[0].b; 543 if (!ck) 544 goto out; 545 546 if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 547 if (evict) 548 goto evict; 549 goto out; 550 } 551 552 if (journal_seq && ck->journal.seq != journal_seq) 553 goto out; 554 555 trans->journal_res.seq = ck->journal.seq; 556 557 /* 558 * If we're at the end of the journal, we really want to free up space 559 * in the journal right away - we don't want to pin that old journal 560 * sequence number with a new btree node write, we want to re-journal 561 * the update 562 */ 563 if (ck->journal.seq == journal_last_seq(j)) 564 commit_flags |= BCH_WATERMARK_reclaim; 565 566 if (ck->journal.seq != journal_last_seq(j) || 567 !test_bit(JOURNAL_space_low, &c->journal.flags)) 568 commit_flags |= BCH_TRANS_COMMIT_no_journal_res; 569 570 ret = bch2_btree_iter_traverse(&b_iter) ?: 571 bch2_trans_update(trans, &b_iter, ck->k, 572 BTREE_UPDATE_key_cache_reclaim| 573 BTREE_UPDATE_internal_snapshot_node| 574 BTREE_TRIGGER_norun) ?: 575 bch2_trans_commit(trans, NULL, NULL, 576 BCH_TRANS_COMMIT_no_check_rw| 577 BCH_TRANS_COMMIT_no_enospc| 578 commit_flags); 579 580 bch2_fs_fatal_err_on(ret && 581 !bch2_err_matches(ret, BCH_ERR_transaction_restart) && 582 !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) && 583 !bch2_journal_error(j), c, 584 "flushing key cache: %s", bch2_err_str(ret)); 585 if (ret) 586 goto out; 587 588 bch2_journal_pin_drop(j, &ck->journal); 589 590 struct btree_path *path = btree_iter_path(trans, &c_iter); 591 BUG_ON(!btree_node_locked(path, 0)); 592 593 if (!evict) { 594 if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 595 clear_bit(BKEY_CACHED_DIRTY, &ck->flags); 596 atomic_long_dec(&c->btree_key_cache.nr_dirty); 597 } 598 } else { 599 struct btree_path *path2; 600 unsigned i; 601 evict: 602 trans_for_each_path(trans, path2, i) 603 if (path2 != path) 604 __bch2_btree_path_unlock(trans, path2); 605 606 bch2_btree_node_lock_write_nofail(trans, path, &ck->c); 607 608 if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 609 clear_bit(BKEY_CACHED_DIRTY, &ck->flags); 610 atomic_long_dec(&c->btree_key_cache.nr_dirty); 611 } 612 613 mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); 614 bkey_cached_evict(&c->btree_key_cache, ck); 615 bkey_cached_free_fast(&c->btree_key_cache, ck); 616 } 617 out: 618 bch2_trans_iter_exit(trans, &b_iter); 619 bch2_trans_iter_exit(trans, &c_iter); 620 return ret; 621 } 622 623 int bch2_btree_key_cache_journal_flush(struct journal *j, 624 struct journal_entry_pin *pin, u64 seq) 625 { 626 struct bch_fs *c = container_of(j, struct bch_fs, journal); 627 struct bkey_cached *ck = 628 container_of(pin, struct bkey_cached, journal); 629 struct bkey_cached_key key; 630 struct btree_trans *trans = bch2_trans_get(c); 631 int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); 632 int ret = 0; 633 634 btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read); 635 key = ck->key; 636 637 if (ck->journal.seq != seq || 638 !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 639 six_unlock_read(&ck->c.lock); 640 goto unlock; 641 } 642 643 if (ck->seq != seq) { 644 bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal, 645 bch2_btree_key_cache_journal_flush); 646 six_unlock_read(&ck->c.lock); 647 goto unlock; 648 } 649 six_unlock_read(&ck->c.lock); 650 651 ret = lockrestart_do(trans, 652 btree_key_cache_flush_pos(trans, key, seq, 653 BCH_TRANS_COMMIT_journal_reclaim, false)); 654 unlock: 655 srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); 656 657 bch2_trans_put(trans); 658 return ret; 659 } 660 661 bool bch2_btree_insert_key_cached(struct btree_trans *trans, 662 unsigned flags, 663 struct btree_insert_entry *insert_entry) 664 { 665 struct bch_fs *c = trans->c; 666 struct bkey_cached *ck = (void *) (trans->paths + insert_entry->path)->l[0].b; 667 struct bkey_i *insert = insert_entry->k; 668 bool kick_reclaim = false; 669 670 BUG_ON(insert->k.u64s > ck->u64s); 671 672 bkey_copy(ck->k, insert); 673 674 if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 675 EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); 676 set_bit(BKEY_CACHED_DIRTY, &ck->flags); 677 atomic_long_inc(&c->btree_key_cache.nr_dirty); 678 679 if (bch2_nr_btree_keys_need_flush(c)) 680 kick_reclaim = true; 681 } 682 683 /* 684 * To minimize lock contention, we only add the journal pin here and 685 * defer pin updates to the flush callback via ->seq. Be careful not to 686 * update ->seq on nojournal commits because we don't want to update the 687 * pin to a seq that doesn't include journal updates on disk. Otherwise 688 * we risk losing the update after a crash. 689 * 690 * The only exception is if the pin is not active in the first place. We 691 * have to add the pin because journal reclaim drives key cache 692 * flushing. The flush callback will not proceed unless ->seq matches 693 * the latest pin, so make sure it starts with a consistent value. 694 */ 695 if (!(insert_entry->flags & BTREE_UPDATE_nojournal) || 696 !journal_pin_active(&ck->journal)) { 697 ck->seq = trans->journal_res.seq; 698 } 699 bch2_journal_pin_add(&c->journal, trans->journal_res.seq, 700 &ck->journal, bch2_btree_key_cache_journal_flush); 701 702 if (kick_reclaim) 703 journal_reclaim_kick(&c->journal); 704 return true; 705 } 706 707 void bch2_btree_key_cache_drop(struct btree_trans *trans, 708 struct btree_path *path) 709 { 710 struct bch_fs *c = trans->c; 711 struct btree_key_cache *bc = &c->btree_key_cache; 712 struct bkey_cached *ck = (void *) path->l[0].b; 713 714 /* 715 * We just did an update to the btree, bypassing the key cache: the key 716 * cache key is now stale and must be dropped, even if dirty: 717 */ 718 if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 719 clear_bit(BKEY_CACHED_DIRTY, &ck->flags); 720 atomic_long_dec(&c->btree_key_cache.nr_dirty); 721 bch2_journal_pin_drop(&c->journal, &ck->journal); 722 } 723 724 bkey_cached_evict(bc, ck); 725 bkey_cached_free_fast(bc, ck); 726 727 mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); 728 btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); 729 path->should_be_locked = false; 730 } 731 732 static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, 733 struct shrink_control *sc) 734 { 735 struct bch_fs *c = shrink->private_data; 736 struct btree_key_cache *bc = &c->btree_key_cache; 737 struct bucket_table *tbl; 738 struct bkey_cached *ck, *t; 739 size_t scanned = 0, freed = 0, nr = sc->nr_to_scan; 740 unsigned start, flags; 741 int srcu_idx; 742 743 mutex_lock(&bc->lock); 744 bc->requested_to_free += sc->nr_to_scan; 745 746 srcu_idx = srcu_read_lock(&c->btree_trans_barrier); 747 flags = memalloc_nofs_save(); 748 749 /* 750 * Newest freed entries are at the end of the list - once we hit one 751 * that's too new to be freed, we can bail out: 752 */ 753 list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) { 754 if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, 755 ck->btree_trans_barrier_seq)) 756 break; 757 758 list_del(&ck->list); 759 six_lock_exit(&ck->c.lock); 760 kmem_cache_free(bch2_key_cache, ck); 761 atomic_long_dec(&bc->nr_freed); 762 bc->nr_freed_nonpcpu--; 763 bc->freed++; 764 } 765 766 list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) { 767 if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, 768 ck->btree_trans_barrier_seq)) 769 break; 770 771 list_del(&ck->list); 772 six_lock_exit(&ck->c.lock); 773 kmem_cache_free(bch2_key_cache, ck); 774 atomic_long_dec(&bc->nr_freed); 775 bc->nr_freed_pcpu--; 776 bc->freed++; 777 } 778 779 rcu_read_lock(); 780 tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); 781 782 /* 783 * Scanning is expensive while a rehash is in progress - most elements 784 * will be on the new hashtable, if it's in progress 785 * 786 * A rehash could still start while we're scanning - that's ok, we'll 787 * still see most elements. 788 */ 789 if (unlikely(tbl->nest)) { 790 rcu_read_unlock(); 791 srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); 792 return SHRINK_STOP; 793 } 794 795 if (bc->shrink_iter >= tbl->size) 796 bc->shrink_iter = 0; 797 start = bc->shrink_iter; 798 799 do { 800 struct rhash_head *pos, *next; 801 802 pos = rht_ptr_rcu(&tbl->buckets[bc->shrink_iter]); 803 804 while (!rht_is_a_nulls(pos)) { 805 next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter); 806 ck = container_of(pos, struct bkey_cached, hash); 807 808 if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 809 bc->skipped_dirty++; 810 } else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) { 811 clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); 812 bc->skipped_accessed++; 813 } else if (!bkey_cached_lock_for_evict(ck)) { 814 bc->skipped_lock_fail++; 815 } else { 816 bkey_cached_evict(bc, ck); 817 bkey_cached_free(bc, ck); 818 bc->moved_to_freelist++; 819 freed++; 820 } 821 822 scanned++; 823 if (scanned >= nr) 824 break; 825 826 pos = next; 827 } 828 829 bc->shrink_iter++; 830 if (bc->shrink_iter >= tbl->size) 831 bc->shrink_iter = 0; 832 } while (scanned < nr && bc->shrink_iter != start); 833 834 rcu_read_unlock(); 835 memalloc_nofs_restore(flags); 836 srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); 837 mutex_unlock(&bc->lock); 838 839 return freed; 840 } 841 842 static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink, 843 struct shrink_control *sc) 844 { 845 struct bch_fs *c = shrink->private_data; 846 struct btree_key_cache *bc = &c->btree_key_cache; 847 long nr = atomic_long_read(&bc->nr_keys) - 848 atomic_long_read(&bc->nr_dirty); 849 850 /* 851 * Avoid hammering our shrinker too much if it's nearly empty - the 852 * shrinker code doesn't take into account how big our cache is, if it's 853 * mostly empty but the system is under memory pressure it causes nasty 854 * lock contention: 855 */ 856 nr -= 128; 857 858 return max(0L, nr); 859 } 860 861 void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) 862 { 863 struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); 864 struct bucket_table *tbl; 865 struct bkey_cached *ck, *n; 866 struct rhash_head *pos; 867 LIST_HEAD(items); 868 unsigned i; 869 #ifdef __KERNEL__ 870 int cpu; 871 #endif 872 873 shrinker_free(bc->shrink); 874 875 mutex_lock(&bc->lock); 876 877 /* 878 * The loop is needed to guard against racing with rehash: 879 */ 880 while (atomic_long_read(&bc->nr_keys)) { 881 rcu_read_lock(); 882 tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); 883 if (tbl) { 884 if (tbl->nest) { 885 /* wait for in progress rehash */ 886 rcu_read_unlock(); 887 mutex_lock(&bc->table.mutex); 888 mutex_unlock(&bc->table.mutex); 889 rcu_read_lock(); 890 continue; 891 } 892 for (i = 0; i < tbl->size; i++) 893 while (pos = rht_ptr_rcu(&tbl->buckets[i]), !rht_is_a_nulls(pos)) { 894 ck = container_of(pos, struct bkey_cached, hash); 895 bkey_cached_evict(bc, ck); 896 list_add(&ck->list, &items); 897 } 898 } 899 rcu_read_unlock(); 900 } 901 902 #ifdef __KERNEL__ 903 if (bc->pcpu_freed) { 904 for_each_possible_cpu(cpu) { 905 struct btree_key_cache_freelist *f = 906 per_cpu_ptr(bc->pcpu_freed, cpu); 907 908 for (i = 0; i < f->nr; i++) { 909 ck = f->objs[i]; 910 list_add(&ck->list, &items); 911 } 912 } 913 } 914 #endif 915 916 BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu); 917 BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu); 918 919 list_splice(&bc->freed_pcpu, &items); 920 list_splice(&bc->freed_nonpcpu, &items); 921 922 mutex_unlock(&bc->lock); 923 924 list_for_each_entry_safe(ck, n, &items, list) { 925 cond_resched(); 926 927 list_del(&ck->list); 928 kfree(ck->k); 929 six_lock_exit(&ck->c.lock); 930 kmem_cache_free(bch2_key_cache, ck); 931 } 932 933 if (atomic_long_read(&bc->nr_dirty) && 934 !bch2_journal_error(&c->journal) && 935 test_bit(BCH_FS_was_rw, &c->flags)) 936 panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n", 937 atomic_long_read(&bc->nr_dirty)); 938 939 if (atomic_long_read(&bc->nr_keys)) 940 panic("btree key cache shutdown error: nr_keys nonzero (%li)\n", 941 atomic_long_read(&bc->nr_keys)); 942 943 if (bc->table_init_done) 944 rhashtable_destroy(&bc->table); 945 946 free_percpu(bc->pcpu_freed); 947 } 948 949 void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) 950 { 951 mutex_init(&c->lock); 952 INIT_LIST_HEAD(&c->freed_pcpu); 953 INIT_LIST_HEAD(&c->freed_nonpcpu); 954 } 955 956 int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) 957 { 958 struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); 959 struct shrinker *shrink; 960 961 #ifdef __KERNEL__ 962 bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist); 963 if (!bc->pcpu_freed) 964 return -BCH_ERR_ENOMEM_fs_btree_cache_init; 965 #endif 966 967 if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params)) 968 return -BCH_ERR_ENOMEM_fs_btree_cache_init; 969 970 bc->table_init_done = true; 971 972 shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name); 973 if (!shrink) 974 return -BCH_ERR_ENOMEM_fs_btree_cache_init; 975 bc->shrink = shrink; 976 shrink->count_objects = bch2_btree_key_cache_count; 977 shrink->scan_objects = bch2_btree_key_cache_scan; 978 shrink->batch = 1 << 14; 979 shrink->seeks = 0; 980 shrink->private_data = c; 981 shrinker_register(shrink); 982 return 0; 983 } 984 985 void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc) 986 { 987 struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); 988 989 printbuf_tabstop_push(out, 24); 990 printbuf_tabstop_push(out, 12); 991 992 unsigned flags = memalloc_nofs_save(); 993 mutex_lock(&bc->lock); 994 prt_printf(out, "keys:\t%lu\r\n", atomic_long_read(&bc->nr_keys)); 995 prt_printf(out, "dirty:\t%lu\r\n", atomic_long_read(&bc->nr_dirty)); 996 prt_printf(out, "freelist:\t%lu\r\n", atomic_long_read(&bc->nr_freed)); 997 prt_printf(out, "nonpcpu freelist:\t%zu\r\n", bc->nr_freed_nonpcpu); 998 prt_printf(out, "pcpu freelist:\t%zu\r\n", bc->nr_freed_pcpu); 999 1000 prt_printf(out, "\nshrinker:\n"); 1001 prt_printf(out, "requested_to_free:\t%lu\r\n", bc->requested_to_free); 1002 prt_printf(out, "freed:\t%lu\r\n", bc->freed); 1003 prt_printf(out, "moved_to_freelist:\t%lu\r\n", bc->moved_to_freelist); 1004 prt_printf(out, "skipped_dirty:\t%lu\r\n", bc->skipped_dirty); 1005 prt_printf(out, "skipped_accessed:\t%lu\r\n", bc->skipped_accessed); 1006 prt_printf(out, "skipped_lock_fail:\t%lu\r\n", bc->skipped_lock_fail); 1007 1008 prt_printf(out, "srcu seq:\t%lu\r\n", get_state_synchronize_srcu(&c->btree_trans_barrier)); 1009 1010 struct bkey_cached *ck; 1011 unsigned iter = 0; 1012 list_for_each_entry(ck, &bc->freed_nonpcpu, list) { 1013 prt_printf(out, "freed_nonpcpu:\t%lu\r\n", ck->btree_trans_barrier_seq); 1014 if (++iter > 10) 1015 break; 1016 } 1017 1018 iter = 0; 1019 list_for_each_entry(ck, &bc->freed_pcpu, list) { 1020 prt_printf(out, "freed_pcpu:\t%lu\r\n", ck->btree_trans_barrier_seq); 1021 if (++iter > 10) 1022 break; 1023 } 1024 mutex_unlock(&bc->lock); 1025 memalloc_flags_restore(flags); 1026 } 1027 1028 void bch2_btree_key_cache_exit(void) 1029 { 1030 kmem_cache_destroy(bch2_key_cache); 1031 } 1032 1033 int __init bch2_btree_key_cache_init(void) 1034 { 1035 bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT); 1036 if (!bch2_key_cache) 1037 return -ENOMEM; 1038 1039 return 0; 1040 } 1041