// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "trace.h"

#include <linux/sched/mm.h>

static inline bool btree_uses_pcpu_readers(enum btree_id id)
{
	return id == BTREE_ID_subvolumes;
}

static struct kmem_cache *bch2_key_cache;

static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
				       const void *obj)
{
	const struct bkey_cached *ck = obj;
	const struct bkey_cached_key *key = arg->key;

	return ck->key.btree_id != key->btree_id ||
		!bpos_eq(ck->key.pos, key->pos);
}

static const struct rhashtable_params bch2_btree_key_cache_params = {
	.head_offset	= offsetof(struct bkey_cached, hash),
	.key_offset	= offsetof(struct bkey_cached, key),
	.key_len	= sizeof(struct bkey_cached_key),
	.obj_cmpfn	= bch2_btree_key_cache_cmp_fn,
};

__flatten
inline struct bkey_cached *
bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
{
	struct bkey_cached_key key = {
		.btree_id	= btree_id,
		.pos		= pos,
	};

	return rhashtable_lookup_fast(&c->btree_key_cache.table, &key,
				      bch2_btree_key_cache_params);
}

static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
{
	if (!six_trylock_intent(&ck->c.lock))
		return false;

	if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		six_unlock_intent(&ck->c.lock);
		return false;
	}

	if (!six_trylock_write(&ck->c.lock)) {
		six_unlock_intent(&ck->c.lock);
		return false;
	}

	return true;
}

static void bkey_cached_evict(struct btree_key_cache *c,
			      struct bkey_cached *ck)
{
	BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
				      bch2_btree_key_cache_params));
	memset(&ck->key, ~0, sizeof(ck->key));

	atomic_long_dec(&c->nr_keys);
}

static void bkey_cached_free(struct btree_key_cache *bc,
			     struct bkey_cached *ck)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);

	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));

	ck->btree_trans_barrier_seq =
		start_poll_synchronize_srcu(&c->btree_trans_barrier);

	if (ck->c.lock.readers) {
		list_move_tail(&ck->list, &bc->freed_pcpu);
		bc->nr_freed_pcpu++;
	} else {
		list_move_tail(&ck->list, &bc->freed_nonpcpu);
		bc->nr_freed_nonpcpu++;
	}
	atomic_long_inc(&bc->nr_freed);

	kfree(ck->k);
	ck->k		= NULL;
	ck->u64s	= 0;

	six_unlock_write(&ck->c.lock);
	six_unlock_intent(&ck->c.lock);
}

#ifdef __KERNEL__
static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
						   struct bkey_cached *ck)
{
	struct bkey_cached *pos;

	bc->nr_freed_nonpcpu++;

	list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
		if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
				 pos->btree_trans_barrier_seq)) {
			list_move(&ck->list, &pos->list);
			return;
		}
	}

	list_move(&ck->list, &bc->freed_nonpcpu);
}
#endif

static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
					 struct bkey_cached *ck)
{
	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));

	if (!ck->c.lock.readers) {
#ifdef __KERNEL__
		struct btree_key_cache_freelist *f;
		bool freed = false;

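		/* Fast path: try to stash the entry on this CPU's freelist: */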
		preempt_disable();
		f = this_cpu_ptr(bc->pcpu_freed);

		if (f->nr < ARRAY_SIZE(f->objs)) {
			f->objs[f->nr++] = ck;
			freed = true;
		}
		preempt_enable();

		if (!freed) {
			mutex_lock(&bc->lock);
			preempt_disable();
			f = this_cpu_ptr(bc->pcpu_freed);

			while (f->nr > ARRAY_SIZE(f->objs) / 2) {
				struct bkey_cached *ck2 = f->objs[--f->nr];

				__bkey_cached_move_to_freelist_ordered(bc, ck2);
			}
			preempt_enable();

			__bkey_cached_move_to_freelist_ordered(bc, ck);
			mutex_unlock(&bc->lock);
		}
#else
		mutex_lock(&bc->lock);
		list_move_tail(&ck->list, &bc->freed_nonpcpu);
		bc->nr_freed_nonpcpu++;
		mutex_unlock(&bc->lock);
#endif
	} else {
		mutex_lock(&bc->lock);
		list_move_tail(&ck->list, &bc->freed_pcpu);
		mutex_unlock(&bc->lock);
	}
}

static void bkey_cached_free_fast(struct btree_key_cache *bc,
				  struct bkey_cached *ck)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);

	ck->btree_trans_barrier_seq =
		start_poll_synchronize_srcu(&c->btree_trans_barrier);

	list_del_init(&ck->list);
	atomic_long_inc(&bc->nr_freed);

	kfree(ck->k);
	ck->k		= NULL;
	ck->u64s	= 0;

	bkey_cached_move_to_freelist(bc, ck);

	six_unlock_write(&ck->c.lock);
	six_unlock_intent(&ck->c.lock);
}

static struct bkey_cached *
bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
		  bool *was_new)
{
	struct bch_fs *c = trans->c;
	struct btree_key_cache *bc = &c->btree_key_cache;
	struct bkey_cached *ck = NULL;
	bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
	int ret;

	if (!pcpu_readers) {
#ifdef __KERNEL__
		struct btree_key_cache_freelist *f;

		preempt_disable();
		f = this_cpu_ptr(bc->pcpu_freed);
		if (f->nr)
			ck = f->objs[--f->nr];
		preempt_enable();

		if (!ck) {
			mutex_lock(&bc->lock);
			preempt_disable();
			f = this_cpu_ptr(bc->pcpu_freed);

			while (!list_empty(&bc->freed_nonpcpu) &&
			       f->nr < ARRAY_SIZE(f->objs) / 2) {
				ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
				list_del_init(&ck->list);
				bc->nr_freed_nonpcpu--;
				f->objs[f->nr++] = ck;
			}

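			/* Take an entry from the (possibly just refilled) per-CPU freelist: */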
			ck = f->nr ? f->objs[--f->nr] : NULL;
			preempt_enable();
			mutex_unlock(&bc->lock);
		}
#else
		mutex_lock(&bc->lock);
		if (!list_empty(&bc->freed_nonpcpu)) {
			ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
			list_del_init(&ck->list);
			bc->nr_freed_nonpcpu--;
		}
		mutex_unlock(&bc->lock);
#endif
	} else {
		mutex_lock(&bc->lock);
		if (!list_empty(&bc->freed_pcpu)) {
			ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
			list_del_init(&ck->list);
		}
		mutex_unlock(&bc->lock);
	}

	if (ck) {
		ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_);
		if (unlikely(ret)) {
			bkey_cached_move_to_freelist(bc, ck);
			return ERR_PTR(ret);
		}

		path->l[0].b = (void *) ck;
		path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);

		ret = bch2_btree_node_lock_write(trans, path, &ck->c);
		if (unlikely(ret)) {
			btree_node_unlock(trans, path, 0);
			bkey_cached_move_to_freelist(bc, ck);
			return ERR_PTR(ret);
		}

		return ck;
	}

	ck = allocate_dropping_locks(trans, ret,
				     kmem_cache_zalloc(bch2_key_cache, _gfp));
	if (ret) {
		kmem_cache_free(bch2_key_cache, ck);
		return ERR_PTR(ret);
	}

	if (!ck)
		return NULL;

	INIT_LIST_HEAD(&ck->list);
	bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);

	ck->c.cached = true;
	BUG_ON(!six_trylock_intent(&ck->c.lock));
	BUG_ON(!six_trylock_write(&ck->c.lock));
	*was_new = true;
	return ck;
}

static struct bkey_cached *
bkey_cached_reuse(struct btree_key_cache *c)
{
	struct bucket_table *tbl;
	struct rhash_head *pos;
	struct bkey_cached *ck;
	unsigned i;

	mutex_lock(&c->lock);
	rcu_read_lock();
	tbl = rht_dereference_rcu(c->table.tbl, &c->table);
	for (i = 0; i < tbl->size; i++)
		rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
			if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
			    bkey_cached_lock_for_evict(ck)) {
				bkey_cached_evict(c, ck);
				goto out;
			}
		}
	ck = NULL;
out:
	rcu_read_unlock();
	mutex_unlock(&c->lock);
	return ck;
}

static struct bkey_cached *
btree_key_cache_create(struct btree_trans *trans, struct btree_path *path)
{
	struct bch_fs *c = trans->c;
	struct btree_key_cache *bc = &c->btree_key_cache;
	struct bkey_cached *ck;
	bool was_new = false;

	ck = bkey_cached_alloc(trans, path, &was_new);
	if (IS_ERR(ck))
		return ck;

	if (unlikely(!ck)) {
		ck = bkey_cached_reuse(bc);
		if (unlikely(!ck)) {
			bch_err(c, "error allocating memory for key cache item, btree %s",
				bch2_btree_id_str(path->btree_id));
			return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create);
		}

		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
	}

	ck->c.level		= 0;
	ck->c.btree_id		= path->btree_id;
	ck->key.btree_id	= path->btree_id;
	ck->key.pos		= path->pos;
	ck->valid		= false;
	ck->flags		= 1U << BKEY_CACHED_ACCESSED;

	if (unlikely(rhashtable_lookup_insert_fast(&bc->table,
						   &ck->hash,
						   bch2_btree_key_cache_params))) {
		/* We raced with another fill: */

		if (likely(was_new)) {
			six_unlock_write(&ck->c.lock);
			six_unlock_intent(&ck->c.lock);
			kfree(ck);
		} else {
			bkey_cached_free_fast(bc, ck);
		}

		mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
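		/*
		 * Returning NULL tells the caller to retry the lookup and use
		 * the entry that won the race:
		 */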
		return NULL;
	}

	atomic_long_inc(&bc->nr_keys);

	six_unlock_write(&ck->c.lock);

	return ck;
}

static int btree_key_cache_fill(struct btree_trans *trans,
				struct btree_path *ck_path,
				struct bkey_cached *ck)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	unsigned new_u64s = 0;
	struct bkey_i *new_k = NULL;
	int ret;

	k = bch2_bkey_get_iter(trans, &iter, ck->key.btree_id, ck->key.pos,
			       BTREE_ITER_KEY_CACHE_FILL|
			       BTREE_ITER_CACHED_NOFILL);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (!bch2_btree_node_relock(trans, ck_path, 0)) {
		trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
		ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
		goto err;
	}

	/*
	 * bch2_varint_decode can read past the end of the buffer by at
	 * most 7 bytes (it won't be used):
	 */
	new_u64s = k.k->u64s + 1;

	/*
	 * Allocate some extra space so that the transaction commit path is less
	 * likely to have to reallocate, since that requires a transaction
	 * restart:
	 */
	new_u64s = min(256U, (new_u64s * 3) / 2);

	if (new_u64s > ck->u64s) {
		new_u64s = roundup_pow_of_two(new_u64s);
		new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
		if (!new_k) {
			bch2_trans_unlock(trans);

			new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
			if (!new_k) {
				bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
					bch2_btree_id_str(ck->key.btree_id), new_u64s);
				ret = -BCH_ERR_ENOMEM_btree_key_cache_fill;
				goto err;
			}

			if (!bch2_btree_node_relock(trans, ck_path, 0)) {
				kfree(new_k);
				trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
				ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
				goto err;
			}

			ret = bch2_trans_relock(trans);
			if (ret) {
				kfree(new_k);
				goto err;
			}
		}
	}

	ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c);
	if (ret) {
		kfree(new_k);
		goto err;
	}

	if (new_k) {
		kfree(ck->k);
		ck->u64s = new_u64s;
		ck->k = new_k;
	}

	bkey_reassemble(ck->k, k);
	ck->valid = true;
	bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b);

	/* We're not likely to need this iterator again: */
	set_btree_iter_dontneed(&iter);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static noinline int
bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path,
					 unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck;
	int ret = 0;

	BUG_ON(path->level);

	path->l[1].b = NULL;

	if (bch2_btree_node_relock_notrace(trans, path, 0)) {
		ck = (void *) path->l[0].b;
		goto fill;
	}
retry:
	ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
	if (!ck) {
		ck = btree_key_cache_create(trans, path);
		ret = PTR_ERR_OR_ZERO(ck);
		if (ret)
			goto err;
		if (!ck)
			goto retry;

		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
		path->locks_want = 1;
	} else {
		enum six_lock_type lock_want = __btree_lock_want(path, 0);

		ret = btree_node_lock(trans, path, (void *) ck, 0,
				      lock_want, _THIS_IP_);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto err;

		BUG_ON(ret);

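		/*
		 * The entry may have been freed and reused for a different key
		 * while we waited for the lock - check that it still matches:
		 */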
		if (ck->key.btree_id != path->btree_id ||
		    !bpos_eq(ck->key.pos, path->pos)) {
			six_unlock_type(&ck->c.lock, lock_want);
			goto retry;
		}

		mark_btree_node_locked(trans, path, 0,
				       (enum btree_node_locked_type) lock_want);
	}

	path->l[0].lock_seq	= six_lock_seq(&ck->c.lock);
	path->l[0].b		= (void *) ck;
fill:
	path->uptodate = BTREE_ITER_UPTODATE;

	if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) {
		/*
		 * Using the underscore version because we haven't set
		 * path->uptodate yet:
		 */
		if (!path->locks_want &&
		    !__bch2_btree_path_upgrade(trans, path, 1, NULL)) {
			trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_);
			ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade);
			goto err;
		}

		ret = btree_key_cache_fill(trans, path, ck);
		if (ret)
			goto err;

		ret = bch2_btree_path_relock(trans, path, _THIS_IP_);
		if (ret)
			goto err;

		path->uptodate = BTREE_ITER_UPTODATE;
	}

	if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
		set_bit(BKEY_CACHED_ACCESSED, &ck->flags);

	BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
	BUG_ON(path->uptodate);

	return ret;
err:
	path->uptodate = BTREE_ITER_NEED_TRAVERSE;
	if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
		btree_node_unlock(trans, path, 0);
		path->l[0].b = ERR_PTR(ret);
	}
	return ret;
}

int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
				    unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck;
	int ret = 0;

	EBUG_ON(path->level);

	path->l[1].b = NULL;

	if (bch2_btree_node_relock_notrace(trans, path, 0)) {
		ck = (void *) path->l[0].b;
		goto fill;
	}
retry:
	ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
	if (!ck) {
		return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
	} else {
		enum six_lock_type lock_want = __btree_lock_want(path, 0);

		ret = btree_node_lock(trans, path, (void *) ck, 0,
				      lock_want, _THIS_IP_);
		EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart));

		if (ret)
			return ret;

		if (ck->key.btree_id != path->btree_id ||
		    !bpos_eq(ck->key.pos, path->pos)) {
			six_unlock_type(&ck->c.lock, lock_want);
			goto retry;
		}

		mark_btree_node_locked(trans, path, 0,
				       (enum btree_node_locked_type) lock_want);
	}

	path->l[0].lock_seq	= six_lock_seq(&ck->c.lock);
	path->l[0].b		= (void *) ck;
fill:
	if (!ck->valid)
		return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);

	if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
		set_bit(BKEY_CACHED_ACCESSED, &ck->flags);

	path->uptodate = BTREE_ITER_UPTODATE;
	EBUG_ON(!ck->valid);
	EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));

	return ret;
}

static int btree_key_cache_flush_pos(struct btree_trans *trans,
				     struct bkey_cached_key key,
				     u64 journal_seq,
				     unsigned commit_flags,
				     bool evict)
{
	struct bch_fs *c = trans->c;
	struct journal *j = &c->journal;
	struct btree_iter c_iter, b_iter;
	struct bkey_cached *ck = NULL;
	int ret;

	bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
			     BTREE_ITER_SLOTS|
			     BTREE_ITER_INTENT|
			     BTREE_ITER_ALL_SNAPSHOTS);
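	/* b_iter is for the underlying btree; c_iter is for the cached key: */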
	bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
			     BTREE_ITER_CACHED|
			     BTREE_ITER_INTENT);
	b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE;

	ret = bch2_btree_iter_traverse(&c_iter);
	if (ret)
		goto out;

	ck = (void *) btree_iter_path(trans, &c_iter)->l[0].b;
	if (!ck)
		goto out;

	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		if (evict)
			goto evict;
		goto out;
	}

	BUG_ON(!ck->valid);

	if (journal_seq && ck->journal.seq != journal_seq)
		goto out;

	trans->journal_res.seq = ck->journal.seq;

	/*
	 * If we're at the end of the journal, we really want to free up space
	 * in the journal right away - we don't want to pin that old journal
	 * sequence number with a new btree node write, we want to re-journal
	 * the update
	 */
	if (ck->journal.seq == journal_last_seq(j))
		commit_flags |= BCH_WATERMARK_reclaim;

	if (ck->journal.seq != journal_last_seq(j) ||
	    j->watermark == BCH_WATERMARK_stripe)
		commit_flags |= BCH_TRANS_COMMIT_no_journal_res;

	ret = bch2_btree_iter_traverse(&b_iter) ?:
		bch2_trans_update(trans, &b_iter, ck->k,
				  BTREE_UPDATE_KEY_CACHE_RECLAIM|
				  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
				  BTREE_TRIGGER_NORUN) ?:
		bch2_trans_commit(trans, NULL, NULL,
				  BCH_TRANS_COMMIT_no_check_rw|
				  BCH_TRANS_COMMIT_no_enospc|
				  commit_flags);

	bch2_fs_fatal_err_on(ret &&
			     !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
			     !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
			     !bch2_journal_error(j), c,
			     "error flushing key cache: %s", bch2_err_str(ret));
	if (ret)
		goto out;

	bch2_journal_pin_drop(j, &ck->journal);

	struct btree_path *path = btree_iter_path(trans, &c_iter);
	BUG_ON(!btree_node_locked(path, 0));

	if (!evict) {
		if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
			clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
			atomic_long_dec(&c->btree_key_cache.nr_dirty);
		}
	} else {
		struct btree_path *path2;
		unsigned i;
evict:
		trans_for_each_path(trans, path2, i)
			if (path2 != path)
				__bch2_btree_path_unlock(trans, path2);

		bch2_btree_node_lock_write_nofail(trans, path, &ck->c);

		if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
			clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
			atomic_long_dec(&c->btree_key_cache.nr_dirty);
		}

		mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
		bkey_cached_evict(&c->btree_key_cache, ck);
		bkey_cached_free_fast(&c->btree_key_cache, ck);
	}
out:
	bch2_trans_iter_exit(trans, &b_iter);
	bch2_trans_iter_exit(trans, &c_iter);
	return ret;
}

int bch2_btree_key_cache_journal_flush(struct journal *j,
				       struct journal_entry_pin *pin, u64 seq)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bkey_cached *ck =
		container_of(pin, struct bkey_cached, journal);
	struct bkey_cached_key key;
	struct btree_trans *trans = bch2_trans_get(c);
	int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
	int ret = 0;

	btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read);
	key = ck->key;

	if (ck->journal.seq != seq ||
	    !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		six_unlock_read(&ck->c.lock);
		goto unlock;
	}

	if (ck->seq != seq) {
		bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal,
					bch2_btree_key_cache_journal_flush);
		six_unlock_read(&ck->c.lock);
		goto unlock;
	}
	six_unlock_read(&ck->c.lock);

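	/* Flush the cached key back to the btree, retrying on transaction restart: */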
	ret = lockrestart_do(trans,
		btree_key_cache_flush_pos(trans, key, seq,
				BCH_TRANS_COMMIT_journal_reclaim, false));
unlock:
	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);

	bch2_trans_put(trans);
	return ret;
}

bool bch2_btree_insert_key_cached(struct btree_trans *trans,
				  unsigned flags,
				  struct btree_insert_entry *insert_entry)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck = (void *) (trans->paths + insert_entry->path)->l[0].b;
	struct bkey_i *insert = insert_entry->k;
	bool kick_reclaim = false;

	BUG_ON(insert->k.u64s > ck->u64s);

	bkey_copy(ck->k, insert);
	ck->valid = true;

	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
		set_bit(BKEY_CACHED_DIRTY, &ck->flags);
		atomic_long_inc(&c->btree_key_cache.nr_dirty);

		if (bch2_nr_btree_keys_need_flush(c))
			kick_reclaim = true;
	}

	/*
	 * To minimize lock contention, we only add the journal pin here and
	 * defer pin updates to the flush callback via ->seq. Be careful not to
	 * update ->seq on nojournal commits because we don't want to update the
	 * pin to a seq that doesn't include journal updates on disk. Otherwise
	 * we risk losing the update after a crash.
	 *
	 * The only exception is if the pin is not active in the first place. We
	 * have to add the pin because journal reclaim drives key cache
	 * flushing. The flush callback will not proceed unless ->seq matches
	 * the latest pin, so make sure it starts with a consistent value.
	 */
	if (!(insert_entry->flags & BTREE_UPDATE_NOJOURNAL) ||
	    !journal_pin_active(&ck->journal)) {
		ck->seq = trans->journal_res.seq;
	}
	bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
			     &ck->journal, bch2_btree_key_cache_journal_flush);

	if (kick_reclaim)
		journal_reclaim_kick(&c->journal);
	return true;
}

void bch2_btree_key_cache_drop(struct btree_trans *trans,
			       struct btree_path *path)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck = (void *) path->l[0].b;

	BUG_ON(!ck->valid);

	/*
	 * We just did an update to the btree, bypassing the key cache: the key
	 * cache key is now stale and must be dropped, even if dirty:
	 */
	if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
		atomic_long_dec(&c->btree_key_cache.nr_dirty);
		bch2_journal_pin_drop(&c->journal, &ck->journal);
	}

	ck->valid = false;
}

static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
					       struct shrink_control *sc)
{
	struct bch_fs *c = shrink->private_data;
	struct btree_key_cache *bc = &c->btree_key_cache;
	struct bucket_table *tbl;
	struct bkey_cached *ck, *t;
	size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
	unsigned start, flags;
	int srcu_idx;

	mutex_lock(&bc->lock);
	srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
	flags = memalloc_nofs_save();

	/*
	 * Newest freed entries are at the end of the list - once we hit one
	 * that's too new to be freed, we can bail out:
	 */
	scanned += bc->nr_freed_nonpcpu;

	list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
		if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
						 ck->btree_trans_barrier_seq))
			break;

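		/*
		 * Grace period has elapsed: no btree_trans can still hold a
		 * reference, so this entry can be freed:
		 */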
		list_del(&ck->list);
		six_lock_exit(&ck->c.lock);
		kmem_cache_free(bch2_key_cache, ck);
		atomic_long_dec(&bc->nr_freed);
		freed++;
		bc->nr_freed_nonpcpu--;
	}

	if (scanned >= nr)
		goto out;

	scanned += bc->nr_freed_pcpu;

	list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
		if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
						 ck->btree_trans_barrier_seq))
			break;

		list_del(&ck->list);
		six_lock_exit(&ck->c.lock);
		kmem_cache_free(bch2_key_cache, ck);
		atomic_long_dec(&bc->nr_freed);
		freed++;
		bc->nr_freed_pcpu--;
	}

	if (scanned >= nr)
		goto out;

	rcu_read_lock();
	tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
	if (bc->shrink_iter >= tbl->size)
		bc->shrink_iter = 0;
	start = bc->shrink_iter;

	do {
		struct rhash_head *pos, *next;

		pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter));

		while (!rht_is_a_nulls(pos)) {
			next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
			ck = container_of(pos, struct bkey_cached, hash);

			if (test_bit(BKEY_CACHED_DIRTY, &ck->flags))
				goto next;

			if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
				clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
			else if (bkey_cached_lock_for_evict(ck)) {
				bkey_cached_evict(bc, ck);
				bkey_cached_free(bc, ck);
			}

			scanned++;
			if (scanned >= nr)
				break;
next:
			pos = next;
		}

		bc->shrink_iter++;
		if (bc->shrink_iter >= tbl->size)
			bc->shrink_iter = 0;
	} while (scanned < nr && bc->shrink_iter != start);

	rcu_read_unlock();
out:
	memalloc_nofs_restore(flags);
	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
	mutex_unlock(&bc->lock);

	return freed;
}

static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
						struct shrink_control *sc)
{
	struct bch_fs *c = shrink->private_data;
	struct btree_key_cache *bc = &c->btree_key_cache;
	long nr = atomic_long_read(&bc->nr_keys) -
		  atomic_long_read(&bc->nr_dirty);

	return max(0L, nr);
}

void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
	struct bucket_table *tbl;
	struct bkey_cached *ck, *n;
	struct rhash_head *pos;
	LIST_HEAD(items);
	unsigned i;
#ifdef __KERNEL__
	int cpu;
#endif

	shrinker_free(bc->shrink);

	mutex_lock(&bc->lock);

	/*
	 * The loop is needed to guard against racing with rehash:
	 */
	while (atomic_long_read(&bc->nr_keys)) {
		rcu_read_lock();
		tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
		if (tbl)
			for (i = 0; i < tbl->size; i++)
				rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
					bkey_cached_evict(bc, ck);
					list_add(&ck->list, &items);
				}
		rcu_read_unlock();
	}

#ifdef __KERNEL__
	for_each_possible_cpu(cpu) {
		struct btree_key_cache_freelist *f =
			per_cpu_ptr(bc->pcpu_freed, cpu);

		for (i = 0; i < f->nr; i++) {
			ck = f->objs[i];
			list_add(&ck->list, &items);
		}
	}
#endif

	BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu);
	BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu);

	list_splice(&bc->freed_pcpu,	&items);
	list_splice(&bc->freed_nonpcpu,	&items);

	mutex_unlock(&bc->lock);

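	/* Everything is now off the hash table and freelists; free it all: */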
	list_for_each_entry_safe(ck, n, &items, list) {
		cond_resched();

		list_del(&ck->list);
		kfree(ck->k);
		six_lock_exit(&ck->c.lock);
		kmem_cache_free(bch2_key_cache, ck);
	}

	if (atomic_long_read(&bc->nr_dirty) &&
	    !bch2_journal_error(&c->journal) &&
	    test_bit(BCH_FS_was_rw, &c->flags))
		panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n",
		      atomic_long_read(&bc->nr_dirty));

	if (atomic_long_read(&bc->nr_keys))
		panic("btree key cache shutdown error: nr_keys nonzero (%li)\n",
		      atomic_long_read(&bc->nr_keys));

	if (bc->table_init_done)
		rhashtable_destroy(&bc->table);

	free_percpu(bc->pcpu_freed);
}

void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
{
	mutex_init(&c->lock);
	INIT_LIST_HEAD(&c->freed_pcpu);
	INIT_LIST_HEAD(&c->freed_nonpcpu);
}

int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
	struct shrinker *shrink;

#ifdef __KERNEL__
	bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
	if (!bc->pcpu_freed)
		return -BCH_ERR_ENOMEM_fs_btree_cache_init;
#endif

	if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
		return -BCH_ERR_ENOMEM_fs_btree_cache_init;

	bc->table_init_done = true;

	shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name);
	if (!shrink)
		return -BCH_ERR_ENOMEM_fs_btree_cache_init;
	bc->shrink = shrink;
	shrink->seeks		= 0;
	shrink->count_objects	= bch2_btree_key_cache_count;
	shrink->scan_objects	= bch2_btree_key_cache_scan;
	shrink->private_data	= c;
	shrinker_register(shrink);
	return 0;
}

void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
{
	prt_printf(out, "nr_freed:\t%lu", atomic_long_read(&c->nr_freed));
	prt_newline(out);
	prt_printf(out, "nr_keys:\t%lu", atomic_long_read(&c->nr_keys));
	prt_newline(out);
	prt_printf(out, "nr_dirty:\t%lu", atomic_long_read(&c->nr_dirty));
	prt_newline(out);
}

void bch2_btree_key_cache_exit(void)
{
	kmem_cache_destroy(bch2_key_cache);
}

int __init bch2_btree_key_cache_init(void)
{
	bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT);
	if (!bch2_key_cache)
		return -ENOMEM;

	return 0;
}