// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "trace.h"

#include <linux/sched/mm.h>

static inline bool btree_uses_pcpu_readers(enum btree_id id)
{
	return id == BTREE_ID_subvolumes;
}

static struct kmem_cache *bch2_key_cache;

static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
				       const void *obj)
{
	const struct bkey_cached *ck = obj;
	const struct bkey_cached_key *key = arg->key;

	return ck->key.btree_id != key->btree_id ||
		!bpos_eq(ck->key.pos, key->pos);
}

static const struct rhashtable_params bch2_btree_key_cache_params = {
	.head_offset	= offsetof(struct bkey_cached, hash),
	.key_offset	= offsetof(struct bkey_cached, key),
	.key_len	= sizeof(struct bkey_cached_key),
	.obj_cmpfn	= bch2_btree_key_cache_cmp_fn,
};

__flatten
inline struct bkey_cached *
bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
{
	struct bkey_cached_key key = {
		.btree_id	= btree_id,
		.pos		= pos,
	};

	return rhashtable_lookup_fast(&c->btree_key_cache.table, &key,
				      bch2_btree_key_cache_params);
}

static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
{
	if (!six_trylock_intent(&ck->c.lock))
		return false;

	if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		six_unlock_intent(&ck->c.lock);
		return false;
	}

	if (!six_trylock_write(&ck->c.lock)) {
		six_unlock_intent(&ck->c.lock);
		return false;
	}

	return true;
}

static void bkey_cached_evict(struct btree_key_cache *c,
			      struct bkey_cached *ck)
{
	BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
				      bch2_btree_key_cache_params));
	memset(&ck->key, ~0, sizeof(ck->key));

	atomic_long_dec(&c->nr_keys);
}

static void bkey_cached_free(struct btree_key_cache *bc,
			     struct bkey_cached *ck)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);

	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));

	ck->btree_trans_barrier_seq =
		start_poll_synchronize_srcu(&c->btree_trans_barrier);

	if (ck->c.lock.readers)
		list_move_tail(&ck->list, &bc->freed_pcpu);
	else
		list_move_tail(&ck->list, &bc->freed_nonpcpu);
	atomic_long_inc(&bc->nr_freed);

	kfree(ck->k);
	ck->k		= NULL;
	ck->u64s	= 0;

	six_unlock_write(&ck->c.lock);
	six_unlock_intent(&ck->c.lock);
}

#ifdef __KERNEL__
static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
						   struct bkey_cached *ck)
{
	struct bkey_cached *pos;

	list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
		if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
				 pos->btree_trans_barrier_seq)) {
			list_move(&ck->list, &pos->list);
			return;
		}
	}

	list_move(&ck->list, &bc->freed_nonpcpu);
}
#endif

static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
					 struct bkey_cached *ck)
{
	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));

	if (!ck->c.lock.readers) {
#ifdef __KERNEL__
		struct btree_key_cache_freelist *f;
		bool freed = false;

		preempt_disable();
		f = this_cpu_ptr(bc->pcpu_freed);

		if (f->nr < ARRAY_SIZE(f->objs)) {
			f->objs[f->nr++] = ck;
			freed = true;
		}
		preempt_enable();

		if (!freed) {
			mutex_lock(&bc->lock);
			preempt_disable();
			f = this_cpu_ptr(bc->pcpu_freed);

			while (f->nr > ARRAY_SIZE(f->objs) / 2) {
				struct bkey_cached *ck2 = f->objs[--f->nr];

				__bkey_cached_move_to_freelist_ordered(bc, ck2);
			}
			preempt_enable();

			__bkey_cached_move_to_freelist_ordered(bc, ck);
			mutex_unlock(&bc->lock);
		}
#else
		mutex_lock(&bc->lock);
		list_move_tail(&ck->list, &bc->freed_nonpcpu);
		mutex_unlock(&bc->lock);
#endif
	} else {
		mutex_lock(&bc->lock);
		list_move_tail(&ck->list, &bc->freed_pcpu);
		mutex_unlock(&bc->lock);
	}
}

static void bkey_cached_free_fast(struct btree_key_cache *bc,
				  struct bkey_cached *ck)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);

	ck->btree_trans_barrier_seq =
		start_poll_synchronize_srcu(&c->btree_trans_barrier);

	list_del_init(&ck->list);
	atomic_long_inc(&bc->nr_freed);

	kfree(ck->k);
	ck->k		= NULL;
	ck->u64s	= 0;

	bkey_cached_move_to_freelist(bc, ck);

	six_unlock_write(&ck->c.lock);
	six_unlock_intent(&ck->c.lock);
}

/*
 * Allocate a bkey_cached, preferring the percpu freelist and then the global
 * freed lists before falling back to the slab; the entry is returned with
 * intent and write locks held. Returns NULL if allocation failed, in which
 * case the caller may instead try to reuse an existing clean entry.
 */
static struct bkey_cached *
bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
		  bool *was_new)
{
	struct bch_fs *c = trans->c;
	struct btree_key_cache *bc = &c->btree_key_cache;
	struct bkey_cached *ck = NULL;
	bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
	int ret;

	if (!pcpu_readers) {
#ifdef __KERNEL__
		struct btree_key_cache_freelist *f;

		preempt_disable();
		f = this_cpu_ptr(bc->pcpu_freed);
		if (f->nr)
			ck = f->objs[--f->nr];
		preempt_enable();

		if (!ck) {
			mutex_lock(&bc->lock);
			preempt_disable();
			f = this_cpu_ptr(bc->pcpu_freed);

			while (!list_empty(&bc->freed_nonpcpu) &&
			       f->nr < ARRAY_SIZE(f->objs) / 2) {
				ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
				list_del_init(&ck->list);
				f->objs[f->nr++] = ck;
			}

			ck = f->nr ? f->objs[--f->nr] : NULL;
			preempt_enable();
			mutex_unlock(&bc->lock);
		}
#else
		mutex_lock(&bc->lock);
		if (!list_empty(&bc->freed_nonpcpu)) {
			ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
			list_del_init(&ck->list);
		}
		mutex_unlock(&bc->lock);
#endif
	} else {
		mutex_lock(&bc->lock);
		if (!list_empty(&bc->freed_pcpu)) {
			ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
			list_del_init(&ck->list);
		}
		mutex_unlock(&bc->lock);
	}

	if (ck) {
		ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_);
		if (unlikely(ret)) {
			bkey_cached_move_to_freelist(bc, ck);
			return ERR_PTR(ret);
		}

		path->l[0].b = (void *) ck;
		path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);

		ret = bch2_btree_node_lock_write(trans, path, &ck->c);
		if (unlikely(ret)) {
			btree_node_unlock(trans, path, 0);
			bkey_cached_move_to_freelist(bc, ck);
			return ERR_PTR(ret);
		}

		return ck;
	}

	ck = allocate_dropping_locks(trans, ret,
				     kmem_cache_zalloc(bch2_key_cache, _gfp));
	if (ret) {
		kmem_cache_free(bch2_key_cache, ck);
		return ERR_PTR(ret);
	}

	if (!ck)
		return NULL;

	INIT_LIST_HEAD(&ck->list);
	bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);

	ck->c.cached = true;
	BUG_ON(!six_trylock_intent(&ck->c.lock));
	BUG_ON(!six_trylock_write(&ck->c.lock));
	*was_new = true;
	return ck;
}

static struct bkey_cached *
bkey_cached_reuse(struct btree_key_cache *c)
{
	struct bucket_table *tbl;
	struct rhash_head *pos;
	struct bkey_cached *ck;
	unsigned i;

	mutex_lock(&c->lock);
	rcu_read_lock();
	tbl = rht_dereference_rcu(c->table.tbl, &c->table);
	for (i = 0; i < tbl->size; i++)
		rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
			if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
			    bkey_cached_lock_for_evict(ck)) {
				bkey_cached_evict(c, ck);
				goto out;
			}
		}
	ck = NULL;
out:
	rcu_read_unlock();
	mutex_unlock(&c->lock);
	return ck;
}

static struct bkey_cached *
btree_key_cache_create(struct btree_trans *trans, struct btree_path *path)
{
	struct bch_fs *c = trans->c;
	struct btree_key_cache *bc = &c->btree_key_cache;
	struct bkey_cached *ck;
	bool was_new = false;

	ck = bkey_cached_alloc(trans, path, &was_new);
	if (IS_ERR(ck))
		return ck;

	if (unlikely(!ck)) {
		ck = bkey_cached_reuse(bc);
		if (unlikely(!ck)) {
			bch_err(c, "error allocating memory for key cache item, btree %s",
				bch2_btree_ids[path->btree_id]);
			return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create);
		}

		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
	}

	ck->c.level		= 0;
	ck->c.btree_id		= path->btree_id;
	ck->key.btree_id	= path->btree_id;
	ck->key.pos		= path->pos;
	ck->valid		= false;
	ck->flags		= 1U << BKEY_CACHED_ACCESSED;

	if (unlikely(rhashtable_lookup_insert_fast(&bc->table,
						   &ck->hash,
						   bch2_btree_key_cache_params))) {
		/* We raced with another fill: */

		if (likely(was_new)) {
			six_unlock_write(&ck->c.lock);
			six_unlock_intent(&ck->c.lock);
			kfree(ck);
		} else {
			bkey_cached_free_fast(bc, ck);
		}

		mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
		return NULL;
	}

	atomic_long_inc(&bc->nr_keys);

	six_unlock_write(&ck->c.lock);

	return ck;
}

static int btree_key_cache_fill(struct btree_trans *trans,
				struct btree_path *ck_path,
				struct bkey_cached *ck)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	unsigned new_u64s = 0;
	struct bkey_i *new_k = NULL;
	int ret;

	k = bch2_bkey_get_iter(trans, &iter, ck->key.btree_id, ck->key.pos,
			       BTREE_ITER_KEY_CACHE_FILL|
			       BTREE_ITER_CACHED_NOFILL);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (!bch2_btree_node_relock(trans, ck_path, 0)) {
		trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
		ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
		goto err;
	}

	/*
	 * bch2_varint_decode can read past the end of the buffer by at
	 * most 7 bytes (it won't be used):
	 */
	new_u64s = k.k->u64s + 1;

	/*
	 * Allocate some extra space so that the transaction commit path is less
	 * likely to have to reallocate, since that requires a transaction
	 * restart:
	 */
	new_u64s = min(256U, (new_u64s * 3) / 2);

	if (new_u64s > ck->u64s) {
		new_u64s = roundup_pow_of_two(new_u64s);
		new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
		if (!new_k) {
			bch2_trans_unlock(trans);

			new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
			if (!new_k) {
				bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
					bch2_btree_ids[ck->key.btree_id], new_u64s);
				ret = -BCH_ERR_ENOMEM_btree_key_cache_fill;
				goto err;
			}

			if (!bch2_btree_node_relock(trans, ck_path, 0)) {
				kfree(new_k);
				trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
				ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
				goto err;
			}

			ret = bch2_trans_relock(trans);
			if (ret) {
				kfree(new_k);
				goto err;
			}
		}
	}

	ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c);
	if (ret) {
		kfree(new_k);
		goto err;
	}

	if (new_k) {
		kfree(ck->k);
		ck->u64s = new_u64s;
		ck->k = new_k;
	}

	bkey_reassemble(ck->k, k);
	ck->valid = true;
	bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b);

	/* We're not likely to need this iterator again: */
	set_btree_iter_dontneed(&iter);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static noinline int
bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path,
					 unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck;
	int ret = 0;

	BUG_ON(path->level);

	path->l[1].b = NULL;

	if (bch2_btree_node_relock_notrace(trans, path, 0)) {
		ck = (void *) path->l[0].b;
		goto fill;
	}
retry:
	ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
	if (!ck) {
		ck = btree_key_cache_create(trans, path);
		ret = PTR_ERR_OR_ZERO(ck);
		if (ret)
			goto err;
		if (!ck)
			goto retry;

		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
		path->locks_want = 1;
	} else {
		enum six_lock_type lock_want = __btree_lock_want(path, 0);

		ret = btree_node_lock(trans, path, (void *) ck, 0,
				      lock_want, _THIS_IP_);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto err;

		BUG_ON(ret);

		if (ck->key.btree_id != path->btree_id ||
		    !bpos_eq(ck->key.pos, path->pos)) {
			six_unlock_type(&ck->c.lock, lock_want);
			goto retry;
		}

		mark_btree_node_locked(trans, path, 0,
				       (enum btree_node_locked_type) lock_want);
	}

	path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
	path->l[0].b = (void *) ck;
fill:
	path->uptodate = BTREE_ITER_UPTODATE;

	if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) {
		/*
		 * Using the underscore version because we haven't set
		 * path->uptodate yet:
		 */
		if (!path->locks_want &&
		    !__bch2_btree_path_upgrade(trans, path, 1)) {
			trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_);
			ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade);
			goto err;
		}

		ret = btree_key_cache_fill(trans, path, ck);
		if (ret)
			goto err;

		ret = bch2_btree_path_relock(trans, path, _THIS_IP_);
		if (ret)
			goto err;

		path->uptodate = BTREE_ITER_UPTODATE;
	}

	if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
		set_bit(BKEY_CACHED_ACCESSED, &ck->flags);

	BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
	BUG_ON(path->uptodate);

	return ret;
err:
	path->uptodate = BTREE_ITER_NEED_TRAVERSE;
	if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
		btree_node_unlock(trans, path, 0);
		path->l[0].b = ERR_PTR(ret);
	}
	return ret;
}

/*
 * Fast path for traversing a path to a cached key: if the entry is already
 * present and valid we just take the lock; otherwise fall back to the
 * slowpath, which handles creating the entry and filling it from the
 * underlying btree.
 */
int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
				    unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck;
	int ret = 0;

	EBUG_ON(path->level);

	path->l[1].b = NULL;

	if (bch2_btree_node_relock_notrace(trans, path, 0)) {
		ck = (void *) path->l[0].b;
		goto fill;
	}
retry:
	ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
	if (!ck) {
		return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
	} else {
		enum six_lock_type lock_want = __btree_lock_want(path, 0);

		ret = btree_node_lock(trans, path, (void *) ck, 0,
				      lock_want, _THIS_IP_);
		EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart));

		if (ret)
			return ret;

		if (ck->key.btree_id != path->btree_id ||
		    !bpos_eq(ck->key.pos, path->pos)) {
			six_unlock_type(&ck->c.lock, lock_want);
			goto retry;
		}

		mark_btree_node_locked(trans, path, 0,
				       (enum btree_node_locked_type) lock_want);
	}

	path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
	path->l[0].b = (void *) ck;
fill:
	if (!ck->valid)
		return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);

	if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
		set_bit(BKEY_CACHED_ACCESSED, &ck->flags);

	path->uptodate = BTREE_ITER_UPTODATE;
	EBUG_ON(!ck->valid);
	EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));

	return ret;
}

/*
 * Flush a dirty key cache entry back to the underlying btree via an update
 * that bypasses the key cache, then drop its journal pin; with @evict set,
 * the entry is also removed from the hash table and freed.
 */
static int btree_key_cache_flush_pos(struct btree_trans *trans,
				     struct bkey_cached_key key,
				     u64 journal_seq,
				     unsigned commit_flags,
				     bool evict)
{
	struct bch_fs *c = trans->c;
	struct journal *j = &c->journal;
	struct btree_iter c_iter, b_iter;
	struct bkey_cached *ck = NULL;
	int ret;

	bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
			     BTREE_ITER_SLOTS|
			     BTREE_ITER_INTENT|
			     BTREE_ITER_ALL_SNAPSHOTS);
	bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
			     BTREE_ITER_CACHED|
			     BTREE_ITER_INTENT);
	b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE;

	ret = bch2_btree_iter_traverse(&c_iter);
	if (ret)
		goto out;

	ck = (void *) c_iter.path->l[0].b;
	if (!ck)
		goto out;

	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		if (evict)
			goto evict;
		goto out;
	}

	BUG_ON(!ck->valid);

	if (journal_seq && ck->journal.seq != journal_seq)
		goto out;

	/*
	 * Since journal reclaim depends on us making progress here, and the
	 * allocator/copygc depend on journal reclaim making progress, we need
	 * to be using alloc reserves:
	 */
	ret = bch2_btree_iter_traverse(&b_iter) ?:
		bch2_trans_update(trans, &b_iter, ck->k,
				  BTREE_UPDATE_KEY_CACHE_RECLAIM|
				  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
				  BTREE_TRIGGER_NORUN) ?:
		bch2_trans_commit(trans, NULL, NULL,
				  BTREE_INSERT_NOCHECK_RW|
				  BTREE_INSERT_NOFAIL|
				  (ck->journal.seq == journal_last_seq(j)
				   ? BCH_WATERMARK_reclaim
				   : 0)|
				  commit_flags);

	bch2_fs_fatal_err_on(ret &&
			     !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
			     !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
			     !bch2_journal_error(j), c,
			     "error flushing key cache: %s", bch2_err_str(ret));
	if (ret)
		goto out;

	bch2_journal_pin_drop(j, &ck->journal);
	bch2_journal_preres_put(j, &ck->res);

	BUG_ON(!btree_node_locked(c_iter.path, 0));

	if (!evict) {
		if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
			clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
			atomic_long_dec(&c->btree_key_cache.nr_dirty);
		}
	} else {
		struct btree_path *path2;
evict:
		trans_for_each_path(trans, path2)
			if (path2 != c_iter.path)
				__bch2_btree_path_unlock(trans, path2);

		bch2_btree_node_lock_write_nofail(trans, c_iter.path, &ck->c);

		if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
			clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
			atomic_long_dec(&c->btree_key_cache.nr_dirty);
		}

		mark_btree_node_locked_noreset(c_iter.path, 0, BTREE_NODE_UNLOCKED);
		bkey_cached_evict(&c->btree_key_cache, ck);
		bkey_cached_free_fast(&c->btree_key_cache, ck);
	}
out:
	bch2_trans_iter_exit(trans, &b_iter);
	bch2_trans_iter_exit(trans, &c_iter);
	return ret;
}

int bch2_btree_key_cache_journal_flush(struct journal *j,
				       struct journal_entry_pin *pin, u64 seq)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bkey_cached *ck =
		container_of(pin, struct bkey_cached, journal);
	struct bkey_cached_key key;
	struct btree_trans *trans = bch2_trans_get(c);
	int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
	int ret = 0;

	btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read);
	key = ck->key;

	if (ck->journal.seq != seq ||
	    !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		six_unlock_read(&ck->c.lock);
		goto unlock;
	}

	if (ck->seq != seq) {
		bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal,
					bch2_btree_key_cache_journal_flush);
		six_unlock_read(&ck->c.lock);
		goto unlock;
	}
	six_unlock_read(&ck->c.lock);

	ret = commit_do(trans, NULL, NULL, 0,
			btree_key_cache_flush_pos(trans, key, seq,
						  BTREE_INSERT_JOURNAL_RECLAIM, false));
unlock:
	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);

	bch2_trans_put(trans);
	return ret;
}

/*
 * Flush and evict a key from the key cache:
 */
int bch2_btree_key_cache_flush(struct btree_trans *trans,
			       enum btree_id id, struct bpos pos)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached_key key = { id, pos };

	/* Fastpath - assume it won't be found: */
	if (!bch2_btree_key_cache_find(c, id, pos))
		return 0;

	return btree_key_cache_flush_pos(trans, key, 0, 0, true);
}

bool bch2_btree_insert_key_cached(struct btree_trans *trans,
				  unsigned flags,
				  struct btree_insert_entry *insert_entry)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck = (void *) insert_entry->path->l[0].b;
	struct bkey_i *insert = insert_entry->k;
	bool kick_reclaim = false;

	BUG_ON(insert->k.u64s > ck->u64s);

	if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
		int difference;

		BUG_ON(jset_u64s(insert->k.u64s) > trans->journal_preres.u64s);

		difference = jset_u64s(insert->k.u64s) - ck->res.u64s;
		if (difference > 0) {
			trans->journal_preres.u64s -= difference;
			ck->res.u64s += difference;
		}
	}

	bkey_copy(ck->k, insert);
	ck->valid = true;

	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
		set_bit(BKEY_CACHED_DIRTY, &ck->flags);
		atomic_long_inc(&c->btree_key_cache.nr_dirty);

		if (bch2_nr_btree_keys_need_flush(c))
			kick_reclaim = true;
	}

	/*
	 * To minimize lock contention, we only add the journal pin here and
	 * defer pin updates to the flush callback via ->seq. Be careful not to
	 * update ->seq on nojournal commits because we don't want to update the
	 * pin to a seq that doesn't include journal updates on disk. Otherwise
	 * we risk losing the update after a crash.
	 *
	 * The only exception is if the pin is not active in the first place. We
	 * have to add the pin because journal reclaim drives key cache
	 * flushing. The flush callback will not proceed unless ->seq matches
	 * the latest pin, so make sure it starts with a consistent value.
	 */
	if (!(insert_entry->flags & BTREE_UPDATE_NOJOURNAL) ||
	    !journal_pin_active(&ck->journal)) {
		ck->seq = trans->journal_res.seq;
	}
	bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
			     &ck->journal, bch2_btree_key_cache_journal_flush);

	if (kick_reclaim)
		journal_reclaim_kick(&c->journal);
	return true;
}

void bch2_btree_key_cache_drop(struct btree_trans *trans,
			       struct btree_path *path)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck = (void *) path->l[0].b;

	BUG_ON(!ck->valid);

	/*
	 * We just did an update to the btree, bypassing the key cache: the key
	 * cache key is now stale and must be dropped, even if dirty:
	 */
	if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
		atomic_long_dec(&c->btree_key_cache.nr_dirty);
		bch2_journal_pin_drop(&c->journal, &ck->journal);
	}

	ck->valid = false;
}

/*
 * Shrinker scan callback: first free entries on the freed lists whose SRCU
 * grace period has elapsed, then walk the hash table clock-style, giving
 * recently accessed entries a second chance and evicting clean entries that
 * haven't been touched since the last scan.
 */
static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
					       struct shrink_control *sc)
{
	struct bch_fs *c = container_of(shrink, struct bch_fs,
					btree_key_cache.shrink);
	struct btree_key_cache *bc = &c->btree_key_cache;
	struct bucket_table *tbl;
	struct bkey_cached *ck, *t;
	size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
	unsigned start, flags;
	int srcu_idx;

	mutex_lock(&bc->lock);
	srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
	flags = memalloc_nofs_save();

	/*
	 * Newest freed entries are at the end of the list - once we hit one
	 * that's too new to be freed, we can bail out:
	 */
	list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
		if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
						 ck->btree_trans_barrier_seq))
			break;

		list_del(&ck->list);
		six_lock_exit(&ck->c.lock);
		kmem_cache_free(bch2_key_cache, ck);
		atomic_long_dec(&bc->nr_freed);
		scanned++;
		freed++;
	}

	if (scanned >= nr)
		goto out;

	list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
		if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
						 ck->btree_trans_barrier_seq))
			break;

		list_del(&ck->list);
		six_lock_exit(&ck->c.lock);
		kmem_cache_free(bch2_key_cache, ck);
		atomic_long_dec(&bc->nr_freed);
		scanned++;
		freed++;
	}

	if (scanned >= nr)
		goto out;

	rcu_read_lock();
	tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
	if (bc->shrink_iter >= tbl->size)
		bc->shrink_iter = 0;
	start = bc->shrink_iter;

	do {
		struct rhash_head *pos, *next;

		pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter));

		while (!rht_is_a_nulls(pos)) {
			next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
			ck = container_of(pos, struct bkey_cached, hash);

			if (test_bit(BKEY_CACHED_DIRTY, &ck->flags))
				goto next;

			if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
				clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
			else if (bkey_cached_lock_for_evict(ck)) {
				bkey_cached_evict(bc, ck);
				bkey_cached_free(bc, ck);
			}

			scanned++;
			if (scanned >= nr)
				break;
next:
			pos = next;
		}

		bc->shrink_iter++;
		if (bc->shrink_iter >= tbl->size)
			bc->shrink_iter = 0;
	} while (scanned < nr && bc->shrink_iter != start);

	rcu_read_unlock();
out:
	memalloc_nofs_restore(flags);
	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
	mutex_unlock(&bc->lock);

	return freed;
}

static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
						struct shrink_control *sc)
{
	struct bch_fs *c = container_of(shrink, struct bch_fs,
					btree_key_cache.shrink);
	struct btree_key_cache *bc = &c->btree_key_cache;
	long nr = atomic_long_read(&bc->nr_keys) -
		  atomic_long_read(&bc->nr_dirty);

	return max(0L, nr);
}

void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
	struct bucket_table *tbl;
	struct bkey_cached *ck, *n;
	struct rhash_head *pos;
	LIST_HEAD(items);
	unsigned i;
#ifdef __KERNEL__
	int cpu;
#endif

	unregister_shrinker(&bc->shrink);

	mutex_lock(&bc->lock);

	/*
	 * The loop is needed to guard against racing with rehash:
	 */
	while (atomic_long_read(&bc->nr_keys)) {
		rcu_read_lock();
		tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
		if (tbl)
			for (i = 0; i < tbl->size; i++)
				rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
					bkey_cached_evict(bc, ck);
					list_add(&ck->list, &items);
				}
		rcu_read_unlock();
	}

#ifdef __KERNEL__
	for_each_possible_cpu(cpu) {
		struct btree_key_cache_freelist *f =
			per_cpu_ptr(bc->pcpu_freed, cpu);

		for (i = 0; i < f->nr; i++) {
			ck = f->objs[i];
			list_add(&ck->list, &items);
		}
	}
#endif

	list_splice(&bc->freed_pcpu,	&items);
	list_splice(&bc->freed_nonpcpu,	&items);

	mutex_unlock(&bc->lock);

	list_for_each_entry_safe(ck, n, &items, list) {
		cond_resched();

		bch2_journal_pin_drop(&c->journal, &ck->journal);
		bch2_journal_preres_put(&c->journal, &ck->res);

		list_del(&ck->list);
		kfree(ck->k);
		six_lock_exit(&ck->c.lock);
		kmem_cache_free(bch2_key_cache, ck);
	}

	if (atomic_long_read(&bc->nr_dirty) &&
	    !bch2_journal_error(&c->journal) &&
	    test_bit(BCH_FS_WAS_RW, &c->flags))
		panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n",
		      atomic_long_read(&bc->nr_dirty));

	if (atomic_long_read(&bc->nr_keys))
		panic("btree key cache shutdown error: nr_keys nonzero (%li)\n",
		      atomic_long_read(&bc->nr_keys));

	if (bc->table_init_done)
		rhashtable_destroy(&bc->table);

	free_percpu(bc->pcpu_freed);
}

void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
{
	mutex_init(&c->lock);
	INIT_LIST_HEAD(&c->freed_pcpu);
	INIT_LIST_HEAD(&c->freed_nonpcpu);
}

int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);

#ifdef __KERNEL__
	bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
	if (!bc->pcpu_freed)
		return -BCH_ERR_ENOMEM_fs_btree_cache_init;
#endif

	if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
		return -BCH_ERR_ENOMEM_fs_btree_cache_init;

	bc->table_init_done = true;

	bc->shrink.seeks		= 0;
	bc->shrink.count_objects	= bch2_btree_key_cache_count;
	bc->shrink.scan_objects		= bch2_btree_key_cache_scan;
	if (register_shrinker(&bc->shrink, "%s/btree_key_cache", c->name))
		return -BCH_ERR_ENOMEM_fs_btree_cache_init;
	return 0;
}

void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
{
	prt_printf(out, "nr_freed:\t%lu", atomic_long_read(&c->nr_freed));
	prt_newline(out);
	prt_printf(out, "nr_keys:\t%lu", atomic_long_read(&c->nr_keys));
	prt_newline(out);
	prt_printf(out, "nr_dirty:\t%lu", atomic_long_read(&c->nr_dirty));
	prt_newline(out);
}

void bch2_btree_key_cache_exit(void)
{
	kmem_cache_destroy(bch2_key_cache);
}

int __init bch2_btree_key_cache_init(void)
{
	bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT);
	if (!bch2_key_cache)
		return -ENOMEM;

	return 0;
}