1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "bcachefs.h" 4 #include "btree_cache.h" 5 #include "btree_iter.h" 6 #include "btree_key_cache.h" 7 #include "btree_locking.h" 8 #include "btree_update.h" 9 #include "errcode.h" 10 #include "error.h" 11 #include "journal.h" 12 #include "journal_reclaim.h" 13 #include "trace.h" 14 15 #include <linux/sched/mm.h> 16 17 static inline bool btree_uses_pcpu_readers(enum btree_id id) 18 { 19 return id == BTREE_ID_subvolumes; 20 } 21 22 static struct kmem_cache *bch2_key_cache; 23 24 static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, 25 const void *obj) 26 { 27 const struct bkey_cached *ck = obj; 28 const struct bkey_cached_key *key = arg->key; 29 30 return ck->key.btree_id != key->btree_id || 31 !bpos_eq(ck->key.pos, key->pos); 32 } 33 34 static const struct rhashtable_params bch2_btree_key_cache_params = { 35 .head_offset = offsetof(struct bkey_cached, hash), 36 .key_offset = offsetof(struct bkey_cached, key), 37 .key_len = sizeof(struct bkey_cached_key), 38 .obj_cmpfn = bch2_btree_key_cache_cmp_fn, 39 }; 40 41 __flatten 42 inline struct bkey_cached * 43 bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) 44 { 45 struct bkey_cached_key key = { 46 .btree_id = btree_id, 47 .pos = pos, 48 }; 49 50 return rhashtable_lookup_fast(&c->btree_key_cache.table, &key, 51 bch2_btree_key_cache_params); 52 } 53 54 static bool bkey_cached_lock_for_evict(struct bkey_cached *ck) 55 { 56 if (!six_trylock_intent(&ck->c.lock)) 57 return false; 58 59 if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 60 six_unlock_intent(&ck->c.lock); 61 return false; 62 } 63 64 if (!six_trylock_write(&ck->c.lock)) { 65 six_unlock_intent(&ck->c.lock); 66 return false; 67 } 68 69 return true; 70 } 71 72 static void bkey_cached_evict(struct btree_key_cache *c, 73 struct bkey_cached *ck) 74 { 75 BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash, 76 bch2_btree_key_cache_params)); 77 memset(&ck->key, ~0, sizeof(ck->key)); 78 79 atomic_long_dec(&c->nr_keys); 80 } 81 82 static void bkey_cached_free(struct btree_key_cache *bc, 83 struct bkey_cached *ck) 84 { 85 struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); 86 87 BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); 88 89 ck->btree_trans_barrier_seq = 90 start_poll_synchronize_srcu(&c->btree_trans_barrier); 91 92 if (ck->c.lock.readers) { 93 list_move_tail(&ck->list, &bc->freed_pcpu); 94 bc->nr_freed_pcpu++; 95 } else { 96 list_move_tail(&ck->list, &bc->freed_nonpcpu); 97 bc->nr_freed_nonpcpu++; 98 } 99 atomic_long_inc(&bc->nr_freed); 100 101 kfree(ck->k); 102 ck->k = NULL; 103 ck->u64s = 0; 104 105 six_unlock_write(&ck->c.lock); 106 six_unlock_intent(&ck->c.lock); 107 } 108 109 #ifdef __KERNEL__ 110 static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc, 111 struct bkey_cached *ck) 112 { 113 struct bkey_cached *pos; 114 115 bc->nr_freed_nonpcpu++; 116 117 list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) { 118 if (ULONG_CMP_GE(ck->btree_trans_barrier_seq, 119 pos->btree_trans_barrier_seq)) { 120 list_move(&ck->list, &pos->list); 121 return; 122 } 123 } 124 125 list_move(&ck->list, &bc->freed_nonpcpu); 126 } 127 #endif 128 129 static void bkey_cached_move_to_freelist(struct btree_key_cache *bc, 130 struct bkey_cached *ck) 131 { 132 BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); 133 134 if (!ck->c.lock.readers) { 135 #ifdef __KERNEL__ 136 struct btree_key_cache_freelist *f; 137 bool freed = false; 138 139 preempt_disable(); 140 f = this_cpu_ptr(bc->pcpu_freed); 141 142 if (f->nr < ARRAY_SIZE(f->objs)) { 143 f->objs[f->nr++] = ck; 144 freed = true; 145 } 146 preempt_enable(); 147 148 if (!freed) { 149 mutex_lock(&bc->lock); 150 preempt_disable(); 151 f = this_cpu_ptr(bc->pcpu_freed); 152 153 while (f->nr > ARRAY_SIZE(f->objs) / 2) { 154 struct bkey_cached *ck2 = f->objs[--f->nr]; 155 156 __bkey_cached_move_to_freelist_ordered(bc, ck2); 157 } 158 preempt_enable(); 159 160 __bkey_cached_move_to_freelist_ordered(bc, ck); 161 mutex_unlock(&bc->lock); 162 } 163 #else 164 mutex_lock(&bc->lock); 165 list_move_tail(&ck->list, &bc->freed_nonpcpu); 166 bc->nr_freed_nonpcpu++; 167 mutex_unlock(&bc->lock); 168 #endif 169 } else { 170 mutex_lock(&bc->lock); 171 list_move_tail(&ck->list, &bc->freed_pcpu); 172 bc->nr_freed_pcpu++; 173 mutex_unlock(&bc->lock); 174 } 175 } 176 177 static void bkey_cached_free_fast(struct btree_key_cache *bc, 178 struct bkey_cached *ck) 179 { 180 struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); 181 182 ck->btree_trans_barrier_seq = 183 start_poll_synchronize_srcu(&c->btree_trans_barrier); 184 185 list_del_init(&ck->list); 186 atomic_long_inc(&bc->nr_freed); 187 188 kfree(ck->k); 189 ck->k = NULL; 190 ck->u64s = 0; 191 192 bkey_cached_move_to_freelist(bc, ck); 193 194 six_unlock_write(&ck->c.lock); 195 six_unlock_intent(&ck->c.lock); 196 } 197 198 static struct bkey_cached * 199 bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, 200 bool *was_new) 201 { 202 struct bch_fs *c = trans->c; 203 struct btree_key_cache *bc = &c->btree_key_cache; 204 struct bkey_cached *ck = NULL; 205 bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id); 206 int ret; 207 208 if (!pcpu_readers) { 209 #ifdef __KERNEL__ 210 struct btree_key_cache_freelist *f; 211 212 preempt_disable(); 213 f = this_cpu_ptr(bc->pcpu_freed); 214 if (f->nr) 215 ck = f->objs[--f->nr]; 216 preempt_enable(); 217 218 if (!ck) { 219 mutex_lock(&bc->lock); 220 preempt_disable(); 221 f = this_cpu_ptr(bc->pcpu_freed); 222 223 while (!list_empty(&bc->freed_nonpcpu) && 224 f->nr < ARRAY_SIZE(f->objs) / 2) { 225 ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); 226 list_del_init(&ck->list); 227 bc->nr_freed_nonpcpu--; 228 f->objs[f->nr++] = ck; 229 } 230 231 ck = f->nr ? f->objs[--f->nr] : NULL; 232 preempt_enable(); 233 mutex_unlock(&bc->lock); 234 } 235 #else 236 mutex_lock(&bc->lock); 237 if (!list_empty(&bc->freed_nonpcpu)) { 238 ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); 239 list_del_init(&ck->list); 240 bc->nr_freed_nonpcpu--; 241 } 242 mutex_unlock(&bc->lock); 243 #endif 244 } else { 245 mutex_lock(&bc->lock); 246 if (!list_empty(&bc->freed_pcpu)) { 247 ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list); 248 list_del_init(&ck->list); 249 bc->nr_freed_pcpu--; 250 } 251 mutex_unlock(&bc->lock); 252 } 253 254 if (ck) { 255 ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_); 256 if (unlikely(ret)) { 257 bkey_cached_move_to_freelist(bc, ck); 258 return ERR_PTR(ret); 259 } 260 261 path->l[0].b = (void *) ck; 262 path->l[0].lock_seq = six_lock_seq(&ck->c.lock); 263 mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED); 264 265 ret = bch2_btree_node_lock_write(trans, path, &ck->c); 266 if (unlikely(ret)) { 267 btree_node_unlock(trans, path, 0); 268 bkey_cached_move_to_freelist(bc, ck); 269 return ERR_PTR(ret); 270 } 271 272 return ck; 273 } 274 275 ck = allocate_dropping_locks(trans, ret, 276 kmem_cache_zalloc(bch2_key_cache, _gfp)); 277 if (ret) { 278 kmem_cache_free(bch2_key_cache, ck); 279 return ERR_PTR(ret); 280 } 281 282 if (!ck) 283 return NULL; 284 285 INIT_LIST_HEAD(&ck->list); 286 bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0); 287 288 ck->c.cached = true; 289 BUG_ON(!six_trylock_intent(&ck->c.lock)); 290 BUG_ON(!six_trylock_write(&ck->c.lock)); 291 *was_new = true; 292 return ck; 293 } 294 295 static struct bkey_cached * 296 bkey_cached_reuse(struct btree_key_cache *c) 297 { 298 struct bucket_table *tbl; 299 struct rhash_head *pos; 300 struct bkey_cached *ck; 301 unsigned i; 302 303 mutex_lock(&c->lock); 304 rcu_read_lock(); 305 tbl = rht_dereference_rcu(c->table.tbl, &c->table); 306 for (i = 0; i < tbl->size; i++) 307 rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { 308 if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && 309 bkey_cached_lock_for_evict(ck)) { 310 bkey_cached_evict(c, ck); 311 goto out; 312 } 313 } 314 ck = NULL; 315 out: 316 rcu_read_unlock(); 317 mutex_unlock(&c->lock); 318 return ck; 319 } 320 321 static struct bkey_cached * 322 btree_key_cache_create(struct btree_trans *trans, struct btree_path *path) 323 { 324 struct bch_fs *c = trans->c; 325 struct btree_key_cache *bc = &c->btree_key_cache; 326 struct bkey_cached *ck; 327 bool was_new = false; 328 329 ck = bkey_cached_alloc(trans, path, &was_new); 330 if (IS_ERR(ck)) 331 return ck; 332 333 if (unlikely(!ck)) { 334 ck = bkey_cached_reuse(bc); 335 if (unlikely(!ck)) { 336 bch_err(c, "error allocating memory for key cache item, btree %s", 337 bch2_btree_id_str(path->btree_id)); 338 return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create); 339 } 340 341 mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED); 342 } 343 344 ck->c.level = 0; 345 ck->c.btree_id = path->btree_id; 346 ck->key.btree_id = path->btree_id; 347 ck->key.pos = path->pos; 348 ck->valid = false; 349 ck->flags = 1U << BKEY_CACHED_ACCESSED; 350 351 if (unlikely(rhashtable_lookup_insert_fast(&bc->table, 352 &ck->hash, 353 bch2_btree_key_cache_params))) { 354 /* We raced with another fill: */ 355 356 if (likely(was_new)) { 357 six_unlock_write(&ck->c.lock); 358 six_unlock_intent(&ck->c.lock); 359 kfree(ck); 360 } else { 361 bkey_cached_free_fast(bc, ck); 362 } 363 364 mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); 365 return NULL; 366 } 367 368 atomic_long_inc(&bc->nr_keys); 369 370 six_unlock_write(&ck->c.lock); 371 372 return ck; 373 } 374 375 static int btree_key_cache_fill(struct btree_trans *trans, 376 struct btree_path *ck_path, 377 struct bkey_cached *ck) 378 { 379 struct btree_iter iter; 380 struct bkey_s_c k; 381 unsigned new_u64s = 0; 382 struct bkey_i *new_k = NULL; 383 int ret; 384 385 bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos, 386 BTREE_ITER_KEY_CACHE_FILL| 387 BTREE_ITER_CACHED_NOFILL); 388 iter.flags &= ~BTREE_ITER_WITH_JOURNAL; 389 k = bch2_btree_iter_peek_slot(&iter); 390 ret = bkey_err(k); 391 if (ret) 392 goto err; 393 394 if (!bch2_btree_node_relock(trans, ck_path, 0)) { 395 trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); 396 ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill); 397 goto err; 398 } 399 400 /* 401 * bch2_varint_decode can read past the end of the buffer by at 402 * most 7 bytes (it won't be used): 403 */ 404 new_u64s = k.k->u64s + 1; 405 406 /* 407 * Allocate some extra space so that the transaction commit path is less 408 * likely to have to reallocate, since that requires a transaction 409 * restart: 410 */ 411 new_u64s = min(256U, (new_u64s * 3) / 2); 412 413 if (new_u64s > ck->u64s) { 414 new_u64s = roundup_pow_of_two(new_u64s); 415 new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN); 416 if (!new_k) { 417 bch2_trans_unlock(trans); 418 419 new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); 420 if (!new_k) { 421 bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", 422 bch2_btree_id_str(ck->key.btree_id), new_u64s); 423 ret = -BCH_ERR_ENOMEM_btree_key_cache_fill; 424 goto err; 425 } 426 427 if (!bch2_btree_node_relock(trans, ck_path, 0)) { 428 kfree(new_k); 429 trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); 430 ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill); 431 goto err; 432 } 433 434 ret = bch2_trans_relock(trans); 435 if (ret) { 436 kfree(new_k); 437 goto err; 438 } 439 } 440 } 441 442 ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c); 443 if (ret) { 444 kfree(new_k); 445 goto err; 446 } 447 448 if (new_k) { 449 kfree(ck->k); 450 ck->u64s = new_u64s; 451 ck->k = new_k; 452 } 453 454 bkey_reassemble(ck->k, k); 455 ck->valid = true; 456 bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b); 457 458 /* We're not likely to need this iterator again: */ 459 set_btree_iter_dontneed(&iter); 460 err: 461 bch2_trans_iter_exit(trans, &iter); 462 return ret; 463 } 464 465 static noinline int 466 bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path, 467 unsigned flags) 468 { 469 struct bch_fs *c = trans->c; 470 struct bkey_cached *ck; 471 int ret = 0; 472 473 BUG_ON(path->level); 474 475 path->l[1].b = NULL; 476 477 if (bch2_btree_node_relock_notrace(trans, path, 0)) { 478 ck = (void *) path->l[0].b; 479 goto fill; 480 } 481 retry: 482 ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); 483 if (!ck) { 484 ck = btree_key_cache_create(trans, path); 485 ret = PTR_ERR_OR_ZERO(ck); 486 if (ret) 487 goto err; 488 if (!ck) 489 goto retry; 490 491 mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED); 492 path->locks_want = 1; 493 } else { 494 enum six_lock_type lock_want = __btree_lock_want(path, 0); 495 496 ret = btree_node_lock(trans, path, (void *) ck, 0, 497 lock_want, _THIS_IP_); 498 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 499 goto err; 500 501 BUG_ON(ret); 502 503 if (ck->key.btree_id != path->btree_id || 504 !bpos_eq(ck->key.pos, path->pos)) { 505 six_unlock_type(&ck->c.lock, lock_want); 506 goto retry; 507 } 508 509 mark_btree_node_locked(trans, path, 0, 510 (enum btree_node_locked_type) lock_want); 511 } 512 513 path->l[0].lock_seq = six_lock_seq(&ck->c.lock); 514 path->l[0].b = (void *) ck; 515 fill: 516 path->uptodate = BTREE_ITER_UPTODATE; 517 518 if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) { 519 /* 520 * Using the underscore version because we haven't set 521 * path->uptodate yet: 522 */ 523 if (!path->locks_want && 524 !__bch2_btree_path_upgrade(trans, path, 1, NULL)) { 525 trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_); 526 ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade); 527 goto err; 528 } 529 530 ret = btree_key_cache_fill(trans, path, ck); 531 if (ret) 532 goto err; 533 534 ret = bch2_btree_path_relock(trans, path, _THIS_IP_); 535 if (ret) 536 goto err; 537 538 path->uptodate = BTREE_ITER_UPTODATE; 539 } 540 541 if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) 542 set_bit(BKEY_CACHED_ACCESSED, &ck->flags); 543 544 BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); 545 BUG_ON(path->uptodate); 546 547 return ret; 548 err: 549 path->uptodate = BTREE_ITER_NEED_TRAVERSE; 550 if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) { 551 btree_node_unlock(trans, path, 0); 552 path->l[0].b = ERR_PTR(ret); 553 } 554 return ret; 555 } 556 557 int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path, 558 unsigned flags) 559 { 560 struct bch_fs *c = trans->c; 561 struct bkey_cached *ck; 562 int ret = 0; 563 564 EBUG_ON(path->level); 565 566 path->l[1].b = NULL; 567 568 if (bch2_btree_node_relock_notrace(trans, path, 0)) { 569 ck = (void *) path->l[0].b; 570 goto fill; 571 } 572 retry: 573 ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); 574 if (!ck) { 575 return bch2_btree_path_traverse_cached_slowpath(trans, path, flags); 576 } else { 577 enum six_lock_type lock_want = __btree_lock_want(path, 0); 578 579 ret = btree_node_lock(trans, path, (void *) ck, 0, 580 lock_want, _THIS_IP_); 581 EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)); 582 583 if (ret) 584 return ret; 585 586 if (ck->key.btree_id != path->btree_id || 587 !bpos_eq(ck->key.pos, path->pos)) { 588 six_unlock_type(&ck->c.lock, lock_want); 589 goto retry; 590 } 591 592 mark_btree_node_locked(trans, path, 0, 593 (enum btree_node_locked_type) lock_want); 594 } 595 596 path->l[0].lock_seq = six_lock_seq(&ck->c.lock); 597 path->l[0].b = (void *) ck; 598 fill: 599 if (!ck->valid) 600 return bch2_btree_path_traverse_cached_slowpath(trans, path, flags); 601 602 if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) 603 set_bit(BKEY_CACHED_ACCESSED, &ck->flags); 604 605 path->uptodate = BTREE_ITER_UPTODATE; 606 EBUG_ON(!ck->valid); 607 EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); 608 609 return ret; 610 } 611 612 static int btree_key_cache_flush_pos(struct btree_trans *trans, 613 struct bkey_cached_key key, 614 u64 journal_seq, 615 unsigned commit_flags, 616 bool evict) 617 { 618 struct bch_fs *c = trans->c; 619 struct journal *j = &c->journal; 620 struct btree_iter c_iter, b_iter; 621 struct bkey_cached *ck = NULL; 622 int ret; 623 624 bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos, 625 BTREE_ITER_SLOTS| 626 BTREE_ITER_INTENT| 627 BTREE_ITER_ALL_SNAPSHOTS); 628 bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos, 629 BTREE_ITER_CACHED| 630 BTREE_ITER_INTENT); 631 b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE; 632 633 ret = bch2_btree_iter_traverse(&c_iter); 634 if (ret) 635 goto out; 636 637 ck = (void *) btree_iter_path(trans, &c_iter)->l[0].b; 638 if (!ck) 639 goto out; 640 641 if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 642 if (evict) 643 goto evict; 644 goto out; 645 } 646 647 BUG_ON(!ck->valid); 648 649 if (journal_seq && ck->journal.seq != journal_seq) 650 goto out; 651 652 trans->journal_res.seq = ck->journal.seq; 653 654 /* 655 * If we're at the end of the journal, we really want to free up space 656 * in the journal right away - we don't want to pin that old journal 657 * sequence number with a new btree node write, we want to re-journal 658 * the update 659 */ 660 if (ck->journal.seq == journal_last_seq(j)) 661 commit_flags |= BCH_WATERMARK_reclaim; 662 663 if (ck->journal.seq != journal_last_seq(j) || 664 !test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)) 665 commit_flags |= BCH_TRANS_COMMIT_no_journal_res; 666 667 ret = bch2_btree_iter_traverse(&b_iter) ?: 668 bch2_trans_update(trans, &b_iter, ck->k, 669 BTREE_UPDATE_KEY_CACHE_RECLAIM| 670 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| 671 BTREE_TRIGGER_NORUN) ?: 672 bch2_trans_commit(trans, NULL, NULL, 673 BCH_TRANS_COMMIT_no_check_rw| 674 BCH_TRANS_COMMIT_no_enospc| 675 commit_flags); 676 677 bch2_fs_fatal_err_on(ret && 678 !bch2_err_matches(ret, BCH_ERR_transaction_restart) && 679 !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) && 680 !bch2_journal_error(j), c, 681 "flushing key cache: %s", bch2_err_str(ret)); 682 if (ret) 683 goto out; 684 685 bch2_journal_pin_drop(j, &ck->journal); 686 687 struct btree_path *path = btree_iter_path(trans, &c_iter); 688 BUG_ON(!btree_node_locked(path, 0)); 689 690 if (!evict) { 691 if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 692 clear_bit(BKEY_CACHED_DIRTY, &ck->flags); 693 atomic_long_dec(&c->btree_key_cache.nr_dirty); 694 } 695 } else { 696 struct btree_path *path2; 697 unsigned i; 698 evict: 699 trans_for_each_path(trans, path2, i) 700 if (path2 != path) 701 __bch2_btree_path_unlock(trans, path2); 702 703 bch2_btree_node_lock_write_nofail(trans, path, &ck->c); 704 705 if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 706 clear_bit(BKEY_CACHED_DIRTY, &ck->flags); 707 atomic_long_dec(&c->btree_key_cache.nr_dirty); 708 } 709 710 mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); 711 bkey_cached_evict(&c->btree_key_cache, ck); 712 bkey_cached_free_fast(&c->btree_key_cache, ck); 713 } 714 out: 715 bch2_trans_iter_exit(trans, &b_iter); 716 bch2_trans_iter_exit(trans, &c_iter); 717 return ret; 718 } 719 720 int bch2_btree_key_cache_journal_flush(struct journal *j, 721 struct journal_entry_pin *pin, u64 seq) 722 { 723 struct bch_fs *c = container_of(j, struct bch_fs, journal); 724 struct bkey_cached *ck = 725 container_of(pin, struct bkey_cached, journal); 726 struct bkey_cached_key key; 727 struct btree_trans *trans = bch2_trans_get(c); 728 int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); 729 int ret = 0; 730 731 btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read); 732 key = ck->key; 733 734 if (ck->journal.seq != seq || 735 !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 736 six_unlock_read(&ck->c.lock); 737 goto unlock; 738 } 739 740 if (ck->seq != seq) { 741 bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal, 742 bch2_btree_key_cache_journal_flush); 743 six_unlock_read(&ck->c.lock); 744 goto unlock; 745 } 746 six_unlock_read(&ck->c.lock); 747 748 ret = lockrestart_do(trans, 749 btree_key_cache_flush_pos(trans, key, seq, 750 BCH_TRANS_COMMIT_journal_reclaim, false)); 751 unlock: 752 srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); 753 754 bch2_trans_put(trans); 755 return ret; 756 } 757 758 bool bch2_btree_insert_key_cached(struct btree_trans *trans, 759 unsigned flags, 760 struct btree_insert_entry *insert_entry) 761 { 762 struct bch_fs *c = trans->c; 763 struct bkey_cached *ck = (void *) (trans->paths + insert_entry->path)->l[0].b; 764 struct bkey_i *insert = insert_entry->k; 765 bool kick_reclaim = false; 766 767 BUG_ON(insert->k.u64s > ck->u64s); 768 769 bkey_copy(ck->k, insert); 770 ck->valid = true; 771 772 if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 773 EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); 774 set_bit(BKEY_CACHED_DIRTY, &ck->flags); 775 atomic_long_inc(&c->btree_key_cache.nr_dirty); 776 777 if (bch2_nr_btree_keys_need_flush(c)) 778 kick_reclaim = true; 779 } 780 781 /* 782 * To minimize lock contention, we only add the journal pin here and 783 * defer pin updates to the flush callback via ->seq. Be careful not to 784 * update ->seq on nojournal commits because we don't want to update the 785 * pin to a seq that doesn't include journal updates on disk. Otherwise 786 * we risk losing the update after a crash. 787 * 788 * The only exception is if the pin is not active in the first place. We 789 * have to add the pin because journal reclaim drives key cache 790 * flushing. The flush callback will not proceed unless ->seq matches 791 * the latest pin, so make sure it starts with a consistent value. 792 */ 793 if (!(insert_entry->flags & BTREE_UPDATE_NOJOURNAL) || 794 !journal_pin_active(&ck->journal)) { 795 ck->seq = trans->journal_res.seq; 796 } 797 bch2_journal_pin_add(&c->journal, trans->journal_res.seq, 798 &ck->journal, bch2_btree_key_cache_journal_flush); 799 800 if (kick_reclaim) 801 journal_reclaim_kick(&c->journal); 802 return true; 803 } 804 805 void bch2_btree_key_cache_drop(struct btree_trans *trans, 806 struct btree_path *path) 807 { 808 struct bch_fs *c = trans->c; 809 struct bkey_cached *ck = (void *) path->l[0].b; 810 811 BUG_ON(!ck->valid); 812 813 /* 814 * We just did an update to the btree, bypassing the key cache: the key 815 * cache key is now stale and must be dropped, even if dirty: 816 */ 817 if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 818 clear_bit(BKEY_CACHED_DIRTY, &ck->flags); 819 atomic_long_dec(&c->btree_key_cache.nr_dirty); 820 bch2_journal_pin_drop(&c->journal, &ck->journal); 821 } 822 823 ck->valid = false; 824 } 825 826 static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, 827 struct shrink_control *sc) 828 { 829 struct bch_fs *c = shrink->private_data; 830 struct btree_key_cache *bc = &c->btree_key_cache; 831 struct bucket_table *tbl; 832 struct bkey_cached *ck, *t; 833 size_t scanned = 0, freed = 0, nr = sc->nr_to_scan; 834 unsigned start, flags; 835 int srcu_idx; 836 837 mutex_lock(&bc->lock); 838 srcu_idx = srcu_read_lock(&c->btree_trans_barrier); 839 flags = memalloc_nofs_save(); 840 841 /* 842 * Newest freed entries are at the end of the list - once we hit one 843 * that's too new to be freed, we can bail out: 844 */ 845 scanned += bc->nr_freed_nonpcpu; 846 847 list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) { 848 if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, 849 ck->btree_trans_barrier_seq)) 850 break; 851 852 list_del(&ck->list); 853 six_lock_exit(&ck->c.lock); 854 kmem_cache_free(bch2_key_cache, ck); 855 atomic_long_dec(&bc->nr_freed); 856 freed++; 857 bc->nr_freed_nonpcpu--; 858 } 859 860 if (scanned >= nr) 861 goto out; 862 863 scanned += bc->nr_freed_pcpu; 864 865 list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) { 866 if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, 867 ck->btree_trans_barrier_seq)) 868 break; 869 870 list_del(&ck->list); 871 six_lock_exit(&ck->c.lock); 872 kmem_cache_free(bch2_key_cache, ck); 873 atomic_long_dec(&bc->nr_freed); 874 freed++; 875 bc->nr_freed_pcpu--; 876 } 877 878 if (scanned >= nr) 879 goto out; 880 881 rcu_read_lock(); 882 tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); 883 if (bc->shrink_iter >= tbl->size) 884 bc->shrink_iter = 0; 885 start = bc->shrink_iter; 886 887 do { 888 struct rhash_head *pos, *next; 889 890 pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter)); 891 892 while (!rht_is_a_nulls(pos)) { 893 next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter); 894 ck = container_of(pos, struct bkey_cached, hash); 895 896 if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) 897 goto next; 898 899 if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) 900 clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); 901 else if (bkey_cached_lock_for_evict(ck)) { 902 bkey_cached_evict(bc, ck); 903 bkey_cached_free(bc, ck); 904 } 905 906 scanned++; 907 if (scanned >= nr) 908 break; 909 next: 910 pos = next; 911 } 912 913 bc->shrink_iter++; 914 if (bc->shrink_iter >= tbl->size) 915 bc->shrink_iter = 0; 916 } while (scanned < nr && bc->shrink_iter != start); 917 918 rcu_read_unlock(); 919 out: 920 memalloc_nofs_restore(flags); 921 srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); 922 mutex_unlock(&bc->lock); 923 924 return freed; 925 } 926 927 static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink, 928 struct shrink_control *sc) 929 { 930 struct bch_fs *c = shrink->private_data; 931 struct btree_key_cache *bc = &c->btree_key_cache; 932 long nr = atomic_long_read(&bc->nr_keys) - 933 atomic_long_read(&bc->nr_dirty); 934 935 return max(0L, nr); 936 } 937 938 void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) 939 { 940 struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); 941 struct bucket_table *tbl; 942 struct bkey_cached *ck, *n; 943 struct rhash_head *pos; 944 LIST_HEAD(items); 945 unsigned i; 946 #ifdef __KERNEL__ 947 int cpu; 948 #endif 949 950 shrinker_free(bc->shrink); 951 952 mutex_lock(&bc->lock); 953 954 /* 955 * The loop is needed to guard against racing with rehash: 956 */ 957 while (atomic_long_read(&bc->nr_keys)) { 958 rcu_read_lock(); 959 tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); 960 if (tbl) 961 for (i = 0; i < tbl->size; i++) 962 rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { 963 bkey_cached_evict(bc, ck); 964 list_add(&ck->list, &items); 965 } 966 rcu_read_unlock(); 967 } 968 969 #ifdef __KERNEL__ 970 for_each_possible_cpu(cpu) { 971 struct btree_key_cache_freelist *f = 972 per_cpu_ptr(bc->pcpu_freed, cpu); 973 974 for (i = 0; i < f->nr; i++) { 975 ck = f->objs[i]; 976 list_add(&ck->list, &items); 977 } 978 } 979 #endif 980 981 BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu); 982 BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu); 983 984 list_splice(&bc->freed_pcpu, &items); 985 list_splice(&bc->freed_nonpcpu, &items); 986 987 mutex_unlock(&bc->lock); 988 989 list_for_each_entry_safe(ck, n, &items, list) { 990 cond_resched(); 991 992 list_del(&ck->list); 993 kfree(ck->k); 994 six_lock_exit(&ck->c.lock); 995 kmem_cache_free(bch2_key_cache, ck); 996 } 997 998 if (atomic_long_read(&bc->nr_dirty) && 999 !bch2_journal_error(&c->journal) && 1000 test_bit(BCH_FS_was_rw, &c->flags)) 1001 panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n", 1002 atomic_long_read(&bc->nr_dirty)); 1003 1004 if (atomic_long_read(&bc->nr_keys)) 1005 panic("btree key cache shutdown error: nr_keys nonzero (%li)\n", 1006 atomic_long_read(&bc->nr_keys)); 1007 1008 if (bc->table_init_done) 1009 rhashtable_destroy(&bc->table); 1010 1011 free_percpu(bc->pcpu_freed); 1012 } 1013 1014 void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) 1015 { 1016 mutex_init(&c->lock); 1017 INIT_LIST_HEAD(&c->freed_pcpu); 1018 INIT_LIST_HEAD(&c->freed_nonpcpu); 1019 } 1020 1021 int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) 1022 { 1023 struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); 1024 struct shrinker *shrink; 1025 1026 #ifdef __KERNEL__ 1027 bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist); 1028 if (!bc->pcpu_freed) 1029 return -BCH_ERR_ENOMEM_fs_btree_cache_init; 1030 #endif 1031 1032 if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params)) 1033 return -BCH_ERR_ENOMEM_fs_btree_cache_init; 1034 1035 bc->table_init_done = true; 1036 1037 shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name); 1038 if (!shrink) 1039 return -BCH_ERR_ENOMEM_fs_btree_cache_init; 1040 bc->shrink = shrink; 1041 shrink->seeks = 0; 1042 shrink->count_objects = bch2_btree_key_cache_count; 1043 shrink->scan_objects = bch2_btree_key_cache_scan; 1044 shrink->private_data = c; 1045 shrinker_register(shrink); 1046 return 0; 1047 } 1048 1049 void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) 1050 { 1051 prt_printf(out, "nr_freed:\t%lu", atomic_long_read(&c->nr_freed)); 1052 prt_newline(out); 1053 prt_printf(out, "nr_keys:\t%lu", atomic_long_read(&c->nr_keys)); 1054 prt_newline(out); 1055 prt_printf(out, "nr_dirty:\t%lu", atomic_long_read(&c->nr_dirty)); 1056 prt_newline(out); 1057 } 1058 1059 void bch2_btree_key_cache_exit(void) 1060 { 1061 kmem_cache_destroy(bch2_key_cache); 1062 } 1063 1064 int __init bch2_btree_key_cache_init(void) 1065 { 1066 bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT); 1067 if (!bch2_key_cache) 1068 return -ENOMEM; 1069 1070 return 0; 1071 } 1072