// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "trace.h"

#include <linux/sched/mm.h>

static inline bool btree_uses_pcpu_readers(enum btree_id id)
{
	return id == BTREE_ID_subvolumes;
}

static struct kmem_cache *bch2_key_cache;

static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
				       const void *obj)
{
	const struct bkey_cached *ck = obj;
	const struct bkey_cached_key *key = arg->key;

	return ck->key.btree_id != key->btree_id ||
		!bpos_eq(ck->key.pos, key->pos);
}

static const struct rhashtable_params bch2_btree_key_cache_params = {
	.head_offset	= offsetof(struct bkey_cached, hash),
	.key_offset	= offsetof(struct bkey_cached, key),
	.key_len	= sizeof(struct bkey_cached_key),
	.obj_cmpfn	= bch2_btree_key_cache_cmp_fn,
};

__flatten
inline struct bkey_cached *
bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
{
	struct bkey_cached_key key = {
		.btree_id	= btree_id,
		.pos		= pos,
	};

	return rhashtable_lookup_fast(&c->btree_key_cache.table, &key,
				      bch2_btree_key_cache_params);
}

static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
{
	if (!six_trylock_intent(&ck->c.lock))
		return false;

	if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		six_unlock_intent(&ck->c.lock);
		return false;
	}

	if (!six_trylock_write(&ck->c.lock)) {
		six_unlock_intent(&ck->c.lock);
		return false;
	}

	return true;
}

static void bkey_cached_evict(struct btree_key_cache *c,
			      struct bkey_cached *ck)
{
	BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
				      bch2_btree_key_cache_params));
	memset(&ck->key, ~0, sizeof(ck->key));

	atomic_long_dec(&c->nr_keys);
}

static void bkey_cached_free(struct btree_key_cache *bc,
			     struct bkey_cached *ck)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);

	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));

	ck->btree_trans_barrier_seq =
		start_poll_synchronize_srcu(&c->btree_trans_barrier);

	if (ck->c.lock.readers) {
		list_move_tail(&ck->list, &bc->freed_pcpu);
		bc->nr_freed_pcpu++;
	} else {
		list_move_tail(&ck->list, &bc->freed_nonpcpu);
		bc->nr_freed_nonpcpu++;
	}
	atomic_long_inc(&bc->nr_freed);

	kfree(ck->k);
	ck->k = NULL;
	ck->u64s = 0;

	six_unlock_write(&ck->c.lock);
	six_unlock_intent(&ck->c.lock);
}

#ifdef __KERNEL__
static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
						   struct bkey_cached *ck)
{
	struct bkey_cached *pos;

	bc->nr_freed_nonpcpu++;

	list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
		if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
				 pos->btree_trans_barrier_seq)) {
			list_move(&ck->list, &pos->list);
			return;
		}
	}

	list_move(&ck->list, &bc->freed_nonpcpu);
}
#endif

static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
					 struct bkey_cached *ck)
{
	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));

	if (!ck->c.lock.readers) {
#ifdef __KERNEL__
		struct btree_key_cache_freelist *f;
		bool freed = false;

		preempt_disable();
		f = this_cpu_ptr(bc->pcpu_freed);

		if (f->nr < ARRAY_SIZE(f->objs)) {
			f->objs[f->nr++] = ck;
			freed = true;
		}
		preempt_enable();

		if (!freed) {
			mutex_lock(&bc->lock);
			preempt_disable();
			f = this_cpu_ptr(bc->pcpu_freed);

			while (f->nr > ARRAY_SIZE(f->objs) / 2) {
				struct bkey_cached *ck2 = f->objs[--f->nr];

				__bkey_cached_move_to_freelist_ordered(bc, ck2);
			}
			preempt_enable();

			__bkey_cached_move_to_freelist_ordered(bc, ck);
			mutex_unlock(&bc->lock);
		}
#else
		mutex_lock(&bc->lock);
		list_move_tail(&ck->list, &bc->freed_nonpcpu);
		bc->nr_freed_nonpcpu++;
		mutex_unlock(&bc->lock);
#endif
	} else {
		mutex_lock(&bc->lock);
		list_move_tail(&ck->list, &bc->freed_pcpu);
		mutex_unlock(&bc->lock);
	}
}

static void bkey_cached_free_fast(struct btree_key_cache *bc,
				  struct bkey_cached *ck)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);

	ck->btree_trans_barrier_seq =
		start_poll_synchronize_srcu(&c->btree_trans_barrier);

	list_del_init(&ck->list);
	atomic_long_inc(&bc->nr_freed);

	kfree(ck->k);
	ck->k = NULL;
	ck->u64s = 0;

	bkey_cached_move_to_freelist(bc, ck);

	six_unlock_write(&ck->c.lock);
	six_unlock_intent(&ck->c.lock);
}

static struct bkey_cached *
bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
		  bool *was_new)
{
	struct bch_fs *c = trans->c;
	struct btree_key_cache *bc = &c->btree_key_cache;
	struct bkey_cached *ck = NULL;
	bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
	int ret;

	if (!pcpu_readers) {
#ifdef __KERNEL__
		struct btree_key_cache_freelist *f;

		preempt_disable();
		f = this_cpu_ptr(bc->pcpu_freed);
		if (f->nr)
			ck = f->objs[--f->nr];
		preempt_enable();

		if (!ck) {
			mutex_lock(&bc->lock);
			preempt_disable();
			f = this_cpu_ptr(bc->pcpu_freed);

			while (!list_empty(&bc->freed_nonpcpu) &&
			       f->nr < ARRAY_SIZE(f->objs) / 2) {
				ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
				list_del_init(&ck->list);
				bc->nr_freed_nonpcpu--;
				f->objs[f->nr++] = ck;
			}

			ck = f->nr ? f->objs[--f->nr] : NULL;
			preempt_enable();
			mutex_unlock(&bc->lock);
		}
#else
		mutex_lock(&bc->lock);
		if (!list_empty(&bc->freed_nonpcpu)) {
			ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
			list_del_init(&ck->list);
			bc->nr_freed_nonpcpu--;
		}
		mutex_unlock(&bc->lock);
#endif
	} else {
		mutex_lock(&bc->lock);
		if (!list_empty(&bc->freed_pcpu)) {
			ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
			list_del_init(&ck->list);
		}
		mutex_unlock(&bc->lock);
	}

	if (ck) {
		ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_);
		if (unlikely(ret)) {
			bkey_cached_move_to_freelist(bc, ck);
			return ERR_PTR(ret);
		}

		path->l[0].b = (void *) ck;
		path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);

		ret = bch2_btree_node_lock_write(trans, path, &ck->c);
		if (unlikely(ret)) {
			btree_node_unlock(trans, path, 0);
			bkey_cached_move_to_freelist(bc, ck);
			return ERR_PTR(ret);
		}

		return ck;
	}

	ck = allocate_dropping_locks(trans, ret,
			kmem_cache_zalloc(bch2_key_cache, _gfp));
	if (ret) {
		kmem_cache_free(bch2_key_cache, ck);
		return ERR_PTR(ret);
	}

	if (!ck)
		return NULL;

	INIT_LIST_HEAD(&ck->list);
	bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);

	ck->c.cached = true;
	BUG_ON(!six_trylock_intent(&ck->c.lock));
	BUG_ON(!six_trylock_write(&ck->c.lock));
	*was_new = true;
	return ck;
}

static struct bkey_cached *
bkey_cached_reuse(struct btree_key_cache *c)
{
	struct bucket_table *tbl;
	struct rhash_head *pos;
	struct bkey_cached *ck;
	unsigned i;

	mutex_lock(&c->lock);
	rcu_read_lock();
	tbl = rht_dereference_rcu(c->table.tbl, &c->table);
	for (i = 0; i < tbl->size; i++)
		rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
			if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
			    bkey_cached_lock_for_evict(ck)) {
				bkey_cached_evict(c, ck);
				goto out;
			}
		}
	ck = NULL;
out:
	rcu_read_unlock();
	mutex_unlock(&c->lock);
	return ck;
}

static struct bkey_cached *
btree_key_cache_create(struct btree_trans *trans, struct btree_path *path)
{
	struct bch_fs *c = trans->c;
	struct btree_key_cache *bc = &c->btree_key_cache;
	struct bkey_cached *ck;
	bool was_new = false;

	ck = bkey_cached_alloc(trans, path, &was_new);
	if (IS_ERR(ck))
		return ck;

	if (unlikely(!ck)) {
		ck = bkey_cached_reuse(bc);
		if (unlikely(!ck)) {
			bch_err(c, "error allocating memory for key cache item, btree %s",
				bch2_btree_id_str(path->btree_id));
			return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create);
		}

		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
	}

	ck->c.level		= 0;
	ck->c.btree_id		= path->btree_id;
	ck->key.btree_id	= path->btree_id;
	ck->key.pos		= path->pos;
	ck->valid		= false;
	ck->flags		= 1U << BKEY_CACHED_ACCESSED;

	if (unlikely(rhashtable_lookup_insert_fast(&bc->table,
					  &ck->hash,
					  bch2_btree_key_cache_params))) {
		/* We raced with another fill: */

		if (likely(was_new)) {
			six_unlock_write(&ck->c.lock);
			six_unlock_intent(&ck->c.lock);
			kfree(ck);
		} else {
			bkey_cached_free_fast(bc, ck);
		}

		mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
		return NULL;
	}

	atomic_long_inc(&bc->nr_keys);

	six_unlock_write(&ck->c.lock);

	return ck;
}

static int btree_key_cache_fill(struct btree_trans *trans,
				struct btree_path *ck_path,
				struct bkey_cached *ck)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	unsigned new_u64s = 0;
	struct bkey_i *new_k = NULL;
	int ret;

	bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos,
			     BTREE_ITER_KEY_CACHE_FILL|
			     BTREE_ITER_CACHED_NOFILL);
	iter.flags &= ~BTREE_ITER_WITH_JOURNAL;
	k = bch2_btree_iter_peek_slot(&iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (!bch2_btree_node_relock(trans, ck_path, 0)) {
		trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
		ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
		goto err;
	}

	/*
	 * bch2_varint_decode can read past the end of the buffer by at
	 * most 7 bytes (it won't be used):
	 */
	new_u64s = k.k->u64s + 1;

	/*
	 * Allocate some extra space so that the transaction commit path is less
	 * likely to have to reallocate, since that requires a transaction
	 * restart:
	 */
	new_u64s = min(256U, (new_u64s * 3) / 2);

	if (new_u64s > ck->u64s) {
		new_u64s = roundup_pow_of_two(new_u64s);
		new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
		if (!new_k) {
			bch2_trans_unlock(trans);

			new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
			if (!new_k) {
				bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
					bch2_btree_id_str(ck->key.btree_id), new_u64s);
				ret = -BCH_ERR_ENOMEM_btree_key_cache_fill;
				goto err;
			}

			if (!bch2_btree_node_relock(trans, ck_path, 0)) {
				kfree(new_k);
				trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
				ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
				goto err;
			}

			ret = bch2_trans_relock(trans);
			if (ret) {
				kfree(new_k);
				goto err;
			}
		}
	}

	ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c);
	if (ret) {
		kfree(new_k);
		goto err;
	}

	if (new_k) {
		kfree(ck->k);
		ck->u64s = new_u64s;
		ck->k = new_k;
	}

	bkey_reassemble(ck->k, k);
	ck->valid = true;
	bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b);

	/* We're not likely to need this iterator again: */
	set_btree_iter_dontneed(&iter);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static noinline int
bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path,
					 unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck;
	int ret = 0;

	BUG_ON(path->level);

	path->l[1].b = NULL;

	if (bch2_btree_node_relock_notrace(trans, path, 0)) {
		ck = (void *) path->l[0].b;
		goto fill;
	}
retry:
	ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
	if (!ck) {
		ck = btree_key_cache_create(trans, path);
		ret = PTR_ERR_OR_ZERO(ck);
		if (ret)
			goto err;
		if (!ck)
			goto retry;

		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
		path->locks_want = 1;
	} else {
		enum six_lock_type lock_want = __btree_lock_want(path, 0);

		ret = btree_node_lock(trans, path, (void *) ck, 0,
				      lock_want, _THIS_IP_);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto err;

		BUG_ON(ret);

		if (ck->key.btree_id != path->btree_id ||
		    !bpos_eq(ck->key.pos, path->pos)) {
			six_unlock_type(&ck->c.lock, lock_want);
			goto retry;
		}

		mark_btree_node_locked(trans, path, 0,
				       (enum btree_node_locked_type) lock_want);
	}

	path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
	path->l[0].b = (void *) ck;
fill:
	path->uptodate = BTREE_ITER_UPTODATE;

	if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) {
		/*
		 * Using the underscore version because we haven't set
		 * path->uptodate yet:
		 */
		if (!path->locks_want &&
		    !__bch2_btree_path_upgrade(trans, path, 1, NULL)) {
			trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_);
			ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade);
			goto err;
		}

		ret = btree_key_cache_fill(trans, path, ck);
		if (ret)
			goto err;

		ret = bch2_btree_path_relock(trans, path, _THIS_IP_);
		if (ret)
			goto err;

		path->uptodate = BTREE_ITER_UPTODATE;
	}

	if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
		set_bit(BKEY_CACHED_ACCESSED, &ck->flags);

	BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
	BUG_ON(path->uptodate);

	return ret;
err:
	path->uptodate = BTREE_ITER_NEED_TRAVERSE;
	if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
		btree_node_unlock(trans, path, 0);
		path->l[0].b = ERR_PTR(ret);
	}
	return ret;
}

int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
				    unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck;
	int ret = 0;

	EBUG_ON(path->level);

	path->l[1].b = NULL;

	if (bch2_btree_node_relock_notrace(trans, path, 0)) {
		ck = (void *) path->l[0].b;
		goto fill;
	}
retry:
	ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
	if (!ck) {
		return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
	} else {
		enum six_lock_type lock_want = __btree_lock_want(path, 0);

		ret = btree_node_lock(trans, path, (void *) ck, 0,
				      lock_want, _THIS_IP_);
		EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart));

		if (ret)
			return ret;

		if (ck->key.btree_id != path->btree_id ||
		    !bpos_eq(ck->key.pos, path->pos)) {
			six_unlock_type(&ck->c.lock, lock_want);
			goto retry;
		}

		mark_btree_node_locked(trans, path, 0,
				       (enum btree_node_locked_type) lock_want);
	}

	path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
	path->l[0].b = (void *) ck;
fill:
	if (!ck->valid)
		return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);

	if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
		set_bit(BKEY_CACHED_ACCESSED, &ck->flags);

	path->uptodate = BTREE_ITER_UPTODATE;
	EBUG_ON(!ck->valid);
	EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));

	return ret;
}

static int btree_key_cache_flush_pos(struct btree_trans *trans,
				     struct bkey_cached_key key,
				     u64 journal_seq,
				     unsigned commit_flags,
				     bool evict)
{
	struct bch_fs *c = trans->c;
	struct journal *j = &c->journal;
	struct btree_iter c_iter, b_iter;
	struct bkey_cached *ck = NULL;
	int ret;

	bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
			     BTREE_ITER_SLOTS|
			     BTREE_ITER_INTENT|
			     BTREE_ITER_ALL_SNAPSHOTS);
	bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
			     BTREE_ITER_CACHED|
			     BTREE_ITER_INTENT);
	b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE;

	ret = bch2_btree_iter_traverse(&c_iter);
	if (ret)
		goto out;

	ck = (void *) btree_iter_path(trans, &c_iter)->l[0].b;
	if (!ck)
		goto out;

	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		if (evict)
			goto evict;
		goto out;
	}

	BUG_ON(!ck->valid);

	if (journal_seq && ck->journal.seq != journal_seq)
		goto out;

	trans->journal_res.seq = ck->journal.seq;

	/*
	 * If we're at the end of the journal, we really want to free up space
	 * in the journal right away - we don't want to pin that old journal
	 * sequence number with a new btree node write, we want to re-journal
	 * the update
	 */
	if (ck->journal.seq == journal_last_seq(j))
		commit_flags |= BCH_WATERMARK_reclaim;

	if (ck->journal.seq != journal_last_seq(j) ||
	    j->watermark == BCH_WATERMARK_stripe)
		commit_flags |= BCH_TRANS_COMMIT_no_journal_res;

	ret = bch2_btree_iter_traverse(&b_iter) ?:
		bch2_trans_update(trans, &b_iter, ck->k,
				  BTREE_UPDATE_KEY_CACHE_RECLAIM|
				  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
				  BTREE_TRIGGER_NORUN) ?:
		bch2_trans_commit(trans, NULL, NULL,
				  BCH_TRANS_COMMIT_no_check_rw|
				  BCH_TRANS_COMMIT_no_enospc|
				  commit_flags);

	bch2_fs_fatal_err_on(ret &&
			     !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
			     !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
			     !bch2_journal_error(j), c,
			     "flushing key cache: %s", bch2_err_str(ret));
	if (ret)
		goto out;

	bch2_journal_pin_drop(j, &ck->journal);

	struct btree_path *path = btree_iter_path(trans, &c_iter);
	BUG_ON(!btree_node_locked(path, 0));

	if (!evict) {
		if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
			clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
			atomic_long_dec(&c->btree_key_cache.nr_dirty);
		}
	} else {
		struct btree_path *path2;
		unsigned i;
evict:
		trans_for_each_path(trans, path2, i)
			if (path2 != path)
				__bch2_btree_path_unlock(trans, path2);

		bch2_btree_node_lock_write_nofail(trans, path, &ck->c);

		if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
			clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
			atomic_long_dec(&c->btree_key_cache.nr_dirty);
		}

		mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
		bkey_cached_evict(&c->btree_key_cache, ck);
		bkey_cached_free_fast(&c->btree_key_cache, ck);
	}
out:
	bch2_trans_iter_exit(trans, &b_iter);
	bch2_trans_iter_exit(trans, &c_iter);
	return ret;
}

int bch2_btree_key_cache_journal_flush(struct journal *j,
				struct journal_entry_pin *pin, u64 seq)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bkey_cached *ck =
		container_of(pin, struct bkey_cached, journal);
	struct bkey_cached_key key;
	struct btree_trans *trans = bch2_trans_get(c);
	int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
	int ret = 0;

	btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read);
	key = ck->key;

	if (ck->journal.seq != seq ||
	    !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		six_unlock_read(&ck->c.lock);
		goto unlock;
	}

	if (ck->seq != seq) {
		bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal,
					bch2_btree_key_cache_journal_flush);
		six_unlock_read(&ck->c.lock);
		goto unlock;
	}
	six_unlock_read(&ck->c.lock);

	ret = lockrestart_do(trans,
		btree_key_cache_flush_pos(trans, key, seq,
				BCH_TRANS_COMMIT_journal_reclaim, false));
unlock:
	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);

	bch2_trans_put(trans);
	return ret;
}

bool bch2_btree_insert_key_cached(struct btree_trans *trans,
				  unsigned flags,
				  struct btree_insert_entry *insert_entry)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck = (void *) (trans->paths + insert_entry->path)->l[0].b;
	struct bkey_i *insert = insert_entry->k;
	bool kick_reclaim = false;

	BUG_ON(insert->k.u64s > ck->u64s);

	bkey_copy(ck->k, insert);
	ck->valid = true;

	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
		set_bit(BKEY_CACHED_DIRTY, &ck->flags);
		atomic_long_inc(&c->btree_key_cache.nr_dirty);

		if (bch2_nr_btree_keys_need_flush(c))
			kick_reclaim = true;
	}

	/*
	 * To minimize lock contention, we only add the journal pin here and
	 * defer pin updates to the flush callback via ->seq. Be careful not to
	 * update ->seq on nojournal commits because we don't want to update the
	 * pin to a seq that doesn't include journal updates on disk. Otherwise
	 * we risk losing the update after a crash.
	 *
	 * The only exception is if the pin is not active in the first place. We
	 * have to add the pin because journal reclaim drives key cache
	 * flushing. The flush callback will not proceed unless ->seq matches
	 * the latest pin, so make sure it starts with a consistent value.
	 */
	if (!(insert_entry->flags & BTREE_UPDATE_NOJOURNAL) ||
	    !journal_pin_active(&ck->journal)) {
		ck->seq = trans->journal_res.seq;
	}
	bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
			     &ck->journal, bch2_btree_key_cache_journal_flush);

	if (kick_reclaim)
		journal_reclaim_kick(&c->journal);
	return true;
}

void bch2_btree_key_cache_drop(struct btree_trans *trans,
			       struct btree_path *path)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck = (void *) path->l[0].b;

	BUG_ON(!ck->valid);

	/*
	 * We just did an update to the btree, bypassing the key cache: the key
	 * cache key is now stale and must be dropped, even if dirty:
	 */
	if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
		atomic_long_dec(&c->btree_key_cache.nr_dirty);
		bch2_journal_pin_drop(&c->journal, &ck->journal);
	}

	ck->valid = false;
}

static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
					       struct shrink_control *sc)
{
	struct bch_fs *c = shrink->private_data;
	struct btree_key_cache *bc = &c->btree_key_cache;
	struct bucket_table *tbl;
	struct bkey_cached *ck, *t;
	size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
	unsigned start, flags;
	int srcu_idx;

	mutex_lock(&bc->lock);
	srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
	flags = memalloc_nofs_save();

	/*
	 * Newest freed entries are at the end of the list - once we hit one
	 * that's too new to be freed, we can bail out:
	 */
	scanned += bc->nr_freed_nonpcpu;

	list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
		if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
						 ck->btree_trans_barrier_seq))
			break;

		list_del(&ck->list);
		six_lock_exit(&ck->c.lock);
		kmem_cache_free(bch2_key_cache, ck);
		atomic_long_dec(&bc->nr_freed);
		freed++;
		bc->nr_freed_nonpcpu--;
	}

	if (scanned >= nr)
		goto out;

	scanned += bc->nr_freed_pcpu;

	list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
		if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
						 ck->btree_trans_barrier_seq))
			break;

		list_del(&ck->list);
		six_lock_exit(&ck->c.lock);
		kmem_cache_free(bch2_key_cache, ck);
		atomic_long_dec(&bc->nr_freed);
		freed++;
		bc->nr_freed_pcpu--;
	}

	if (scanned >= nr)
		goto out;

	rcu_read_lock();
	tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
	if (bc->shrink_iter >= tbl->size)
		bc->shrink_iter = 0;
	start = bc->shrink_iter;

	do {
		struct rhash_head *pos, *next;

		pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter));

		while (!rht_is_a_nulls(pos)) {
			next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
			ck = container_of(pos, struct bkey_cached, hash);

			if (test_bit(BKEY_CACHED_DIRTY, &ck->flags))
				goto next;

			if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
				clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
			else if (bkey_cached_lock_for_evict(ck)) {
				bkey_cached_evict(bc, ck);
				bkey_cached_free(bc, ck);
			}

			scanned++;
			if (scanned >= nr)
				break;
next:
			pos = next;
		}

		bc->shrink_iter++;
		if (bc->shrink_iter >= tbl->size)
			bc->shrink_iter = 0;
	} while (scanned < nr && bc->shrink_iter != start);

	rcu_read_unlock();
out:
	memalloc_nofs_restore(flags);
	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
	mutex_unlock(&bc->lock);

	return freed;
}

static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
						struct shrink_control *sc)
{
	struct bch_fs *c = shrink->private_data;
	struct btree_key_cache *bc = &c->btree_key_cache;
	long nr = atomic_long_read(&bc->nr_keys) -
		atomic_long_read(&bc->nr_dirty);

	return max(0L, nr);
}

void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
	struct bucket_table *tbl;
	struct bkey_cached *ck, *n;
	struct rhash_head *pos;
	LIST_HEAD(items);
	unsigned i;
#ifdef __KERNEL__
	int cpu;
#endif

	shrinker_free(bc->shrink);

	mutex_lock(&bc->lock);

	/*
	 * The loop is needed to guard against racing with rehash:
	 */
	while (atomic_long_read(&bc->nr_keys)) {
		rcu_read_lock();
		tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
		if (tbl)
			for (i = 0; i < tbl->size; i++)
				rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
					bkey_cached_evict(bc, ck);
					list_add(&ck->list, &items);
				}
		rcu_read_unlock();
	}

#ifdef __KERNEL__
	for_each_possible_cpu(cpu) {
		struct btree_key_cache_freelist *f =
			per_cpu_ptr(bc->pcpu_freed, cpu);

		for (i = 0; i < f->nr; i++) {
			ck = f->objs[i];
			list_add(&ck->list, &items);
		}
	}
#endif

	BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu);
	BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu);

	list_splice(&bc->freed_pcpu, &items);
	list_splice(&bc->freed_nonpcpu, &items);

	mutex_unlock(&bc->lock);

	list_for_each_entry_safe(ck, n, &items, list) {
		cond_resched();

		list_del(&ck->list);
		kfree(ck->k);
		six_lock_exit(&ck->c.lock);
		kmem_cache_free(bch2_key_cache, ck);
	}

	if (atomic_long_read(&bc->nr_dirty) &&
	    !bch2_journal_error(&c->journal) &&
	    test_bit(BCH_FS_was_rw, &c->flags))
		panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n",
		      atomic_long_read(&bc->nr_dirty));

	if (atomic_long_read(&bc->nr_keys))
		panic("btree key cache shutdown error: nr_keys nonzero (%li)\n",
		      atomic_long_read(&bc->nr_keys));

	if (bc->table_init_done)
		rhashtable_destroy(&bc->table);

	free_percpu(bc->pcpu_freed);
}

void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
{
	mutex_init(&c->lock);
	INIT_LIST_HEAD(&c->freed_pcpu);
	INIT_LIST_HEAD(&c->freed_nonpcpu);
}

int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
	struct shrinker *shrink;

#ifdef __KERNEL__
	bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
	if (!bc->pcpu_freed)
		return -BCH_ERR_ENOMEM_fs_btree_cache_init;
#endif

	if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
		return -BCH_ERR_ENOMEM_fs_btree_cache_init;

	bc->table_init_done = true;

	shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name);
	if (!shrink)
		return -BCH_ERR_ENOMEM_fs_btree_cache_init;
	bc->shrink = shrink;
	shrink->seeks = 0;
	shrink->count_objects	= bch2_btree_key_cache_count;
	shrink->scan_objects	= bch2_btree_key_cache_scan;
	shrink->private_data	= c;
	shrinker_register(shrink);
	return 0;
}

void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
{
	prt_printf(out, "nr_freed:\t%lu", atomic_long_read(&c->nr_freed));
	prt_newline(out);
	prt_printf(out, "nr_keys:\t%lu", atomic_long_read(&c->nr_keys));
	prt_newline(out);
	prt_printf(out, "nr_dirty:\t%lu", atomic_long_read(&c->nr_dirty));
	prt_newline(out);
}

void bch2_btree_key_cache_exit(void)
{
	kmem_cache_destroy(bch2_key_cache);
}

int __init bch2_btree_key_cache_init(void)
{
	bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT);
	if (!bch2_key_cache)
		return -ENOMEM;

	return 0;
}