// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "trace.h"

#include <linux/sched/mm.h>

static inline bool btree_uses_pcpu_readers(enum btree_id id)
{
	return id == BTREE_ID_subvolumes;
}

static struct kmem_cache *bch2_key_cache;

static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
				       const void *obj)
{
	const struct bkey_cached *ck = obj;
	const struct bkey_cached_key *key = arg->key;

	return ck->key.btree_id != key->btree_id ||
		!bpos_eq(ck->key.pos, key->pos);
}

static const struct rhashtable_params bch2_btree_key_cache_params = {
	.head_offset	= offsetof(struct bkey_cached, hash),
	.key_offset	= offsetof(struct bkey_cached, key),
	.key_len	= sizeof(struct bkey_cached_key),
	.obj_cmpfn	= bch2_btree_key_cache_cmp_fn,
};

__flatten
inline struct bkey_cached *
bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
{
	struct bkey_cached_key key = {
		.btree_id	= btree_id,
		.pos		= pos,
	};

	return rhashtable_lookup_fast(&c->btree_key_cache.table, &key,
				      bch2_btree_key_cache_params);
}

static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
{
	if (!six_trylock_intent(&ck->c.lock))
		return false;

	if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		six_unlock_intent(&ck->c.lock);
		return false;
	}

	if (!six_trylock_write(&ck->c.lock)) {
		six_unlock_intent(&ck->c.lock);
		return false;
	}

	return true;
}

static void bkey_cached_evict(struct btree_key_cache *c,
			      struct bkey_cached *ck)
{
	BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
				      bch2_btree_key_cache_params));
	memset(&ck->key, ~0, sizeof(ck->key));

	atomic_long_dec(&c->nr_keys);
}

static void bkey_cached_free(struct btree_key_cache *bc,
			     struct bkey_cached *ck)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);

	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));

	ck->btree_trans_barrier_seq =
		start_poll_synchronize_srcu(&c->btree_trans_barrier);

	if (ck->c.lock.readers) {
		list_move_tail(&ck->list, &bc->freed_pcpu);
		bc->nr_freed_pcpu++;
	} else {
		list_move_tail(&ck->list, &bc->freed_nonpcpu);
		bc->nr_freed_nonpcpu++;
	}
	atomic_long_inc(&bc->nr_freed);

	kfree(ck->k);
	ck->k		= NULL;
	ck->u64s	= 0;

	six_unlock_write(&ck->c.lock);
	six_unlock_intent(&ck->c.lock);
}

#ifdef __KERNEL__
static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
						   struct bkey_cached *ck)
{
	struct bkey_cached *pos;

	bc->nr_freed_nonpcpu++;

	list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
		if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
				 pos->btree_trans_barrier_seq)) {
			list_move(&ck->list, &pos->list);
			return;
		}
	}

	list_move(&ck->list, &bc->freed_nonpcpu);
}
#endif

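/*
 * Return a freed bkey_cached to the appropriate freelist: entries without
 * per-cpu readers go onto this CPU's freelist first, spilling into the
 * SRCU-ordered global freed_nonpcpu list when it fills up; entries with
 * per-cpu readers always go onto the global freed_pcpu list.
 */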
static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
					 struct bkey_cached *ck)
{
	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));

	if (!ck->c.lock.readers) {
#ifdef __KERNEL__
		struct btree_key_cache_freelist *f;
		bool freed = false;

		preempt_disable();
		f = this_cpu_ptr(bc->pcpu_freed);

		if (f->nr < ARRAY_SIZE(f->objs)) {
			f->objs[f->nr++] = ck;
			freed = true;
		}
		preempt_enable();

		if (!freed) {
			mutex_lock(&bc->lock);
			preempt_disable();
			f = this_cpu_ptr(bc->pcpu_freed);

			while (f->nr > ARRAY_SIZE(f->objs) / 2) {
				struct bkey_cached *ck2 = f->objs[--f->nr];

				__bkey_cached_move_to_freelist_ordered(bc, ck2);
			}
			preempt_enable();

			__bkey_cached_move_to_freelist_ordered(bc, ck);
			mutex_unlock(&bc->lock);
		}
#else
		mutex_lock(&bc->lock);
		list_move_tail(&ck->list, &bc->freed_nonpcpu);
		bc->nr_freed_nonpcpu++;
		mutex_unlock(&bc->lock);
#endif
	} else {
		mutex_lock(&bc->lock);
		list_move_tail(&ck->list, &bc->freed_pcpu);
		mutex_unlock(&bc->lock);
	}
}

static void bkey_cached_free_fast(struct btree_key_cache *bc,
				  struct bkey_cached *ck)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);

	ck->btree_trans_barrier_seq =
		start_poll_synchronize_srcu(&c->btree_trans_barrier);

	list_del_init(&ck->list);
	atomic_long_inc(&bc->nr_freed);

	kfree(ck->k);
	ck->k		= NULL;
	ck->u64s	= 0;

	bkey_cached_move_to_freelist(bc, ck);

	six_unlock_write(&ck->c.lock);
	six_unlock_intent(&ck->c.lock);
}

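/*
 * Allocate a bkey_cached for @path, preferring entries from the freelists
 * (per-cpu first, then the global lists); new entries come from the slab,
 * dropping btree locks if the allocation has to block. Returned entries hold
 * both the intent and write locks. Returns NULL if nothing could be
 * allocated - the caller then falls back to bkey_cached_reuse().
 */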
static struct bkey_cached *
bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
		  bool *was_new)
{
	struct bch_fs *c = trans->c;
	struct btree_key_cache *bc = &c->btree_key_cache;
	struct bkey_cached *ck = NULL;
	bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
	int ret;

	if (!pcpu_readers) {
#ifdef __KERNEL__
		struct btree_key_cache_freelist *f;

		preempt_disable();
		f = this_cpu_ptr(bc->pcpu_freed);
		if (f->nr)
			ck = f->objs[--f->nr];
		preempt_enable();

		if (!ck) {
			mutex_lock(&bc->lock);
			preempt_disable();
			f = this_cpu_ptr(bc->pcpu_freed);

			while (!list_empty(&bc->freed_nonpcpu) &&
			       f->nr < ARRAY_SIZE(f->objs) / 2) {
				ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
				list_del_init(&ck->list);
				bc->nr_freed_nonpcpu--;
				f->objs[f->nr++] = ck;
			}

			ck = f->nr ? f->objs[--f->nr] : NULL;
			preempt_enable();
			mutex_unlock(&bc->lock);
		}
#else
		mutex_lock(&bc->lock);
		if (!list_empty(&bc->freed_nonpcpu)) {
			ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
			list_del_init(&ck->list);
			bc->nr_freed_nonpcpu--;
		}
		mutex_unlock(&bc->lock);
#endif
	} else {
		mutex_lock(&bc->lock);
		if (!list_empty(&bc->freed_pcpu)) {
			ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
			list_del_init(&ck->list);
		}
		mutex_unlock(&bc->lock);
	}

	if (ck) {
		ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_);
		if (unlikely(ret)) {
			bkey_cached_move_to_freelist(bc, ck);
			return ERR_PTR(ret);
		}

		path->l[0].b = (void *) ck;
		path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);

		ret = bch2_btree_node_lock_write(trans, path, &ck->c);
		if (unlikely(ret)) {
			btree_node_unlock(trans, path, 0);
			bkey_cached_move_to_freelist(bc, ck);
			return ERR_PTR(ret);
		}

		return ck;
	}

	ck = allocate_dropping_locks(trans, ret,
				     kmem_cache_zalloc(bch2_key_cache, _gfp));
	if (ret) {
		kmem_cache_free(bch2_key_cache, ck);
		return ERR_PTR(ret);
	}

	if (!ck)
		return NULL;

	INIT_LIST_HEAD(&ck->list);
	bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);

	ck->c.cached = true;
	BUG_ON(!six_trylock_intent(&ck->c.lock));
	BUG_ON(!six_trylock_write(&ck->c.lock));
	*was_new = true;
	return ck;
}

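/*
 * The freelists were empty and allocating a new entry failed: walk the hash
 * table and steal a clean, unlocked entry to reuse.
 */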
static struct bkey_cached *
bkey_cached_reuse(struct btree_key_cache *c)
{
	struct bucket_table *tbl;
	struct rhash_head *pos;
	struct bkey_cached *ck;
	unsigned i;

	mutex_lock(&c->lock);
	rcu_read_lock();
	tbl = rht_dereference_rcu(c->table.tbl, &c->table);
	for (i = 0; i < tbl->size; i++)
		rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
			if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
			    bkey_cached_lock_for_evict(ck)) {
				bkey_cached_evict(c, ck);
				goto out;
			}
		}
	ck = NULL;
out:
	rcu_read_unlock();
	mutex_unlock(&c->lock);
	return ck;
}

static struct bkey_cached *
btree_key_cache_create(struct btree_trans *trans, struct btree_path *path)
{
	struct bch_fs *c = trans->c;
	struct btree_key_cache *bc = &c->btree_key_cache;
	struct bkey_cached *ck;
	bool was_new = false;

	ck = bkey_cached_alloc(trans, path, &was_new);
	if (IS_ERR(ck))
		return ck;

	if (unlikely(!ck)) {
		ck = bkey_cached_reuse(bc);
		if (unlikely(!ck)) {
			bch_err(c, "error allocating memory for key cache item, btree %s",
				bch2_btree_id_str(path->btree_id));
			return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create);
		}

		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
	}

	ck->c.level		= 0;
	ck->c.btree_id		= path->btree_id;
	ck->key.btree_id	= path->btree_id;
	ck->key.pos		= path->pos;
	ck->valid		= false;
	ck->flags		= 1U << BKEY_CACHED_ACCESSED;

	if (unlikely(rhashtable_lookup_insert_fast(&bc->table,
						   &ck->hash,
						   bch2_btree_key_cache_params))) {
		/* We raced with another fill: */

		if (likely(was_new)) {
			six_unlock_write(&ck->c.lock);
			six_unlock_intent(&ck->c.lock);
			kfree(ck);
		} else {
			bkey_cached_free_fast(bc, ck);
		}

		mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
		return NULL;
	}

	atomic_long_inc(&bc->nr_keys);

	six_unlock_write(&ck->c.lock);

	return ck;
}

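/*
 * Fill a key cache entry from the underlying btree: look up the current key
 * at ck->key.pos, make sure ck's buffer is big enough (with some slack for
 * future updates), then copy the key in under the cached node's write lock.
 */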
static int btree_key_cache_fill(struct btree_trans *trans,
				struct btree_path *ck_path,
				struct bkey_cached *ck)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	unsigned new_u64s = 0;
	struct bkey_i *new_k = NULL;
	int ret;

	k = bch2_bkey_get_iter(trans, &iter, ck->key.btree_id, ck->key.pos,
			       BTREE_ITER_KEY_CACHE_FILL|
			       BTREE_ITER_CACHED_NOFILL);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (!bch2_btree_node_relock(trans, ck_path, 0)) {
		trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
		ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
		goto err;
	}

	/*
	 * bch2_varint_decode can read past the end of the buffer by at
	 * most 7 bytes (it won't be used):
	 */
	new_u64s = k.k->u64s + 1;

	/*
	 * Allocate some extra space so that the transaction commit path is less
	 * likely to have to reallocate, since that requires a transaction
	 * restart:
	 */
	new_u64s = min(256U, (new_u64s * 3) / 2);

	if (new_u64s > ck->u64s) {
		new_u64s = roundup_pow_of_two(new_u64s);
		new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
		if (!new_k) {
			bch2_trans_unlock(trans);

			new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
			if (!new_k) {
				bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
					bch2_btree_id_str(ck->key.btree_id), new_u64s);
				ret = -BCH_ERR_ENOMEM_btree_key_cache_fill;
				goto err;
			}

			if (!bch2_btree_node_relock(trans, ck_path, 0)) {
				kfree(new_k);
				trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
				ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
				goto err;
			}

			ret = bch2_trans_relock(trans);
			if (ret) {
				kfree(new_k);
				goto err;
			}
		}
	}

	ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c);
	if (ret) {
		kfree(new_k);
		goto err;
	}

	if (new_k) {
		kfree(ck->k);
		ck->u64s = new_u64s;
		ck->k = new_k;
	}

	bkey_reassemble(ck->k, k);
	ck->valid = true;
	bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b);

	/* We're not likely to need this iterator again: */
	set_btree_iter_dontneed(&iter);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static noinline int
bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path,
					 unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck;
	int ret = 0;

	BUG_ON(path->level);

	path->l[1].b = NULL;

	if (bch2_btree_node_relock_notrace(trans, path, 0)) {
		ck = (void *) path->l[0].b;
		goto fill;
	}
retry:
	ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
	if (!ck) {
		ck = btree_key_cache_create(trans, path);
		ret = PTR_ERR_OR_ZERO(ck);
		if (ret)
			goto err;
		if (!ck)
			goto retry;

		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
		path->locks_want = 1;
	} else {
		enum six_lock_type lock_want = __btree_lock_want(path, 0);

		ret = btree_node_lock(trans, path, (void *) ck, 0,
				      lock_want, _THIS_IP_);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto err;

		BUG_ON(ret);

		if (ck->key.btree_id != path->btree_id ||
		    !bpos_eq(ck->key.pos, path->pos)) {
			six_unlock_type(&ck->c.lock, lock_want);
			goto retry;
		}

		mark_btree_node_locked(trans, path, 0,
				       (enum btree_node_locked_type) lock_want);
	}

	path->l[0].lock_seq	= six_lock_seq(&ck->c.lock);
	path->l[0].b		= (void *) ck;
fill:
	path->uptodate = BTREE_ITER_UPTODATE;

	if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) {
		/*
		 * Using the underscore version because we haven't set
		 * path->uptodate yet:
		 */
		if (!path->locks_want &&
		    !__bch2_btree_path_upgrade(trans, path, 1, NULL)) {
			trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_);
			ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade);
			goto err;
		}

		ret = btree_key_cache_fill(trans, path, ck);
		if (ret)
			goto err;

		ret = bch2_btree_path_relock(trans, path, _THIS_IP_);
		if (ret)
			goto err;

		path->uptodate = BTREE_ITER_UPTODATE;
	}

	if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
		set_bit(BKEY_CACHED_ACCESSED, &ck->flags);

	BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
	BUG_ON(path->uptodate);

	return ret;
err:
	path->uptodate = BTREE_ITER_NEED_TRAVERSE;
	if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
		btree_node_unlock(trans, path, 0);
		path->l[0].b = ERR_PTR(ret);
	}
	return ret;
}

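/*
 * Fast path for traversing to a cached key: if the entry already exists and
 * is valid we only need to lock it; creation, filling and lock upgrades are
 * handled by the slowpath above.
 */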
int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
				    unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck;
	int ret = 0;

	EBUG_ON(path->level);

	path->l[1].b = NULL;

	if (bch2_btree_node_relock_notrace(trans, path, 0)) {
		ck = (void *) path->l[0].b;
		goto fill;
	}
retry:
	ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
	if (!ck) {
		return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
	} else {
		enum six_lock_type lock_want = __btree_lock_want(path, 0);

		ret = btree_node_lock(trans, path, (void *) ck, 0,
				      lock_want, _THIS_IP_);
		EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart));

		if (ret)
			return ret;

		if (ck->key.btree_id != path->btree_id ||
		    !bpos_eq(ck->key.pos, path->pos)) {
			six_unlock_type(&ck->c.lock, lock_want);
			goto retry;
		}

		mark_btree_node_locked(trans, path, 0,
				       (enum btree_node_locked_type) lock_want);
	}

	path->l[0].lock_seq	= six_lock_seq(&ck->c.lock);
	path->l[0].b		= (void *) ck;
fill:
	if (!ck->valid)
		return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);

	if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
		set_bit(BKEY_CACHED_ACCESSED, &ck->flags);

	path->uptodate = BTREE_ITER_UPTODATE;
	EBUG_ON(!ck->valid);
	EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));

	return ret;
}

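/*
 * Flush a single dirty key back to the btree it belongs to, and optionally
 * evict it from the key cache afterwards. If @journal_seq is nonzero, only
 * flush if the cached key is still pinning that journal sequence number.
 */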
static int btree_key_cache_flush_pos(struct btree_trans *trans,
				     struct bkey_cached_key key,
				     u64 journal_seq,
				     unsigned commit_flags,
				     bool evict)
{
	struct bch_fs *c = trans->c;
	struct journal *j = &c->journal;
	struct btree_iter c_iter, b_iter;
	struct bkey_cached *ck = NULL;
	int ret;

	bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
			     BTREE_ITER_SLOTS|
			     BTREE_ITER_INTENT|
			     BTREE_ITER_ALL_SNAPSHOTS);
	bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
			     BTREE_ITER_CACHED|
			     BTREE_ITER_INTENT);
	b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE;

	ret = bch2_btree_iter_traverse(&c_iter);
	if (ret)
		goto out;

	ck = (void *) c_iter.path->l[0].b;
	if (!ck)
		goto out;

	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		if (evict)
			goto evict;
		goto out;
	}

	BUG_ON(!ck->valid);

	if (journal_seq && ck->journal.seq != journal_seq)
		goto out;

	/*
	 * Since journal reclaim depends on us making progress here, and the
	 * allocator/copygc depend on journal reclaim making progress, we need
	 * to be using alloc reserves:
	 */
	ret   = bch2_btree_iter_traverse(&b_iter) ?:
		bch2_trans_update(trans, &b_iter, ck->k,
				  BTREE_UPDATE_KEY_CACHE_RECLAIM|
				  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
				  BTREE_TRIGGER_NORUN) ?:
		bch2_trans_commit(trans, NULL, NULL,
				  BTREE_INSERT_NOCHECK_RW|
				  BTREE_INSERT_NOFAIL|
				  (ck->journal.seq == journal_last_seq(j)
				   ? BCH_WATERMARK_reclaim
				   : 0)|
				  commit_flags);

	bch2_fs_fatal_err_on(ret &&
			     !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
			     !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
			     !bch2_journal_error(j), c,
			     "error flushing key cache: %s", bch2_err_str(ret));
	if (ret)
		goto out;

	bch2_journal_pin_drop(j, &ck->journal);

	BUG_ON(!btree_node_locked(c_iter.path, 0));

	if (!evict) {
		if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
			clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
			atomic_long_dec(&c->btree_key_cache.nr_dirty);
		}
	} else {
		struct btree_path *path2;
evict:
		trans_for_each_path(trans, path2)
			if (path2 != c_iter.path)
				__bch2_btree_path_unlock(trans, path2);

		bch2_btree_node_lock_write_nofail(trans, c_iter.path, &ck->c);

		if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
			clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
			atomic_long_dec(&c->btree_key_cache.nr_dirty);
		}

		mark_btree_node_locked_noreset(c_iter.path, 0, BTREE_NODE_UNLOCKED);
		bkey_cached_evict(&c->btree_key_cache, ck);
		bkey_cached_free_fast(&c->btree_key_cache, ck);
	}
out:
	bch2_trans_iter_exit(trans, &b_iter);
	bch2_trans_iter_exit(trans, &c_iter);
	return ret;
}

int bch2_btree_key_cache_journal_flush(struct journal *j,
				       struct journal_entry_pin *pin, u64 seq)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bkey_cached *ck =
		container_of(pin, struct bkey_cached, journal);
	struct bkey_cached_key key;
	struct btree_trans *trans = bch2_trans_get(c);
	int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
	int ret = 0;

	btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read);
	key = ck->key;

	if (ck->journal.seq != seq ||
	    !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		six_unlock_read(&ck->c.lock);
		goto unlock;
	}

	if (ck->seq != seq) {
		bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal,
					bch2_btree_key_cache_journal_flush);
		six_unlock_read(&ck->c.lock);
		goto unlock;
	}
	six_unlock_read(&ck->c.lock);

	ret = commit_do(trans, NULL, NULL, 0,
		btree_key_cache_flush_pos(trans, key, seq,
				BTREE_INSERT_JOURNAL_RECLAIM, false));
unlock:
	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);

	bch2_trans_put(trans);
	return ret;
}

/*
 * Flush and evict a key from the key cache:
 */
int bch2_btree_key_cache_flush(struct btree_trans *trans,
			       enum btree_id id, struct bpos pos)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached_key key = { id, pos };

	/* Fastpath - assume it won't be found: */
	if (!bch2_btree_key_cache_find(c, id, pos))
		return 0;

	return btree_key_cache_flush_pos(trans, key, 0, 0, true);
}

bool bch2_btree_insert_key_cached(struct btree_trans *trans,
				  unsigned flags,
				  struct btree_insert_entry *insert_entry)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck = (void *) insert_entry->path->l[0].b;
	struct bkey_i *insert = insert_entry->k;
	bool kick_reclaim = false;

	BUG_ON(insert->k.u64s > ck->u64s);

	bkey_copy(ck->k, insert);
	ck->valid = true;

	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
		set_bit(BKEY_CACHED_DIRTY, &ck->flags);
		atomic_long_inc(&c->btree_key_cache.nr_dirty);

		if (bch2_nr_btree_keys_need_flush(c))
			kick_reclaim = true;
	}

	/*
	 * To minimize lock contention, we only add the journal pin here and
	 * defer pin updates to the flush callback via ->seq. Be careful not to
	 * update ->seq on nojournal commits because we don't want to update the
	 * pin to a seq that doesn't include journal updates on disk. Otherwise
	 * we risk losing the update after a crash.
	 *
	 * The only exception is if the pin is not active in the first place. We
	 * have to add the pin because journal reclaim drives key cache
	 * flushing. The flush callback will not proceed unless ->seq matches
	 * the latest pin, so make sure it starts with a consistent value.
	 */
	if (!(insert_entry->flags & BTREE_UPDATE_NOJOURNAL) ||
	    !journal_pin_active(&ck->journal)) {
		ck->seq = trans->journal_res.seq;
	}
	bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
			     &ck->journal, bch2_btree_key_cache_journal_flush);

	if (kick_reclaim)
		journal_reclaim_kick(&c->journal);
	return true;
}

void bch2_btree_key_cache_drop(struct btree_trans *trans,
			       struct btree_path *path)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck = (void *) path->l[0].b;

	BUG_ON(!ck->valid);

	/*
	 * We just did an update to the btree, bypassing the key cache: the key
	 * cache key is now stale and must be dropped, even if dirty:
	 */
	if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
		atomic_long_dec(&c->btree_key_cache.nr_dirty);
		bch2_journal_pin_drop(&c->journal, &ck->journal);
	}

	ck->valid = false;
}

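/*
 * Shrinker scan callback: first free entries on the freed lists whose SRCU
 * grace period has elapsed, then walk the hash table evicting clean entries
 * that haven't been accessed since the previous scan.
 */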
static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
					       struct shrink_control *sc)
{
	struct bch_fs *c = shrink->private_data;
	struct btree_key_cache *bc = &c->btree_key_cache;
	struct bucket_table *tbl;
	struct bkey_cached *ck, *t;
	size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
	unsigned start, flags;
	int srcu_idx;

	mutex_lock(&bc->lock);
	srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
	flags = memalloc_nofs_save();

	/*
	 * Newest freed entries are at the end of the list - once we hit one
	 * that's too new to be freed, we can bail out:
	 */
	scanned += bc->nr_freed_nonpcpu;

	list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
		if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
						 ck->btree_trans_barrier_seq))
			break;

		list_del(&ck->list);
		six_lock_exit(&ck->c.lock);
		kmem_cache_free(bch2_key_cache, ck);
		atomic_long_dec(&bc->nr_freed);
		freed++;
		bc->nr_freed_nonpcpu--;
	}

	if (scanned >= nr)
		goto out;

	scanned += bc->nr_freed_pcpu;

	list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
		if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
						 ck->btree_trans_barrier_seq))
			break;

		list_del(&ck->list);
		six_lock_exit(&ck->c.lock);
		kmem_cache_free(bch2_key_cache, ck);
		atomic_long_dec(&bc->nr_freed);
		freed++;
		bc->nr_freed_pcpu--;
	}

	if (scanned >= nr)
		goto out;

	rcu_read_lock();
	tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
	if (bc->shrink_iter >= tbl->size)
		bc->shrink_iter = 0;
	start = bc->shrink_iter;

	do {
		struct rhash_head *pos, *next;

		pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter));

		while (!rht_is_a_nulls(pos)) {
			next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
			ck = container_of(pos, struct bkey_cached, hash);

			if (test_bit(BKEY_CACHED_DIRTY, &ck->flags))
				goto next;

			if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
				clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
			else if (bkey_cached_lock_for_evict(ck)) {
				bkey_cached_evict(bc, ck);
				bkey_cached_free(bc, ck);
			}

			scanned++;
			if (scanned >= nr)
				break;
next:
			pos = next;
		}

		bc->shrink_iter++;
		if (bc->shrink_iter >= tbl->size)
			bc->shrink_iter = 0;
	} while (scanned < nr && bc->shrink_iter != start);

	rcu_read_unlock();
out:
	memalloc_nofs_restore(flags);
	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
	mutex_unlock(&bc->lock);

	return freed;
}

static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
						struct shrink_control *sc)
{
	struct bch_fs *c = shrink->private_data;
	struct btree_key_cache *bc = &c->btree_key_cache;
	long nr = atomic_long_read(&bc->nr_keys) -
		atomic_long_read(&bc->nr_dirty);

	return max(0L, nr);
}

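/*
 * Filesystem shutdown: evict every remaining entry from the hash table, drain
 * the per-cpu and global freelists, and return everything to the slab.
 */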
void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
	struct bucket_table *tbl;
	struct bkey_cached *ck, *n;
	struct rhash_head *pos;
	LIST_HEAD(items);
	unsigned i;
#ifdef __KERNEL__
	int cpu;
#endif

	shrinker_free(bc->shrink);

	mutex_lock(&bc->lock);

	/*
	 * The loop is needed to guard against racing with rehash:
	 */
	while (atomic_long_read(&bc->nr_keys)) {
		rcu_read_lock();
		tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
		if (tbl)
			for (i = 0; i < tbl->size; i++)
				rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
					bkey_cached_evict(bc, ck);
					list_add(&ck->list, &items);
				}
		rcu_read_unlock();
	}

#ifdef __KERNEL__
	for_each_possible_cpu(cpu) {
		struct btree_key_cache_freelist *f =
			per_cpu_ptr(bc->pcpu_freed, cpu);

		for (i = 0; i < f->nr; i++) {
			ck = f->objs[i];
			list_add(&ck->list, &items);
		}
	}
#endif

	BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu);
	BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu);

	list_splice(&bc->freed_pcpu,	&items);
	list_splice(&bc->freed_nonpcpu,	&items);

	mutex_unlock(&bc->lock);

	list_for_each_entry_safe(ck, n, &items, list) {
		cond_resched();

		list_del(&ck->list);
		kfree(ck->k);
		six_lock_exit(&ck->c.lock);
		kmem_cache_free(bch2_key_cache, ck);
	}

	if (atomic_long_read(&bc->nr_dirty) &&
	    !bch2_journal_error(&c->journal) &&
	    test_bit(BCH_FS_WAS_RW, &c->flags))
		panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n",
		      atomic_long_read(&bc->nr_dirty));

	if (atomic_long_read(&bc->nr_keys))
		panic("btree key cache shutdown error: nr_keys nonzero (%li)\n",
		      atomic_long_read(&bc->nr_keys));

	if (bc->table_init_done)
		rhashtable_destroy(&bc->table);

	free_percpu(bc->pcpu_freed);
}

void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
{
	mutex_init(&c->lock);
	INIT_LIST_HEAD(&c->freed_pcpu);
	INIT_LIST_HEAD(&c->freed_nonpcpu);
}

int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
	struct shrinker *shrink;

#ifdef __KERNEL__
	bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
	if (!bc->pcpu_freed)
		return -BCH_ERR_ENOMEM_fs_btree_cache_init;
#endif

	if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
		return -BCH_ERR_ENOMEM_fs_btree_cache_init;

	bc->table_init_done = true;

	shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name);
	if (!shrink)
		return -BCH_ERR_ENOMEM_fs_btree_cache_init;
	bc->shrink = shrink;
	shrink->seeks		= 0;
	shrink->count_objects	= bch2_btree_key_cache_count;
	shrink->scan_objects	= bch2_btree_key_cache_scan;
	shrink->private_data	= c;
	shrinker_register(shrink);
	return 0;
}

void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
{
	prt_printf(out, "nr_freed:\t%lu", atomic_long_read(&c->nr_freed));
	prt_newline(out);
	prt_printf(out, "nr_keys:\t%lu", atomic_long_read(&c->nr_keys));
	prt_newline(out);
	prt_printf(out, "nr_dirty:\t%lu", atomic_long_read(&c->nr_dirty));
	prt_newline(out);
}

void bch2_btree_key_cache_exit(void)
{
	kmem_cache_destroy(bch2_key_cache);
}

int __init bch2_btree_key_cache_init(void)
{
	bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT);
	if (!bch2_key_cache)
		return -ENOMEM;

	return 0;
}