// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "trace.h"

#include <linux/sched/mm.h>

static inline bool btree_uses_pcpu_readers(enum btree_id id)
{
	return id == BTREE_ID_subvolumes;
}

static struct kmem_cache *bch2_key_cache;

static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
				       const void *obj)
{
	const struct bkey_cached *ck = obj;
	const struct bkey_cached_key *key = arg->key;

	return ck->key.btree_id != key->btree_id ||
		!bpos_eq(ck->key.pos, key->pos);
}

static const struct rhashtable_params bch2_btree_key_cache_params = {
	.head_offset	= offsetof(struct bkey_cached, hash),
	.key_offset	= offsetof(struct bkey_cached, key),
	.key_len	= sizeof(struct bkey_cached_key),
	.obj_cmpfn	= bch2_btree_key_cache_cmp_fn,
};

__flatten
inline struct bkey_cached *
bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
{
	struct bkey_cached_key key = {
		.btree_id	= btree_id,
		.pos		= pos,
	};

	return rhashtable_lookup_fast(&c->btree_key_cache.table, &key,
				      bch2_btree_key_cache_params);
}

static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
{
	if (!six_trylock_intent(&ck->c.lock))
		return false;

	if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		six_unlock_intent(&ck->c.lock);
		return false;
	}

	if (!six_trylock_write(&ck->c.lock)) {
		six_unlock_intent(&ck->c.lock);
		return false;
	}

	return true;
}

static void bkey_cached_evict(struct btree_key_cache *c,
			      struct bkey_cached *ck)
{
	BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
				      bch2_btree_key_cache_params));
	memset(&ck->key, ~0, sizeof(ck->key));

	atomic_long_dec(&c->nr_keys);
}

static void bkey_cached_free(struct btree_key_cache *bc,
			     struct bkey_cached *ck)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);

	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));

	ck->btree_trans_barrier_seq =
		start_poll_synchronize_srcu(&c->btree_trans_barrier);

	if (ck->c.lock.readers) {
		list_move_tail(&ck->list, &bc->freed_pcpu);
		bc->nr_freed_pcpu++;
	} else {
		list_move_tail(&ck->list, &bc->freed_nonpcpu);
		bc->nr_freed_nonpcpu++;
	}
	atomic_long_inc(&bc->nr_freed);

	kfree(ck->k);
	ck->k = NULL;
	ck->u64s = 0;

	six_unlock_write(&ck->c.lock);
	six_unlock_intent(&ck->c.lock);
}

#ifdef __KERNEL__
static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
						   struct bkey_cached *ck)
{
	struct bkey_cached *pos;

	bc->nr_freed_nonpcpu++;

	list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
		if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
				 pos->btree_trans_barrier_seq)) {
			list_move(&ck->list, &pos->list);
			return;
		}
	}

	list_move(&ck->list, &bc->freed_nonpcpu);
}
#endif

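/*
 * Freed bkey_cached objects are not returned to the slab right away: they are
 * parked on a freelist, tagged with an SRCU sequence number from
 * btree_trans_barrier, and only freed by the shrinker once that grace period
 * has elapsed (see bch2_btree_key_cache_scan()).
 *
 * Entries whose six lock does not use percpu reader counts are stashed on a
 * lockless per-CPU array first; when that array fills up it is drained down
 * to half full, under bc->lock, onto the global freed_nonpcpu list, which is
 * kept ordered by SRCU sequence so the shrinker can stop at the first entry
 * that is still too new to free.  Entries with percpu readers always go to
 * freed_pcpu.
 */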
static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
					 struct bkey_cached *ck)
{
	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));

	if (!ck->c.lock.readers) {
#ifdef __KERNEL__
		struct btree_key_cache_freelist *f;
		bool freed = false;

		preempt_disable();
		f = this_cpu_ptr(bc->pcpu_freed);

		if (f->nr < ARRAY_SIZE(f->objs)) {
			f->objs[f->nr++] = ck;
			freed = true;
		}
		preempt_enable();

		if (!freed) {
			mutex_lock(&bc->lock);
			preempt_disable();
			f = this_cpu_ptr(bc->pcpu_freed);

			while (f->nr > ARRAY_SIZE(f->objs) / 2) {
				struct bkey_cached *ck2 = f->objs[--f->nr];

				__bkey_cached_move_to_freelist_ordered(bc, ck2);
			}
			preempt_enable();

			__bkey_cached_move_to_freelist_ordered(bc, ck);
			mutex_unlock(&bc->lock);
		}
#else
		mutex_lock(&bc->lock);
		list_move_tail(&ck->list, &bc->freed_nonpcpu);
		bc->nr_freed_nonpcpu++;
		mutex_unlock(&bc->lock);
#endif
	} else {
		mutex_lock(&bc->lock);
		list_move_tail(&ck->list, &bc->freed_pcpu);
		bc->nr_freed_pcpu++;
		mutex_unlock(&bc->lock);
	}
}

static void bkey_cached_free_fast(struct btree_key_cache *bc,
				  struct bkey_cached *ck)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);

	ck->btree_trans_barrier_seq =
		start_poll_synchronize_srcu(&c->btree_trans_barrier);

	list_del_init(&ck->list);
	atomic_long_inc(&bc->nr_freed);

	kfree(ck->k);
	ck->k = NULL;
	ck->u64s = 0;

	bkey_cached_move_to_freelist(bc, ck);

	six_unlock_write(&ck->c.lock);
	six_unlock_intent(&ck->c.lock);
}

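/*
 * Allocate a bkey_cached for @path, preferring to recycle a previously freed
 * entry: first the local per-CPU freelist, then the global freed lists
 * (refilling the per-CPU array while bc->lock is held), and only then the
 * slab.  On success the entry is returned with both the intent and write
 * locks held; *was_new is set if it came from the slab.  Returns an ERR_PTR()
 * on lock restart or allocation failure, or NULL if nothing could be recycled
 * and the slab allocation also returned NULL.
 */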
static struct bkey_cached *
bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
		  bool *was_new)
{
	struct bch_fs *c = trans->c;
	struct btree_key_cache *bc = &c->btree_key_cache;
	struct bkey_cached *ck = NULL;
	bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
	int ret;

	if (!pcpu_readers) {
#ifdef __KERNEL__
		struct btree_key_cache_freelist *f;

		preempt_disable();
		f = this_cpu_ptr(bc->pcpu_freed);
		if (f->nr)
			ck = f->objs[--f->nr];
		preempt_enable();

		if (!ck) {
			mutex_lock(&bc->lock);
			preempt_disable();
			f = this_cpu_ptr(bc->pcpu_freed);

			while (!list_empty(&bc->freed_nonpcpu) &&
			       f->nr < ARRAY_SIZE(f->objs) / 2) {
				ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
				list_del_init(&ck->list);
				bc->nr_freed_nonpcpu--;
				f->objs[f->nr++] = ck;
			}

			ck = f->nr ? f->objs[--f->nr] : NULL;
			preempt_enable();
			mutex_unlock(&bc->lock);
		}
#else
		mutex_lock(&bc->lock);
		if (!list_empty(&bc->freed_nonpcpu)) {
			ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
			list_del_init(&ck->list);
			bc->nr_freed_nonpcpu--;
		}
		mutex_unlock(&bc->lock);
#endif
	} else {
		mutex_lock(&bc->lock);
		if (!list_empty(&bc->freed_pcpu)) {
			ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
			list_del_init(&ck->list);
			bc->nr_freed_pcpu--;
		}
		mutex_unlock(&bc->lock);
	}

	if (ck) {
		ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_);
		if (unlikely(ret)) {
			bkey_cached_move_to_freelist(bc, ck);
			return ERR_PTR(ret);
		}

		path->l[0].b = (void *) ck;
		path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);

		ret = bch2_btree_node_lock_write(trans, path, &ck->c);
		if (unlikely(ret)) {
			btree_node_unlock(trans, path, 0);
			bkey_cached_move_to_freelist(bc, ck);
			return ERR_PTR(ret);
		}

		return ck;
	}

	ck = allocate_dropping_locks(trans, ret,
				     kmem_cache_zalloc(bch2_key_cache, _gfp));
	if (ret) {
		kmem_cache_free(bch2_key_cache, ck);
		return ERR_PTR(ret);
	}

	if (!ck)
		return NULL;

	INIT_LIST_HEAD(&ck->list);
	bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);

	ck->c.cached = true;
	BUG_ON(!six_trylock_intent(&ck->c.lock));
	BUG_ON(!six_trylock_write(&ck->c.lock));
	*was_new = true;
	return ck;
}

static struct bkey_cached *
bkey_cached_reuse(struct btree_key_cache *c)
{
	struct bucket_table *tbl;
	struct rhash_head *pos;
	struct bkey_cached *ck;
	unsigned i;

	mutex_lock(&c->lock);
	rcu_read_lock();
	tbl = rht_dereference_rcu(c->table.tbl, &c->table);
	for (i = 0; i < tbl->size; i++)
		rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
			if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
			    bkey_cached_lock_for_evict(ck)) {
				bkey_cached_evict(c, ck);
				goto out;
			}
		}
	ck = NULL;
out:
	rcu_read_unlock();
	mutex_unlock(&c->lock);
	return ck;
}

static struct bkey_cached *
btree_key_cache_create(struct btree_trans *trans, struct btree_path *path)
{
	struct bch_fs *c = trans->c;
	struct btree_key_cache *bc = &c->btree_key_cache;
	struct bkey_cached *ck;
	bool was_new = false;

	ck = bkey_cached_alloc(trans, path, &was_new);
	if (IS_ERR(ck))
		return ck;

	if (unlikely(!ck)) {
		ck = bkey_cached_reuse(bc);
		if (unlikely(!ck)) {
			bch_err(c, "error allocating memory for key cache item, btree %s",
				bch2_btree_id_str(path->btree_id));
			return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create);
		}

		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
	}

	ck->c.level = 0;
	ck->c.btree_id = path->btree_id;
	ck->key.btree_id = path->btree_id;
	ck->key.pos = path->pos;
	ck->valid = false;
	ck->flags = 1U << BKEY_CACHED_ACCESSED;

	if (unlikely(rhashtable_lookup_insert_fast(&bc->table,
						   &ck->hash,
						   bch2_btree_key_cache_params))) {
		/* We raced with another fill: */

		if (likely(was_new)) {
			six_unlock_write(&ck->c.lock);
			six_unlock_intent(&ck->c.lock);
			kfree(ck);
		} else {
			bkey_cached_free_fast(bc, ck);
		}

		mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
		return NULL;
	}

	atomic_long_inc(&bc->nr_keys);

	six_unlock_write(&ck->c.lock);

	return ck;
}

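/*
 * Read the current value of the key from the btree and cache it in @ck,
 * reallocating ck->k if the existing buffer is too small (with some slack so
 * the commit path is less likely to need a reallocation, which would force a
 * transaction restart).  The buffer is allocated GFP_NOWAIT first so the
 * transaction can stay locked; if that fails we drop locks, allocate with
 * GFP_KERNEL and relock, restarting the transaction if the relock fails.
 */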
static int btree_key_cache_fill(struct btree_trans *trans,
				struct btree_path *ck_path,
				struct bkey_cached *ck)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	unsigned new_u64s = 0;
	struct bkey_i *new_k = NULL;
	int ret;

	bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos,
			     BTREE_ITER_key_cache_fill|
			     BTREE_ITER_cached_nofill);
	iter.flags &= ~BTREE_ITER_with_journal;
	k = bch2_btree_iter_peek_slot(&iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (!bch2_btree_node_relock(trans, ck_path, 0)) {
		trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
		ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
		goto err;
	}

	/*
	 * bch2_varint_decode can read past the end of the buffer by at
	 * most 7 bytes (it won't be used):
	 */
	new_u64s = k.k->u64s + 1;

	/*
	 * Allocate some extra space so that the transaction commit path is less
	 * likely to have to reallocate, since that requires a transaction
	 * restart:
	 */
	new_u64s = min(256U, (new_u64s * 3) / 2);

	if (new_u64s > ck->u64s) {
		new_u64s = roundup_pow_of_two(new_u64s);
		new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
		if (!new_k) {
			bch2_trans_unlock(trans);

			new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
			if (!new_k) {
				bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
					bch2_btree_id_str(ck->key.btree_id), new_u64s);
				ret = -BCH_ERR_ENOMEM_btree_key_cache_fill;
				goto err;
			}

			if (!bch2_btree_node_relock(trans, ck_path, 0)) {
				kfree(new_k);
				trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
				ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
				goto err;
			}

			ret = bch2_trans_relock(trans);
			if (ret) {
				kfree(new_k);
				goto err;
			}
		}
	}

	ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c);
	if (ret) {
		kfree(new_k);
		goto err;
	}

	if (new_k) {
		kfree(ck->k);
		ck->u64s = new_u64s;
		ck->k = new_k;
	}

	bkey_reassemble(ck->k, k);
	ck->valid = true;
	bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b);

	/* We're not likely to need this iterator again: */
	bch2_set_btree_iter_dontneed(&iter);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static noinline int
bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path,
					 unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck;
	int ret = 0;

	BUG_ON(path->level);

	path->l[1].b = NULL;

	if (bch2_btree_node_relock_notrace(trans, path, 0)) {
		ck = (void *) path->l[0].b;
		goto fill;
	}
retry:
	ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
	if (!ck) {
		ck = btree_key_cache_create(trans, path);
		ret = PTR_ERR_OR_ZERO(ck);
		if (ret)
			goto err;
		if (!ck)
			goto retry;

		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
		path->locks_want = 1;
	} else {
		enum six_lock_type lock_want = __btree_lock_want(path, 0);

		ret = btree_node_lock(trans, path, (void *) ck, 0,
				      lock_want, _THIS_IP_);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto err;

		BUG_ON(ret);

		if (ck->key.btree_id != path->btree_id ||
		    !bpos_eq(ck->key.pos, path->pos)) {
			six_unlock_type(&ck->c.lock, lock_want);
			goto retry;
		}

		mark_btree_node_locked(trans, path, 0,
				       (enum btree_node_locked_type) lock_want);
	}

	path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
	path->l[0].b = (void *) ck;
fill:
	path->uptodate = BTREE_ITER_UPTODATE;

	if (!ck->valid && !(flags & BTREE_ITER_cached_nofill)) {
		ret = bch2_btree_path_upgrade(trans, path, 1) ?:
			btree_key_cache_fill(trans, path, ck) ?:
			bch2_btree_path_relock(trans, path, _THIS_IP_);
		if (ret)
			goto err;

		path->uptodate = BTREE_ITER_UPTODATE;
	}

	if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
		set_bit(BKEY_CACHED_ACCESSED, &ck->flags);

	BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
	BUG_ON(path->uptodate);

	return ret;
err:
	path->uptodate = BTREE_ITER_NEED_TRAVERSE;
	if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
		btree_node_unlock(trans, path, 0);
		path->l[0].b = ERR_PTR(ret);
	}
	return ret;
}

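/*
 * Fast path for traversing to a key cache entry: if the entry already exists
 * in the hash table and is valid, just take the lock and mark the path up to
 * date.  Creating a missing entry, or filling an entry that isn't valid yet,
 * is handled by bch2_btree_path_traverse_cached_slowpath().
 */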
int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
				    unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck;
	int ret = 0;

	EBUG_ON(path->level);

	path->l[1].b = NULL;

	if (bch2_btree_node_relock_notrace(trans, path, 0)) {
		ck = (void *) path->l[0].b;
		goto fill;
	}
retry:
	ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
	if (!ck) {
		return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
	} else {
		enum six_lock_type lock_want = __btree_lock_want(path, 0);

		ret = btree_node_lock(trans, path, (void *) ck, 0,
				      lock_want, _THIS_IP_);
		EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart));

		if (ret)
			return ret;

		if (ck->key.btree_id != path->btree_id ||
		    !bpos_eq(ck->key.pos, path->pos)) {
			six_unlock_type(&ck->c.lock, lock_want);
			goto retry;
		}

		mark_btree_node_locked(trans, path, 0,
				       (enum btree_node_locked_type) lock_want);
	}

	path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
	path->l[0].b = (void *) ck;
fill:
	if (!ck->valid)
		return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);

	if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
		set_bit(BKEY_CACHED_ACCESSED, &ck->flags);

	path->uptodate = BTREE_ITER_UPTODATE;
	EBUG_ON(!ck->valid);
	EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));

	return ret;
}

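/*
 * Flush a single key cache entry back to the btree: look the entry up via a
 * BTREE_ITER_cached iterator, write ck->k to the underlying btree position
 * with BTREE_UPDATE_key_cache_reclaim, then clear the dirty bit and drop the
 * journal pin.  If @journal_seq is nonzero the flush is skipped unless the
 * entry still pins exactly that sequence number.  If @evict is set the entry
 * is also removed from the hash table and put back on the freelist (clean
 * entries are evicted without being rewritten).
 */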
static int btree_key_cache_flush_pos(struct btree_trans *trans,
				     struct bkey_cached_key key,
				     u64 journal_seq,
				     unsigned commit_flags,
				     bool evict)
{
	struct bch_fs *c = trans->c;
	struct journal *j = &c->journal;
	struct btree_iter c_iter, b_iter;
	struct bkey_cached *ck = NULL;
	int ret;

	bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
			     BTREE_ITER_slots|
			     BTREE_ITER_intent|
			     BTREE_ITER_all_snapshots);
	bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
			     BTREE_ITER_cached|
			     BTREE_ITER_intent);
	b_iter.flags &= ~BTREE_ITER_with_key_cache;

	ret = bch2_btree_iter_traverse(&c_iter);
	if (ret)
		goto out;

	ck = (void *) btree_iter_path(trans, &c_iter)->l[0].b;
	if (!ck)
		goto out;

	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		if (evict)
			goto evict;
		goto out;
	}

	BUG_ON(!ck->valid);

	if (journal_seq && ck->journal.seq != journal_seq)
		goto out;

	trans->journal_res.seq = ck->journal.seq;

	/*
	 * If we're at the end of the journal, we really want to free up space
	 * in the journal right away - we don't want to pin that old journal
	 * sequence number with a new btree node write, we want to re-journal
	 * the update
	 */
	if (ck->journal.seq == journal_last_seq(j))
		commit_flags |= BCH_WATERMARK_reclaim;

	if (ck->journal.seq != journal_last_seq(j) ||
	    !test_bit(JOURNAL_space_low, &c->journal.flags))
		commit_flags |= BCH_TRANS_COMMIT_no_journal_res;

	ret = bch2_btree_iter_traverse(&b_iter) ?:
		bch2_trans_update(trans, &b_iter, ck->k,
				  BTREE_UPDATE_key_cache_reclaim|
				  BTREE_UPDATE_internal_snapshot_node|
				  BTREE_TRIGGER_norun) ?:
		bch2_trans_commit(trans, NULL, NULL,
				  BCH_TRANS_COMMIT_no_check_rw|
				  BCH_TRANS_COMMIT_no_enospc|
				  commit_flags);

	bch2_fs_fatal_err_on(ret &&
			     !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
			     !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
			     !bch2_journal_error(j), c,
			     "flushing key cache: %s", bch2_err_str(ret));
	if (ret)
		goto out;

	bch2_journal_pin_drop(j, &ck->journal);

	struct btree_path *path = btree_iter_path(trans, &c_iter);
	BUG_ON(!btree_node_locked(path, 0));

	if (!evict) {
		if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
			clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
			atomic_long_dec(&c->btree_key_cache.nr_dirty);
		}
	} else {
		struct btree_path *path2;
		unsigned i;
evict:
		trans_for_each_path(trans, path2, i)
			if (path2 != path)
				__bch2_btree_path_unlock(trans, path2);

		bch2_btree_node_lock_write_nofail(trans, path, &ck->c);

		if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
			clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
			atomic_long_dec(&c->btree_key_cache.nr_dirty);
		}

		mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
		bkey_cached_evict(&c->btree_key_cache, ck);
		bkey_cached_free_fast(&c->btree_key_cache, ck);
	}
out:
	bch2_trans_iter_exit(trans, &b_iter);
	bch2_trans_iter_exit(trans, &c_iter);
	return ret;
}

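/*
 * Journal pin flush callback: journal reclaim calls this when it needs the
 * pin on @seq released.  If the key has since been redirtied at a newer
 * sequence number we just move the pin forward to ck->seq; otherwise the
 * entry is flushed to the btree via btree_key_cache_flush_pos().
 */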
int bch2_btree_key_cache_journal_flush(struct journal *j,
				       struct journal_entry_pin *pin, u64 seq)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bkey_cached *ck =
		container_of(pin, struct bkey_cached, journal);
	struct bkey_cached_key key;
	struct btree_trans *trans = bch2_trans_get(c);
	int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
	int ret = 0;

	btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read);
	key = ck->key;

	if (ck->journal.seq != seq ||
	    !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		six_unlock_read(&ck->c.lock);
		goto unlock;
	}

	if (ck->seq != seq) {
		bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal,
					bch2_btree_key_cache_journal_flush);
		six_unlock_read(&ck->c.lock);
		goto unlock;
	}
	six_unlock_read(&ck->c.lock);

	ret = lockrestart_do(trans,
		btree_key_cache_flush_pos(trans, key, seq,
				BCH_TRANS_COMMIT_journal_reclaim, false));
unlock:
	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);

	bch2_trans_put(trans);
	return ret;
}

bool bch2_btree_insert_key_cached(struct btree_trans *trans,
				  unsigned flags,
				  struct btree_insert_entry *insert_entry)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck = (void *) (trans->paths + insert_entry->path)->l[0].b;
	struct bkey_i *insert = insert_entry->k;
	bool kick_reclaim = false;

	BUG_ON(insert->k.u64s > ck->u64s);

	bkey_copy(ck->k, insert);
	ck->valid = true;

	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
		set_bit(BKEY_CACHED_DIRTY, &ck->flags);
		atomic_long_inc(&c->btree_key_cache.nr_dirty);

		if (bch2_nr_btree_keys_need_flush(c))
			kick_reclaim = true;
	}

	/*
	 * To minimize lock contention, we only add the journal pin here and
	 * defer pin updates to the flush callback via ->seq. Be careful not to
	 * update ->seq on nojournal commits because we don't want to update the
	 * pin to a seq that doesn't include journal updates on disk. Otherwise
	 * we risk losing the update after a crash.
	 *
	 * The only exception is if the pin is not active in the first place. We
	 * have to add the pin because journal reclaim drives key cache
	 * flushing. The flush callback will not proceed unless ->seq matches
	 * the latest pin, so make sure it starts with a consistent value.
	 */
	if (!(insert_entry->flags & BTREE_UPDATE_nojournal) ||
	    !journal_pin_active(&ck->journal)) {
		ck->seq = trans->journal_res.seq;
	}
	bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
			     &ck->journal, bch2_btree_key_cache_journal_flush);

	if (kick_reclaim)
		journal_reclaim_kick(&c->journal);
	return true;
}

void bch2_btree_key_cache_drop(struct btree_trans *trans,
			       struct btree_path *path)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck = (void *) path->l[0].b;

	BUG_ON(!ck->valid);

	/*
	 * We just did an update to the btree, bypassing the key cache: the key
	 * cache key is now stale and must be dropped, even if dirty:
	 */
	if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
		atomic_long_dec(&c->btree_key_cache.nr_dirty);
		bch2_journal_pin_drop(&c->journal, &ck->journal);
	}

	ck->valid = false;
}

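/*
 * Shrinker scan: first free entries on the freed lists whose
 * btree_trans_barrier SRCU grace period has elapsed, then walk the hash table
 * from where the last scan stopped (bc->shrink_iter), evicting clean entries
 * that haven't been accessed since the previous pass.  Dirty entries are
 * skipped, and recently accessed entries just have their accessed bit cleared
 * so they become candidates on the next pass.
 */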
static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
					       struct shrink_control *sc)
{
	struct bch_fs *c = shrink->private_data;
	struct btree_key_cache *bc = &c->btree_key_cache;
	struct bucket_table *tbl;
	struct bkey_cached *ck, *t;
	size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
	unsigned start, flags;
	int srcu_idx;

	mutex_lock(&bc->lock);
	bc->requested_to_free += sc->nr_to_scan;

	srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
	flags = memalloc_nofs_save();

	/*
	 * Newest freed entries are at the end of the list - once we hit one
	 * that's too new to be freed, we can bail out:
	 */
	list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
		if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
						 ck->btree_trans_barrier_seq))
			break;

		list_del(&ck->list);
		six_lock_exit(&ck->c.lock);
		kmem_cache_free(bch2_key_cache, ck);
		atomic_long_dec(&bc->nr_freed);
		freed++;
		bc->nr_freed_nonpcpu--;
		bc->freed++;
	}

	list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
		if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
						 ck->btree_trans_barrier_seq))
			break;

		list_del(&ck->list);
		six_lock_exit(&ck->c.lock);
		kmem_cache_free(bch2_key_cache, ck);
		atomic_long_dec(&bc->nr_freed);
		freed++;
		bc->nr_freed_pcpu--;
		bc->freed++;
	}

	rcu_read_lock();
	tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
	if (bc->shrink_iter >= tbl->size)
		bc->shrink_iter = 0;
	start = bc->shrink_iter;

	do {
		struct rhash_head *pos, *next;

		pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter));

		while (!rht_is_a_nulls(pos)) {
			next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
			ck = container_of(pos, struct bkey_cached, hash);

			if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
				bc->skipped_dirty++;
				goto next;
			} else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) {
				clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
				bc->skipped_accessed++;
				goto next;
			} else if (bkey_cached_lock_for_evict(ck)) {
				bkey_cached_evict(bc, ck);
				bkey_cached_free(bc, ck);
				bc->moved_to_freelist++;
			} else {
				bc->skipped_lock_fail++;
			}

			scanned++;
			if (scanned >= nr)
				break;
next:
			pos = next;
		}

		bc->shrink_iter++;
		if (bc->shrink_iter >= tbl->size)
			bc->shrink_iter = 0;
	} while (scanned < nr && bc->shrink_iter != start);

	rcu_read_unlock();
	memalloc_nofs_restore(flags);
	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
	mutex_unlock(&bc->lock);

	return freed;
}

static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
						struct shrink_control *sc)
{
	struct bch_fs *c = shrink->private_data;
	struct btree_key_cache *bc = &c->btree_key_cache;
	long nr = atomic_long_read(&bc->nr_keys) -
		  atomic_long_read(&bc->nr_dirty);

	return max(0L, nr);
}

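/*
 * Filesystem shutdown: unregister the shrinker, evict everything still in the
 * hash table (looping to guard against a concurrent rehash), collect the
 * per-CPU and global freelists, and free all entries.  If keys are still
 * dirty at this point on a filesystem that was read-write and whose journal
 * isn't in an error state, something failed to get flushed and we panic.
 */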
void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
	struct bucket_table *tbl;
	struct bkey_cached *ck, *n;
	struct rhash_head *pos;
	LIST_HEAD(items);
	unsigned i;
#ifdef __KERNEL__
	int cpu;
#endif

	shrinker_free(bc->shrink);

	mutex_lock(&bc->lock);

	/*
	 * The loop is needed to guard against racing with rehash:
	 */
	while (atomic_long_read(&bc->nr_keys)) {
		rcu_read_lock();
		tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
		if (tbl)
			for (i = 0; i < tbl->size; i++)
				rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
					bkey_cached_evict(bc, ck);
					list_add(&ck->list, &items);
				}
		rcu_read_unlock();
	}

#ifdef __KERNEL__
	if (bc->pcpu_freed) {
		for_each_possible_cpu(cpu) {
			struct btree_key_cache_freelist *f =
				per_cpu_ptr(bc->pcpu_freed, cpu);

			for (i = 0; i < f->nr; i++) {
				ck = f->objs[i];
				list_add(&ck->list, &items);
			}
		}
	}
#endif

	BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu);
	BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu);

	list_splice(&bc->freed_pcpu, &items);
	list_splice(&bc->freed_nonpcpu, &items);

	mutex_unlock(&bc->lock);

	list_for_each_entry_safe(ck, n, &items, list) {
		cond_resched();

		list_del(&ck->list);
		kfree(ck->k);
		six_lock_exit(&ck->c.lock);
		kmem_cache_free(bch2_key_cache, ck);
	}

	if (atomic_long_read(&bc->nr_dirty) &&
	    !bch2_journal_error(&c->journal) &&
	    test_bit(BCH_FS_was_rw, &c->flags))
		panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n",
		      atomic_long_read(&bc->nr_dirty));

	if (atomic_long_read(&bc->nr_keys))
		panic("btree key cache shutdown error: nr_keys nonzero (%li)\n",
		      atomic_long_read(&bc->nr_keys));

	if (bc->table_init_done)
		rhashtable_destroy(&bc->table);

	free_percpu(bc->pcpu_freed);
}

void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
{
	mutex_init(&c->lock);
	INIT_LIST_HEAD(&c->freed_pcpu);
	INIT_LIST_HEAD(&c->freed_nonpcpu);
}

int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
	struct shrinker *shrink;

#ifdef __KERNEL__
	bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
	if (!bc->pcpu_freed)
		return -BCH_ERR_ENOMEM_fs_btree_cache_init;
#endif

	if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
		return -BCH_ERR_ENOMEM_fs_btree_cache_init;

	bc->table_init_done = true;

	shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name);
	if (!shrink)
		return -BCH_ERR_ENOMEM_fs_btree_cache_init;
	bc->shrink = shrink;
	shrink->seeks = 0;
	shrink->count_objects = bch2_btree_key_cache_count;
	shrink->scan_objects = bch2_btree_key_cache_scan;
	shrink->private_data = c;
	shrinker_register(shrink);
	return 0;
}

void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);

	printbuf_tabstop_push(out, 24);
	printbuf_tabstop_push(out, 12);

	unsigned flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);
	prt_printf(out, "keys:\t%lu\r\n", atomic_long_read(&bc->nr_keys));
	prt_printf(out, "dirty:\t%lu\r\n", atomic_long_read(&bc->nr_dirty));
	prt_printf(out, "freelist:\t%lu\r\n", atomic_long_read(&bc->nr_freed));
	prt_printf(out, "nonpcpu freelist:\t%zu\r\n", bc->nr_freed_nonpcpu);
	prt_printf(out, "pcpu freelist:\t%zu\r\n", bc->nr_freed_pcpu);

	prt_printf(out, "\nshrinker:\n");
	prt_printf(out, "requested_to_free:\t%lu\r\n", bc->requested_to_free);
	prt_printf(out, "freed:\t%lu\r\n", bc->freed);
	prt_printf(out, "moved_to_freelist:\t%lu\r\n", bc->moved_to_freelist);
	prt_printf(out, "skipped_dirty:\t%lu\r\n", bc->skipped_dirty);
	prt_printf(out, "skipped_accessed:\t%lu\r\n", bc->skipped_accessed);
	prt_printf(out, "skipped_lock_fail:\t%lu\r\n", bc->skipped_lock_fail);

	prt_printf(out, "srcu seq:\t%lu\r\n", get_state_synchronize_srcu(&c->btree_trans_barrier));

	struct bkey_cached *ck;
	unsigned iter = 0;
	list_for_each_entry(ck, &bc->freed_nonpcpu, list) {
		prt_printf(out, "freed_nonpcpu:\t%lu\r\n", ck->btree_trans_barrier_seq);
		if (++iter > 10)
			break;
	}

	iter = 0;
	list_for_each_entry(ck, &bc->freed_pcpu, list) {
		prt_printf(out, "freed_pcpu:\t%lu\r\n", ck->btree_trans_barrier_seq);
		if (++iter > 10)
			break;
	}
	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);
}

void bch2_btree_key_cache_exit(void)
{
	kmem_cache_destroy(bch2_key_cache);
}

int __init bch2_btree_key_cache_init(void)
{
	bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT);
	if (!bch2_key_cache)
		return -ENOMEM;

	return 0;
}