// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "trace.h"

#include <linux/sched/mm.h>

static inline bool btree_uses_pcpu_readers(enum btree_id id)
{
	return id == BTREE_ID_subvolumes;
}

static struct kmem_cache *bch2_key_cache;

static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
				       const void *obj)
{
	const struct bkey_cached *ck = obj;
	const struct bkey_cached_key *key = arg->key;

	return ck->key.btree_id != key->btree_id ||
		!bpos_eq(ck->key.pos, key->pos);
}

static const struct rhashtable_params bch2_btree_key_cache_params = {
	.head_offset		= offsetof(struct bkey_cached, hash),
	.key_offset		= offsetof(struct bkey_cached, key),
	.key_len		= sizeof(struct bkey_cached_key),
	.obj_cmpfn		= bch2_btree_key_cache_cmp_fn,
	.automatic_shrinking	= true,
};

__flatten
inline struct bkey_cached *
bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
{
	struct bkey_cached_key key = {
		.btree_id	= btree_id,
		.pos		= pos,
	};

	return rhashtable_lookup_fast(&c->btree_key_cache.table, &key,
				      bch2_btree_key_cache_params);
}

static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
{
	if (!six_trylock_intent(&ck->c.lock))
		return false;

	if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		six_unlock_intent(&ck->c.lock);
		return false;
	}

	if (!six_trylock_write(&ck->c.lock)) {
		six_unlock_intent(&ck->c.lock);
		return false;
	}

	return true;
}

static void bkey_cached_evict(struct btree_key_cache *c,
			      struct bkey_cached *ck)
{
	BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
				      bch2_btree_key_cache_params));
	memset(&ck->key, ~0, sizeof(ck->key));

	atomic_long_dec(&c->nr_keys);
}

static void bkey_cached_free(struct btree_key_cache *bc,
			     struct bkey_cached *ck)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);

	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));

	ck->btree_trans_barrier_seq =
		start_poll_synchronize_srcu(&c->btree_trans_barrier);

	if (ck->c.lock.readers) {
		list_move_tail(&ck->list, &bc->freed_pcpu);
		bc->nr_freed_pcpu++;
	} else {
		list_move_tail(&ck->list, &bc->freed_nonpcpu);
		bc->nr_freed_nonpcpu++;
	}
	atomic_long_inc(&bc->nr_freed);

	kfree(ck->k);
	ck->k		= NULL;
	ck->u64s	= 0;

	six_unlock_write(&ck->c.lock);
	six_unlock_intent(&ck->c.lock);
}

#ifdef __KERNEL__
static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
						   struct bkey_cached *ck)
{
	struct bkey_cached *pos;

	bc->nr_freed_nonpcpu++;

	list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
		if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
				 pos->btree_trans_barrier_seq)) {
			list_move(&ck->list, &pos->list);
			return;
		}
	}

	list_move(&ck->list, &bc->freed_nonpcpu);
}
#endif

static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
					 struct bkey_cached *ck)
{
	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));

	if (!ck->c.lock.readers) {
#ifdef __KERNEL__
		struct btree_key_cache_freelist *f;
		bool freed = false;

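		/*
		 * Fast path: stash the entry in this CPU's freelist array; if
		 * it's full, spill it down to half full onto the shared
		 * freed_nonpcpu list, which is kept ordered by SRCU barrier
		 * sequence:
		 */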
		preempt_disable();
		f = this_cpu_ptr(bc->pcpu_freed);

		if (f->nr < ARRAY_SIZE(f->objs)) {
			f->objs[f->nr++] = ck;
			freed = true;
		}
		preempt_enable();

		if (!freed) {
			mutex_lock(&bc->lock);
			preempt_disable();
			f = this_cpu_ptr(bc->pcpu_freed);

			while (f->nr > ARRAY_SIZE(f->objs) / 2) {
				struct bkey_cached *ck2 = f->objs[--f->nr];

				__bkey_cached_move_to_freelist_ordered(bc, ck2);
			}
			preempt_enable();

			__bkey_cached_move_to_freelist_ordered(bc, ck);
			mutex_unlock(&bc->lock);
		}
#else
		mutex_lock(&bc->lock);
		list_move_tail(&ck->list, &bc->freed_nonpcpu);
		bc->nr_freed_nonpcpu++;
		mutex_unlock(&bc->lock);
#endif
	} else {
		mutex_lock(&bc->lock);
		list_move_tail(&ck->list, &bc->freed_pcpu);
		bc->nr_freed_pcpu++;
		mutex_unlock(&bc->lock);
	}
}

static void bkey_cached_free_fast(struct btree_key_cache *bc,
				  struct bkey_cached *ck)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);

	ck->btree_trans_barrier_seq =
		start_poll_synchronize_srcu(&c->btree_trans_barrier);

	list_del_init(&ck->list);
	atomic_long_inc(&bc->nr_freed);

	kfree(ck->k);
	ck->k		= NULL;
	ck->u64s	= 0;

	bkey_cached_move_to_freelist(bc, ck);

	six_unlock_write(&ck->c.lock);
	six_unlock_intent(&ck->c.lock);
}

static struct bkey_cached *
bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
		  bool *was_new)
{
	struct bch_fs *c = trans->c;
	struct btree_key_cache *bc = &c->btree_key_cache;
	struct bkey_cached *ck = NULL;
	bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
	int ret;

	if (!pcpu_readers) {
#ifdef __KERNEL__
		struct btree_key_cache_freelist *f;

		preempt_disable();
		f = this_cpu_ptr(bc->pcpu_freed);
		if (f->nr)
			ck = f->objs[--f->nr];
		preempt_enable();

		if (!ck) {
			mutex_lock(&bc->lock);
			preempt_disable();
			f = this_cpu_ptr(bc->pcpu_freed);

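			/*
			 * Refill this CPU's array from the shared list, up to
			 * half its capacity, before taking an entry:
			 */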
			while (!list_empty(&bc->freed_nonpcpu) &&
			       f->nr < ARRAY_SIZE(f->objs) / 2) {
				ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
				list_del_init(&ck->list);
				bc->nr_freed_nonpcpu--;
				f->objs[f->nr++] = ck;
			}

			ck = f->nr ? f->objs[--f->nr] : NULL;
			preempt_enable();
			mutex_unlock(&bc->lock);
		}
#else
		mutex_lock(&bc->lock);
		if (!list_empty(&bc->freed_nonpcpu)) {
			ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
			list_del_init(&ck->list);
			bc->nr_freed_nonpcpu--;
		}
		mutex_unlock(&bc->lock);
#endif
	} else {
		mutex_lock(&bc->lock);
		if (!list_empty(&bc->freed_pcpu)) {
			ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
			list_del_init(&ck->list);
			bc->nr_freed_pcpu--;
		}
		mutex_unlock(&bc->lock);
	}

	if (ck) {
		ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_);
		if (unlikely(ret)) {
			bkey_cached_move_to_freelist(bc, ck);
			return ERR_PTR(ret);
		}

		path->l[0].b = (void *) ck;
		path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);

		ret = bch2_btree_node_lock_write(trans, path, &ck->c);
		if (unlikely(ret)) {
			btree_node_unlock(trans, path, 0);
			bkey_cached_move_to_freelist(bc, ck);
			return ERR_PTR(ret);
		}

		return ck;
	}

	ck = allocate_dropping_locks(trans, ret,
				     kmem_cache_zalloc(bch2_key_cache, _gfp));
	if (ret) {
		kmem_cache_free(bch2_key_cache, ck);
		return ERR_PTR(ret);
	}

	if (!ck)
		return NULL;

	INIT_LIST_HEAD(&ck->list);
	bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);

	ck->c.cached = true;
	BUG_ON(!six_trylock_intent(&ck->c.lock));
	BUG_ON(!six_trylock_write(&ck->c.lock));
	*was_new = true;
	return ck;
}

static struct bkey_cached *
bkey_cached_reuse(struct btree_key_cache *c)
{
	struct bucket_table *tbl;
	struct rhash_head *pos;
	struct bkey_cached *ck;
	unsigned i;

	mutex_lock(&c->lock);
	rcu_read_lock();
	tbl = rht_dereference_rcu(c->table.tbl, &c->table);
	for (i = 0; i < tbl->size; i++)
		rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
			if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
			    bkey_cached_lock_for_evict(ck)) {
				bkey_cached_evict(c, ck);
				goto out;
			}
		}
	ck = NULL;
out:
	rcu_read_unlock();
	mutex_unlock(&c->lock);
	return ck;
}

static struct bkey_cached *
btree_key_cache_create(struct btree_trans *trans, struct btree_path *path)
{
	struct bch_fs *c = trans->c;
	struct btree_key_cache *bc = &c->btree_key_cache;
	struct bkey_cached *ck;
	bool was_new = false;

	ck = bkey_cached_alloc(trans, path, &was_new);
	if (IS_ERR(ck))
		return ck;

	if (unlikely(!ck)) {
		ck = bkey_cached_reuse(bc);
		if (unlikely(!ck)) {
			bch_err(c, "error allocating memory for key cache item, btree %s",
				bch2_btree_id_str(path->btree_id));
			return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create);
		}

		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
	}

	ck->c.level		= 0;
	ck->c.btree_id		= path->btree_id;
	ck->key.btree_id	= path->btree_id;
	ck->key.pos		= path->pos;
	ck->valid		= false;
	ck->flags		= 1U << BKEY_CACHED_ACCESSED;

	if (unlikely(rhashtable_lookup_insert_fast(&bc->table,
						   &ck->hash,
						   bch2_btree_key_cache_params))) {
		/* We raced with another fill: */

		if (likely(was_new)) {
			six_unlock_write(&ck->c.lock);
			six_unlock_intent(&ck->c.lock);
			kfree(ck);
		} else {
			bkey_cached_free_fast(bc, ck);
		}

		mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
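		/* Returning NULL tells the caller to retry the lookup: */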
		return NULL;
	}

	atomic_long_inc(&bc->nr_keys);

	six_unlock_write(&ck->c.lock);

	return ck;
}

static int btree_key_cache_fill(struct btree_trans *trans,
				struct btree_path *ck_path,
				struct bkey_cached *ck)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	unsigned new_u64s = 0;
	struct bkey_i *new_k = NULL;
	int ret;

	bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos,
			     BTREE_ITER_key_cache_fill|
			     BTREE_ITER_cached_nofill);
	iter.flags &= ~BTREE_ITER_with_journal;
	k = bch2_btree_iter_peek_slot(&iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (!bch2_btree_node_relock(trans, ck_path, 0)) {
		trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
		ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
		goto err;
	}

	/*
	 * bch2_varint_decode can read past the end of the buffer by at
	 * most 7 bytes (it won't be used):
	 */
	new_u64s = k.k->u64s + 1;

	/*
	 * Allocate some extra space so that the transaction commit path is less
	 * likely to have to reallocate, since that requires a transaction
	 * restart:
	 */
	new_u64s = min(256U, (new_u64s * 3) / 2);

	if (new_u64s > ck->u64s) {
		new_u64s = roundup_pow_of_two(new_u64s);
		new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
		if (!new_k) {
			bch2_trans_unlock(trans);

			new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
			if (!new_k) {
				bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
					bch2_btree_id_str(ck->key.btree_id), new_u64s);
				ret = -BCH_ERR_ENOMEM_btree_key_cache_fill;
				goto err;
			}

			ret = bch2_trans_relock(trans);
			if (ret) {
				kfree(new_k);
				goto err;
			}

			if (!bch2_btree_node_relock(trans, ck_path, 0)) {
				kfree(new_k);
				trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
				ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
				goto err;
			}
		}
	}

	ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c);
	if (ret) {
		kfree(new_k);
		goto err;
	}

	if (new_k) {
		kfree(ck->k);
		ck->u64s = new_u64s;
		ck->k = new_k;
	}

	bkey_reassemble(ck->k, k);
	ck->valid = true;
	bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b);

	/* We're not likely to need this iterator again: */
	bch2_set_btree_iter_dontneed(&iter);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static noinline int
bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path,
					 unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck;
	int ret = 0;

	BUG_ON(path->level);

	path->l[1].b = NULL;

	if (bch2_btree_node_relock_notrace(trans, path, 0)) {
		ck = (void *) path->l[0].b;
		goto fill;
	}
retry:
	ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
	if (!ck) {
		ck = btree_key_cache_create(trans, path);
		ret = PTR_ERR_OR_ZERO(ck);
		if (ret)
			goto err;
		if (!ck)
			goto retry;

		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
		path->locks_want = 1;
	} else {
		enum six_lock_type lock_want = __btree_lock_want(path, 0);

		ret = btree_node_lock(trans, path, (void *) ck, 0,
				      lock_want, _THIS_IP_);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto err;

		BUG_ON(ret);

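		/*
		 * The entry may have been evicted and recycled for a
		 * different key while we waited for the lock - recheck and
		 * retry if so:
		 */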
		if (ck->key.btree_id != path->btree_id ||
		    !bpos_eq(ck->key.pos, path->pos)) {
			six_unlock_type(&ck->c.lock, lock_want);
			goto retry;
		}

		mark_btree_node_locked(trans, path, 0,
				       (enum btree_node_locked_type) lock_want);
	}

	path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
	path->l[0].b = (void *) ck;
fill:
	path->uptodate = BTREE_ITER_UPTODATE;

	if (!ck->valid && !(flags & BTREE_ITER_cached_nofill)) {
		ret = bch2_btree_path_upgrade(trans, path, 1) ?:
			btree_key_cache_fill(trans, path, ck) ?:
			bch2_btree_path_relock(trans, path, _THIS_IP_);
		if (ret)
			goto err;

		path->uptodate = BTREE_ITER_UPTODATE;
	}

	if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
		set_bit(BKEY_CACHED_ACCESSED, &ck->flags);

	BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
	BUG_ON(path->uptodate);

	return ret;
err:
	path->uptodate = BTREE_ITER_NEED_TRAVERSE;
	if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
		btree_node_unlock(trans, path, 0);
		path->l[0].b = ERR_PTR(ret);
	}
	return ret;
}

int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
				    unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck;
	int ret = 0;

	EBUG_ON(path->level);

	path->l[1].b = NULL;

	if (bch2_btree_node_relock_notrace(trans, path, 0)) {
		ck = (void *) path->l[0].b;
		goto fill;
	}
retry:
	ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
	if (!ck) {
		return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
	} else {
		enum six_lock_type lock_want = __btree_lock_want(path, 0);

		ret = btree_node_lock(trans, path, (void *) ck, 0,
				      lock_want, _THIS_IP_);
		EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart));

		if (ret)
			return ret;

		if (ck->key.btree_id != path->btree_id ||
		    !bpos_eq(ck->key.pos, path->pos)) {
			six_unlock_type(&ck->c.lock, lock_want);
			goto retry;
		}

		mark_btree_node_locked(trans, path, 0,
				       (enum btree_node_locked_type) lock_want);
	}

	path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
	path->l[0].b = (void *) ck;
fill:
	if (!ck->valid)
		return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);

	if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
		set_bit(BKEY_CACHED_ACCESSED, &ck->flags);

	path->uptodate = BTREE_ITER_UPTODATE;
	EBUG_ON(!ck->valid);
	EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));

	return ret;
}

static int btree_key_cache_flush_pos(struct btree_trans *trans,
				     struct bkey_cached_key key,
				     u64 journal_seq,
				     unsigned commit_flags,
				     bool evict)
{
	struct bch_fs *c = trans->c;
	struct journal *j = &c->journal;
	struct btree_iter c_iter, b_iter;
	struct bkey_cached *ck = NULL;
	int ret;

	bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
			     BTREE_ITER_slots|
			     BTREE_ITER_intent|
			     BTREE_ITER_all_snapshots);
	bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
			     BTREE_ITER_cached|
			     BTREE_ITER_intent);
	b_iter.flags &= ~BTREE_ITER_with_key_cache;

	ret = bch2_btree_iter_traverse(&c_iter);
	if (ret)
		goto out;

	ck = (void *) btree_iter_path(trans, &c_iter)->l[0].b;
	if (!ck)
		goto out;

	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		if (evict)
			goto evict;
		goto out;
	}

	BUG_ON(!ck->valid);

	if (journal_seq && ck->journal.seq != journal_seq)
		goto out;

	trans->journal_res.seq = ck->journal.seq;

	/*
	 * If we're at the end of the journal, we really want to free up space
	 * in the journal right away - we don't want to pin that old journal
	 * sequence number with a new btree node write, we want to re-journal
	 * the update
	 */
	if (ck->journal.seq == journal_last_seq(j))
		commit_flags |= BCH_WATERMARK_reclaim;

	if (ck->journal.seq != journal_last_seq(j) ||
	    !test_bit(JOURNAL_space_low, &c->journal.flags))
		commit_flags |= BCH_TRANS_COMMIT_no_journal_res;

	ret = bch2_btree_iter_traverse(&b_iter) ?:
		bch2_trans_update(trans, &b_iter, ck->k,
				  BTREE_UPDATE_key_cache_reclaim|
				  BTREE_UPDATE_internal_snapshot_node|
				  BTREE_TRIGGER_norun) ?:
		bch2_trans_commit(trans, NULL, NULL,
				  BCH_TRANS_COMMIT_no_check_rw|
				  BCH_TRANS_COMMIT_no_enospc|
				  commit_flags);

	bch2_fs_fatal_err_on(ret &&
			     !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
			     !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
			     !bch2_journal_error(j), c,
			     "flushing key cache: %s", bch2_err_str(ret));
	if (ret)
		goto out;

	bch2_journal_pin_drop(j, &ck->journal);

	struct btree_path *path = btree_iter_path(trans, &c_iter);
	BUG_ON(!btree_node_locked(path, 0));

	if (!evict) {
		if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
			clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
			atomic_long_dec(&c->btree_key_cache.nr_dirty);
		}
	} else {
		struct btree_path *path2;
		unsigned i;
evict:
		trans_for_each_path(trans, path2, i)
			if (path2 != path)
				__bch2_btree_path_unlock(trans, path2);

		bch2_btree_node_lock_write_nofail(trans, path, &ck->c);

		if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
			clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
			atomic_long_dec(&c->btree_key_cache.nr_dirty);
		}

		mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
		bkey_cached_evict(&c->btree_key_cache, ck);
		bkey_cached_free_fast(&c->btree_key_cache, ck);
	}
out:
	bch2_trans_iter_exit(trans, &b_iter);
	bch2_trans_iter_exit(trans, &c_iter);
	return ret;
}

int bch2_btree_key_cache_journal_flush(struct journal *j,
				       struct journal_entry_pin *pin, u64 seq)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bkey_cached *ck =
		container_of(pin, struct bkey_cached, journal);
	struct bkey_cached_key key;
	struct btree_trans *trans = bch2_trans_get(c);
	int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
	int ret = 0;

	btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read);
	key = ck->key;

	if (ck->journal.seq != seq ||
	    !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		six_unlock_read(&ck->c.lock);
		goto unlock;
	}

	if (ck->seq != seq) {
		bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal,
					bch2_btree_key_cache_journal_flush);
		six_unlock_read(&ck->c.lock);
		goto unlock;
	}
	six_unlock_read(&ck->c.lock);

	ret = lockrestart_do(trans,
		btree_key_cache_flush_pos(trans, key, seq,
				BCH_TRANS_COMMIT_journal_reclaim, false));
unlock:
	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);

	bch2_trans_put(trans);
	return ret;
}

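/*
 * Called from the transaction commit path: copy the key being inserted into
 * the cached entry, mark it dirty, and take a journal pin so that journal
 * reclaim knows to flush it back to the btree.
 */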
bool bch2_btree_insert_key_cached(struct btree_trans *trans,
				  unsigned flags,
				  struct btree_insert_entry *insert_entry)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck = (void *) (trans->paths + insert_entry->path)->l[0].b;
	struct bkey_i *insert = insert_entry->k;
	bool kick_reclaim = false;

	BUG_ON(insert->k.u64s > ck->u64s);

	bkey_copy(ck->k, insert);
	ck->valid = true;

	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
		set_bit(BKEY_CACHED_DIRTY, &ck->flags);
		atomic_long_inc(&c->btree_key_cache.nr_dirty);

		if (bch2_nr_btree_keys_need_flush(c))
			kick_reclaim = true;
	}

	/*
	 * To minimize lock contention, we only add the journal pin here and
	 * defer pin updates to the flush callback via ->seq. Be careful not to
	 * update ->seq on nojournal commits because we don't want to update the
	 * pin to a seq that doesn't include journal updates on disk. Otherwise
	 * we risk losing the update after a crash.
	 *
	 * The only exception is if the pin is not active in the first place. We
	 * have to add the pin because journal reclaim drives key cache
	 * flushing. The flush callback will not proceed unless ->seq matches
	 * the latest pin, so make sure it starts with a consistent value.
	 */
	if (!(insert_entry->flags & BTREE_UPDATE_nojournal) ||
	    !journal_pin_active(&ck->journal)) {
		ck->seq = trans->journal_res.seq;
	}
	bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
			     &ck->journal, bch2_btree_key_cache_journal_flush);

	if (kick_reclaim)
		journal_reclaim_kick(&c->journal);
	return true;
}

void bch2_btree_key_cache_drop(struct btree_trans *trans,
			       struct btree_path *path)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck = (void *) path->l[0].b;

	BUG_ON(!ck->valid);

	/*
	 * We just did an update to the btree, bypassing the key cache: the key
	 * cache key is now stale and must be dropped, even if dirty:
	 */
	if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
		atomic_long_dec(&c->btree_key_cache.nr_dirty);
		bch2_journal_pin_drop(&c->journal, &ck->journal);
	}

	ck->valid = false;
}

static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
					       struct shrink_control *sc)
{
	struct bch_fs *c = shrink->private_data;
	struct btree_key_cache *bc = &c->btree_key_cache;
	struct bucket_table *tbl;
	struct bkey_cached *ck, *t;
	size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
	unsigned start, flags;
	int srcu_idx;

	mutex_lock(&bc->lock);
	bc->requested_to_free += sc->nr_to_scan;

	srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
	flags = memalloc_nofs_save();

	/*
	 * Newest freed entries are at the end of the list - once we hit one
	 * that's too new to be freed, we can bail out:
	 */
	list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
		if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
						 ck->btree_trans_barrier_seq))
			break;

		list_del(&ck->list);
		six_lock_exit(&ck->c.lock);
		kmem_cache_free(bch2_key_cache, ck);
		atomic_long_dec(&bc->nr_freed);
		bc->nr_freed_nonpcpu--;
		bc->freed++;
	}

	list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
		if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
						 ck->btree_trans_barrier_seq))
			break;

		list_del(&ck->list);
		six_lock_exit(&ck->c.lock);
		kmem_cache_free(bch2_key_cache, ck);
		atomic_long_dec(&bc->nr_freed);
		bc->nr_freed_pcpu--;
		bc->freed++;
	}

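	/*
	 * Now walk the hash table itself, round robin from where we left off
	 * last time (bc->shrink_iter), skipping dirty and recently accessed
	 * entries:
	 */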
	rcu_read_lock();
	tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
	if (bc->shrink_iter >= tbl->size)
		bc->shrink_iter = 0;
	start = bc->shrink_iter;

	do {
		struct rhash_head *pos, *next;

		pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter));

		while (!rht_is_a_nulls(pos)) {
			next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
			ck = container_of(pos, struct bkey_cached, hash);

			if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
				bc->skipped_dirty++;
			} else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) {
				clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
				bc->skipped_accessed++;
			} else if (!bkey_cached_lock_for_evict(ck)) {
				bc->skipped_lock_fail++;
			} else {
				bkey_cached_evict(bc, ck);
				bkey_cached_free(bc, ck);
				bc->moved_to_freelist++;
				freed++;
			}

			scanned++;
			if (scanned >= nr)
				break;

			pos = next;
		}

		bc->shrink_iter++;
		if (bc->shrink_iter >= tbl->size)
			bc->shrink_iter = 0;
	} while (scanned < nr && bc->shrink_iter != start);

	rcu_read_unlock();
	memalloc_nofs_restore(flags);
	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
	mutex_unlock(&bc->lock);

	return freed;
}

static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
						struct shrink_control *sc)
{
	struct bch_fs *c = shrink->private_data;
	struct btree_key_cache *bc = &c->btree_key_cache;
	long nr = atomic_long_read(&bc->nr_keys) -
		atomic_long_read(&bc->nr_dirty);

	/*
	 * Avoid hammering our shrinker too much if it's nearly empty - the
	 * shrinker code doesn't take into account how big our cache is, if it's
	 * mostly empty but the system is under memory pressure it causes nasty
	 * lock contention:
	 */
	nr -= 128;

	return max(0L, nr);
}

void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
	struct bucket_table *tbl;
	struct bkey_cached *ck, *n;
	struct rhash_head *pos;
	LIST_HEAD(items);
	unsigned i;
#ifdef __KERNEL__
	int cpu;
#endif

	shrinker_free(bc->shrink);

	mutex_lock(&bc->lock);

	/*
	 * The loop is needed to guard against racing with rehash:
	 */
	while (atomic_long_read(&bc->nr_keys)) {
		rcu_read_lock();
		tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
		if (tbl)
			for (i = 0; i < tbl->size; i++)
				rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
					bkey_cached_evict(bc, ck);
					list_add(&ck->list, &items);
				}
		rcu_read_unlock();
	}

#ifdef __KERNEL__
	if (bc->pcpu_freed) {
		for_each_possible_cpu(cpu) {
			struct btree_key_cache_freelist *f =
				per_cpu_ptr(bc->pcpu_freed, cpu);

			for (i = 0; i < f->nr; i++) {
				ck = f->objs[i];
				list_add(&ck->list, &items);
			}
		}
	}
#endif

	BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu);
	BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu);

	list_splice(&bc->freed_pcpu,	&items);
	list_splice(&bc->freed_nonpcpu,	&items);

	mutex_unlock(&bc->lock);

	list_for_each_entry_safe(ck, n, &items, list) {
		cond_resched();

		list_del(&ck->list);
		kfree(ck->k);
		six_lock_exit(&ck->c.lock);
		kmem_cache_free(bch2_key_cache, ck);
	}

	if (atomic_long_read(&bc->nr_dirty) &&
	    !bch2_journal_error(&c->journal) &&
	    test_bit(BCH_FS_was_rw, &c->flags))
		panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n",
		      atomic_long_read(&bc->nr_dirty));

	if (atomic_long_read(&bc->nr_keys))
		panic("btree key cache shutdown error: nr_keys nonzero (%li)\n",
		      atomic_long_read(&bc->nr_keys));

	if (bc->table_init_done)
		rhashtable_destroy(&bc->table);

	free_percpu(bc->pcpu_freed);
}

void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
{
	mutex_init(&c->lock);
	INIT_LIST_HEAD(&c->freed_pcpu);
	INIT_LIST_HEAD(&c->freed_nonpcpu);
}

int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
	struct shrinker *shrink;

#ifdef __KERNEL__
	bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
	if (!bc->pcpu_freed)
		return -BCH_ERR_ENOMEM_fs_btree_cache_init;
#endif

	if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
		return -BCH_ERR_ENOMEM_fs_btree_cache_init;

	bc->table_init_done = true;

	shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name);
	if (!shrink)
		return -BCH_ERR_ENOMEM_fs_btree_cache_init;
	bc->shrink = shrink;
	shrink->count_objects	= bch2_btree_key_cache_count;
	shrink->scan_objects	= bch2_btree_key_cache_scan;
	shrink->batch		= 1 << 14;
	shrink->seeks		= 0;
	shrink->private_data	= c;
	shrinker_register(shrink);
	return 0;
}

void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);

	printbuf_tabstop_push(out, 24);
	printbuf_tabstop_push(out, 12);

	unsigned flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);
	prt_printf(out, "keys:\t%lu\r\n",		atomic_long_read(&bc->nr_keys));
	prt_printf(out, "dirty:\t%lu\r\n",		atomic_long_read(&bc->nr_dirty));
	prt_printf(out, "freelist:\t%lu\r\n",		atomic_long_read(&bc->nr_freed));
	prt_printf(out, "nonpcpu freelist:\t%zu\r\n",	bc->nr_freed_nonpcpu);
	prt_printf(out, "pcpu freelist:\t%zu\r\n",	bc->nr_freed_pcpu);

	prt_printf(out, "\nshrinker:\n");
	prt_printf(out, "requested_to_free:\t%lu\r\n",	bc->requested_to_free);
	prt_printf(out, "freed:\t%lu\r\n",		bc->freed);
	prt_printf(out, "moved_to_freelist:\t%lu\r\n",	bc->moved_to_freelist);
	prt_printf(out, "skipped_dirty:\t%lu\r\n",	bc->skipped_dirty);
	prt_printf(out, "skipped_accessed:\t%lu\r\n",	bc->skipped_accessed);
	prt_printf(out, "skipped_lock_fail:\t%lu\r\n",	bc->skipped_lock_fail);

	prt_printf(out, "srcu seq:\t%lu\r\n",		get_state_synchronize_srcu(&c->btree_trans_barrier));

	struct bkey_cached *ck;
	unsigned iter = 0;
	list_for_each_entry(ck, &bc->freed_nonpcpu, list) {
		prt_printf(out, "freed_nonpcpu:\t%lu\r\n", ck->btree_trans_barrier_seq);
		if (++iter > 10)
			break;
	}

	iter = 0;
	list_for_each_entry(ck, &bc->freed_pcpu, list) {
		prt_printf(out, "freed_pcpu:\t%lu\r\n", ck->btree_trans_barrier_seq);
		if (++iter > 10)
			break;
	}
	mutex_unlock(&bc->lock);
	memalloc_flags_restore(flags);
}

void bch2_btree_key_cache_exit(void)
{
	kmem_cache_destroy(bch2_key_cache);
}

int __init bch2_btree_key_cache_init(void)
{
	bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT);
	if (!bch2_key_cache)
		return -ENOMEM;

	return 0;
}