/* xref: /linux/fs/bcachefs/btree_key_cache.c (revision af8e51644a70f612974a6e767fa7d896d3c23f88) */
// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "trace.h"

#include <linux/sched/mm.h>

static inline bool btree_uses_pcpu_readers(enum btree_id id)
{
	return id == BTREE_ID_subvolumes;
}

static struct kmem_cache *bch2_key_cache;

static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
				       const void *obj)
{
	const struct bkey_cached *ck = obj;
	const struct bkey_cached_key *key = arg->key;

	return ck->key.btree_id != key->btree_id ||
		!bpos_eq(ck->key.pos, key->pos);
}

static const struct rhashtable_params bch2_btree_key_cache_params = {
	.head_offset		= offsetof(struct bkey_cached, hash),
	.key_offset		= offsetof(struct bkey_cached, key),
	.key_len		= sizeof(struct bkey_cached_key),
	.obj_cmpfn		= bch2_btree_key_cache_cmp_fn,
	.automatic_shrinking	= true,
};

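/*
 * Hash table lookup for a cached key: returns the bkey_cached for the given
 * btree/pos, or NULL if it isn't present in the key cache. The lookup itself
 * is lockless; no locks on the returned entry are taken here.
 */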
__flatten
inline struct bkey_cached *
bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
{
	struct bkey_cached_key key = {
		.btree_id	= btree_id,
		.pos		= pos,
	};

	return rhashtable_lookup_fast(&c->btree_key_cache.table, &key,
				      bch2_btree_key_cache_params);
}

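/*
 * Try to take the locks needed to evict @ck: both intent and write locks must
 * be acquired without blocking, and dirty entries are never evicted. Returns
 * true with the entry locked for write on success.
 */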
static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
{
	if (!six_trylock_intent(&ck->c.lock))
		return false;

	if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		six_unlock_intent(&ck->c.lock);
		return false;
	}

	if (!six_trylock_write(&ck->c.lock)) {
		six_unlock_intent(&ck->c.lock);
		return false;
	}

	return true;
}

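/*
 * Remove @ck from the key cache hash table and poison its key so it can no
 * longer be found; the caller must hold the entry locked for write. Freeing
 * or recycling the entry itself happens separately.
 */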
static void bkey_cached_evict(struct btree_key_cache *c,
			      struct bkey_cached *ck)
{
	BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
				      bch2_btree_key_cache_params));
	memset(&ck->key, ~0, sizeof(ck->key));

	atomic_long_dec(&c->nr_keys);
}

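/*
 * Move an evicted entry to the appropriate freed list (pcpu vs. non-pcpu
 * readers), recording an SRCU sequence number so it isn't reclaimed until
 * concurrent btree_trans can no longer be holding a reference to it. Called
 * from the shrinker with bc->lock held and the entry locked for write.
 */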
static void bkey_cached_free(struct btree_key_cache *bc,
			     struct bkey_cached *ck)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);

	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));

	ck->btree_trans_barrier_seq =
		start_poll_synchronize_srcu(&c->btree_trans_barrier);

	if (ck->c.lock.readers) {
		list_move_tail(&ck->list, &bc->freed_pcpu);
		bc->nr_freed_pcpu++;
	} else {
		list_move_tail(&ck->list, &bc->freed_nonpcpu);
		bc->nr_freed_nonpcpu++;
	}
	atomic_long_inc(&bc->nr_freed);

	kfree(ck->k);
	ck->k		= NULL;
	ck->u64s	= 0;

	six_unlock_write(&ck->c.lock);
	six_unlock_intent(&ck->c.lock);
}

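/*
 * Insert @ck into the non-pcpu freed list, keeping the list sorted by SRCU
 * barrier sequence number: the oldest entries (the first to become safe to
 * reclaim) stay at the front, the newest at the tail.
 */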
#ifdef __KERNEL__
static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
						   struct bkey_cached *ck)
{
	struct bkey_cached *pos;

	bc->nr_freed_nonpcpu++;

	list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
		if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
				 pos->btree_trans_barrier_seq)) {
			list_move(&ck->list, &pos->list);
			return;
		}
	}

	list_move(&ck->list, &bc->freed_nonpcpu);
}
#endif

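/*
 * Stick a freed entry on a freelist for reuse. Entries that don't use pcpu
 * reader locks go to a small per-cpu array first, avoiding bc->lock in the
 * common case; when that array fills up, half of it is spilled onto the
 * shared freed_nonpcpu list in barrier-sequence order. Entries with pcpu
 * readers always go to the shared freed_pcpu list.
 */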
static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
					 struct bkey_cached *ck)
{
	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));

	if (!ck->c.lock.readers) {
#ifdef __KERNEL__
		struct btree_key_cache_freelist *f;
		bool freed = false;

		preempt_disable();
		f = this_cpu_ptr(bc->pcpu_freed);

		if (f->nr < ARRAY_SIZE(f->objs)) {
			f->objs[f->nr++] = ck;
			freed = true;
		}
		preempt_enable();

		if (!freed) {
			mutex_lock(&bc->lock);
			preempt_disable();
			f = this_cpu_ptr(bc->pcpu_freed);

			while (f->nr > ARRAY_SIZE(f->objs) / 2) {
				struct bkey_cached *ck2 = f->objs[--f->nr];

				__bkey_cached_move_to_freelist_ordered(bc, ck2);
			}
			preempt_enable();

			__bkey_cached_move_to_freelist_ordered(bc, ck);
			mutex_unlock(&bc->lock);
		}
#else
		mutex_lock(&bc->lock);
		list_move_tail(&ck->list, &bc->freed_nonpcpu);
		bc->nr_freed_nonpcpu++;
		mutex_unlock(&bc->lock);
#endif
	} else {
		mutex_lock(&bc->lock);
		list_move_tail(&ck->list, &bc->freed_pcpu);
		bc->nr_freed_pcpu++;
		mutex_unlock(&bc->lock);
	}
}

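/*
 * Fast-path free: record the SRCU barrier sequence, free the cached key's
 * memory, move the entry to a freelist and drop its locks. Unlike
 * bkey_cached_free(), this is called without bc->lock held.
 */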
static void bkey_cached_free_fast(struct btree_key_cache *bc,
				  struct bkey_cached *ck)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);

	ck->btree_trans_barrier_seq =
		start_poll_synchronize_srcu(&c->btree_trans_barrier);

	list_del_init(&ck->list);
	atomic_long_inc(&bc->nr_freed);

	kfree(ck->k);
	ck->k		= NULL;
	ck->u64s	= 0;

	bkey_cached_move_to_freelist(bc, ck);

	six_unlock_write(&ck->c.lock);
	six_unlock_intent(&ck->c.lock);
}

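/*
 * Allocate a bkey_cached for @path, preferring entries from the freelists
 * (per-cpu array first, then the shared lists). Recycled entries are relocked
 * through the normal btree locking paths, so this may return a transaction
 * restart error; a freshly allocated entry is returned already intent- and
 * write-locked with *was_new set. Returns NULL if allocation fails, in which
 * case the caller falls back to stealing an entry from the hash table.
 */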
static struct bkey_cached *
bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
		  bool *was_new)
{
	struct bch_fs *c = trans->c;
	struct btree_key_cache *bc = &c->btree_key_cache;
	struct bkey_cached *ck = NULL;
	bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
	int ret;

	if (!pcpu_readers) {
#ifdef __KERNEL__
		struct btree_key_cache_freelist *f;

		preempt_disable();
		f = this_cpu_ptr(bc->pcpu_freed);
		if (f->nr)
			ck = f->objs[--f->nr];
		preempt_enable();

		if (!ck) {
			mutex_lock(&bc->lock);
			preempt_disable();
			f = this_cpu_ptr(bc->pcpu_freed);

			while (!list_empty(&bc->freed_nonpcpu) &&
			       f->nr < ARRAY_SIZE(f->objs) / 2) {
				ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
				list_del_init(&ck->list);
				bc->nr_freed_nonpcpu--;
				f->objs[f->nr++] = ck;
			}

			ck = f->nr ? f->objs[--f->nr] : NULL;
			preempt_enable();
			mutex_unlock(&bc->lock);
		}
#else
		mutex_lock(&bc->lock);
		if (!list_empty(&bc->freed_nonpcpu)) {
			ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
			list_del_init(&ck->list);
			bc->nr_freed_nonpcpu--;
		}
		mutex_unlock(&bc->lock);
#endif
	} else {
		mutex_lock(&bc->lock);
		if (!list_empty(&bc->freed_pcpu)) {
			ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
			list_del_init(&ck->list);
			bc->nr_freed_pcpu--;
		}
		mutex_unlock(&bc->lock);
	}

	if (ck) {
		ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_);
		if (unlikely(ret)) {
			bkey_cached_move_to_freelist(bc, ck);
			return ERR_PTR(ret);
		}

		path->l[0].b = (void *) ck;
		path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);

		ret = bch2_btree_node_lock_write(trans, path, &ck->c);
		if (unlikely(ret)) {
			btree_node_unlock(trans, path, 0);
			bkey_cached_move_to_freelist(bc, ck);
			return ERR_PTR(ret);
		}

		return ck;
	}

	ck = allocate_dropping_locks(trans, ret,
			kmem_cache_zalloc(bch2_key_cache, _gfp));
	if (ret) {
		kmem_cache_free(bch2_key_cache, ck);
		return ERR_PTR(ret);
	}

	if (!ck)
		return NULL;

	INIT_LIST_HEAD(&ck->list);
	bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);

	ck->c.cached = true;
	BUG_ON(!six_trylock_intent(&ck->c.lock));
	BUG_ON(!six_trylock_write(&ck->c.lock));
	*was_new = true;
	return ck;
}

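/*
 * Steal a clean entry straight out of the hash table when both the freelists
 * and the allocator have come up empty: walk the table under RCU and evict
 * the first entry we can trylock.
 */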
static struct bkey_cached *
bkey_cached_reuse(struct btree_key_cache *c)
{
	struct bucket_table *tbl;
	struct rhash_head *pos;
	struct bkey_cached *ck;
	unsigned i;

	mutex_lock(&c->lock);
	rcu_read_lock();
	tbl = rht_dereference_rcu(c->table.tbl, &c->table);
	for (i = 0; i < tbl->size; i++)
		rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
			if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
			    bkey_cached_lock_for_evict(ck)) {
				bkey_cached_evict(c, ck);
				goto out;
			}
		}
	ck = NULL;
out:
	rcu_read_unlock();
	mutex_unlock(&c->lock);
	return ck;
}

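/*
 * Allocate (or reuse) a bkey_cached for @path and insert it into the key
 * cache hash table. Returns NULL if we raced with another thread inserting
 * the same key, in which case the caller retries the lookup. On success the
 * new entry is returned intent-locked, marked invalid, with no key buffer
 * allocated yet.
 */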
static struct bkey_cached *
btree_key_cache_create(struct btree_trans *trans, struct btree_path *path)
{
	struct bch_fs *c = trans->c;
	struct btree_key_cache *bc = &c->btree_key_cache;
	struct bkey_cached *ck;
	bool was_new = false;

	ck = bkey_cached_alloc(trans, path, &was_new);
	if (IS_ERR(ck))
		return ck;

	if (unlikely(!ck)) {
		ck = bkey_cached_reuse(bc);
		if (unlikely(!ck)) {
			bch_err(c, "error allocating memory for key cache item, btree %s",
				bch2_btree_id_str(path->btree_id));
			return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create);
		}

		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
	}

	ck->c.level		= 0;
	ck->c.btree_id		= path->btree_id;
	ck->key.btree_id	= path->btree_id;
	ck->key.pos		= path->pos;
	ck->valid		= false;
	ck->flags		= 1U << BKEY_CACHED_ACCESSED;

	if (unlikely(rhashtable_lookup_insert_fast(&bc->table,
					  &ck->hash,
					  bch2_btree_key_cache_params))) {
		/* We raced with another fill: */

		if (likely(was_new)) {
			six_unlock_write(&ck->c.lock);
			six_unlock_intent(&ck->c.lock);
			kfree(ck);
		} else {
			bkey_cached_free_fast(bc, ck);
		}

		mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
		return NULL;
	}

	atomic_long_inc(&bc->nr_keys);

	six_unlock_write(&ck->c.lock);

	return ck;
}

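/*
 * Populate a key cache entry from the underlying btree: peek the key at
 * ck->key.pos, allocate (or grow) ck->k with some slack for future updates,
 * and copy the key in under the entry's write lock. Dropping locks for a
 * blocking allocation may require a transaction restart.
 */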
static int btree_key_cache_fill(struct btree_trans *trans,
				struct btree_path *ck_path,
				struct bkey_cached *ck)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	unsigned new_u64s = 0;
	struct bkey_i *new_k = NULL;
	int ret;

	bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos,
			     BTREE_ITER_key_cache_fill|
			     BTREE_ITER_cached_nofill);
	iter.flags &= ~BTREE_ITER_with_journal;
	k = bch2_btree_iter_peek_slot(&iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (!bch2_btree_node_relock(trans, ck_path, 0)) {
		trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
		ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
		goto err;
	}

	/*
	 * bch2_varint_decode can read past the end of the buffer by at
	 * most 7 bytes (it won't be used):
	 */
	new_u64s = k.k->u64s + 1;

	/*
	 * Allocate some extra space so that the transaction commit path is less
	 * likely to have to reallocate, since that requires a transaction
	 * restart:
	 */
	new_u64s = min(256U, (new_u64s * 3) / 2);

	if (new_u64s > ck->u64s) {
		new_u64s = roundup_pow_of_two(new_u64s);
		new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
		if (!new_k) {
			bch2_trans_unlock(trans);

			new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
			if (!new_k) {
				bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
					bch2_btree_id_str(ck->key.btree_id), new_u64s);
				ret = -BCH_ERR_ENOMEM_btree_key_cache_fill;
				goto err;
			}

			ret = bch2_trans_relock(trans);
			if (ret) {
				kfree(new_k);
				goto err;
			}

			if (!bch2_btree_node_relock(trans, ck_path, 0)) {
				kfree(new_k);
				trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
				ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
				goto err;
			}
		}
	}

	ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c);
	if (ret) {
		kfree(new_k);
		goto err;
	}

	if (new_k) {
		kfree(ck->k);
		ck->u64s = new_u64s;
		ck->k = new_k;
	}

	bkey_reassemble(ck->k, k);
	ck->valid = true;
	bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b);

	/* We're not likely to need this iterator again: */
	bch2_set_btree_iter_dontneed(&iter);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

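/*
 * Slow path for traversing to a key cache entry: create the entry if it
 * doesn't exist yet, take the appropriate lock, and fill it from the btree
 * if it isn't valid (unless the caller passed BTREE_ITER_cached_nofill).
 */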
static noinline int
bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path,
					 unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck;
	int ret = 0;

	BUG_ON(path->level);

	path->l[1].b = NULL;

	if (bch2_btree_node_relock_notrace(trans, path, 0)) {
		ck = (void *) path->l[0].b;
		goto fill;
	}
retry:
	ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
	if (!ck) {
		ck = btree_key_cache_create(trans, path);
		ret = PTR_ERR_OR_ZERO(ck);
		if (ret)
			goto err;
		if (!ck)
			goto retry;

		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
		path->locks_want = 1;
	} else {
		enum six_lock_type lock_want = __btree_lock_want(path, 0);

		ret = btree_node_lock(trans, path, (void *) ck, 0,
				      lock_want, _THIS_IP_);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto err;

		BUG_ON(ret);

		if (ck->key.btree_id != path->btree_id ||
		    !bpos_eq(ck->key.pos, path->pos)) {
			six_unlock_type(&ck->c.lock, lock_want);
			goto retry;
		}

		mark_btree_node_locked(trans, path, 0,
				       (enum btree_node_locked_type) lock_want);
	}

	path->l[0].lock_seq	= six_lock_seq(&ck->c.lock);
	path->l[0].b		= (void *) ck;
fill:
	path->uptodate = BTREE_ITER_UPTODATE;

	if (!ck->valid && !(flags & BTREE_ITER_cached_nofill)) {
		ret =   bch2_btree_path_upgrade(trans, path, 1) ?:
			btree_key_cache_fill(trans, path, ck) ?:
			bch2_btree_path_relock(trans, path, _THIS_IP_);
		if (ret)
			goto err;

		path->uptodate = BTREE_ITER_UPTODATE;
	}

	if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
		set_bit(BKEY_CACHED_ACCESSED, &ck->flags);

	BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
	BUG_ON(path->uptodate);

	return ret;
err:
	path->uptodate = BTREE_ITER_NEED_TRAVERSE;
	if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
		btree_node_unlock(trans, path, 0);
		path->l[0].b = ERR_PTR(ret);
	}
	return ret;
}

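/*
 * Fast path for traversing to a key cache entry: if the entry already exists
 * and is valid we just look it up and lock it; anything that needs to create
 * or fill an entry is punted to the slowpath above.
 */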
int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
				    unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck;
	int ret = 0;

	EBUG_ON(path->level);

	path->l[1].b = NULL;

	if (bch2_btree_node_relock_notrace(trans, path, 0)) {
		ck = (void *) path->l[0].b;
		goto fill;
	}
retry:
	ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
	if (!ck) {
		return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
	} else {
		enum six_lock_type lock_want = __btree_lock_want(path, 0);

		ret = btree_node_lock(trans, path, (void *) ck, 0,
				      lock_want, _THIS_IP_);
		EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart));

		if (ret)
			return ret;

		if (ck->key.btree_id != path->btree_id ||
		    !bpos_eq(ck->key.pos, path->pos)) {
			six_unlock_type(&ck->c.lock, lock_want);
			goto retry;
		}

		mark_btree_node_locked(trans, path, 0,
				       (enum btree_node_locked_type) lock_want);
	}

	path->l[0].lock_seq	= six_lock_seq(&ck->c.lock);
	path->l[0].b		= (void *) ck;
fill:
	if (!ck->valid)
		return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);

	if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
		set_bit(BKEY_CACHED_ACCESSED, &ck->flags);

	path->uptodate = BTREE_ITER_UPTODATE;
	EBUG_ON(!ck->valid);
	EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));

	return ret;
}

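/*
 * Flush a single dirty key cache entry back to the btree: commit the cached
 * key as an update to the underlying btree (bypassing the key cache), then
 * drop its journal pin and clear the dirty bit. If @evict, the entry is also
 * removed from the hash table and freed. If @journal_seq is nonzero, only
 * flush if the entry is still pinning that sequence number.
 */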
static int btree_key_cache_flush_pos(struct btree_trans *trans,
				     struct bkey_cached_key key,
				     u64 journal_seq,
				     unsigned commit_flags,
				     bool evict)
{
	struct bch_fs *c = trans->c;
	struct journal *j = &c->journal;
	struct btree_iter c_iter, b_iter;
	struct bkey_cached *ck = NULL;
	int ret;

	bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
			     BTREE_ITER_slots|
			     BTREE_ITER_intent|
			     BTREE_ITER_all_snapshots);
	bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
			     BTREE_ITER_cached|
			     BTREE_ITER_intent);
	b_iter.flags &= ~BTREE_ITER_with_key_cache;

	ret = bch2_btree_iter_traverse(&c_iter);
	if (ret)
		goto out;

	ck = (void *) btree_iter_path(trans, &c_iter)->l[0].b;
	if (!ck)
		goto out;

	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		if (evict)
			goto evict;
		goto out;
	}

	BUG_ON(!ck->valid);

	if (journal_seq && ck->journal.seq != journal_seq)
		goto out;

	trans->journal_res.seq = ck->journal.seq;

	/*
	 * If we're at the end of the journal, we really want to free up space
	 * in the journal right away - we don't want to pin that old journal
	 * sequence number with a new btree node write, we want to re-journal
	 * the update
	 */
	if (ck->journal.seq == journal_last_seq(j))
		commit_flags |= BCH_WATERMARK_reclaim;

	if (ck->journal.seq != journal_last_seq(j) ||
	    !test_bit(JOURNAL_space_low, &c->journal.flags))
		commit_flags |= BCH_TRANS_COMMIT_no_journal_res;

	ret   = bch2_btree_iter_traverse(&b_iter) ?:
		bch2_trans_update(trans, &b_iter, ck->k,
				  BTREE_UPDATE_key_cache_reclaim|
				  BTREE_UPDATE_internal_snapshot_node|
				  BTREE_TRIGGER_norun) ?:
		bch2_trans_commit(trans, NULL, NULL,
				  BCH_TRANS_COMMIT_no_check_rw|
				  BCH_TRANS_COMMIT_no_enospc|
				  commit_flags);

	bch2_fs_fatal_err_on(ret &&
			     !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
			     !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
			     !bch2_journal_error(j), c,
			     "flushing key cache: %s", bch2_err_str(ret));
	if (ret)
		goto out;

	bch2_journal_pin_drop(j, &ck->journal);

	struct btree_path *path = btree_iter_path(trans, &c_iter);
	BUG_ON(!btree_node_locked(path, 0));

	if (!evict) {
		if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
			clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
			atomic_long_dec(&c->btree_key_cache.nr_dirty);
		}
	} else {
		struct btree_path *path2;
		unsigned i;
evict:
		trans_for_each_path(trans, path2, i)
			if (path2 != path)
				__bch2_btree_path_unlock(trans, path2);

		bch2_btree_node_lock_write_nofail(trans, path, &ck->c);

		if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
			clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
			atomic_long_dec(&c->btree_key_cache.nr_dirty);
		}

		mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
		bkey_cached_evict(&c->btree_key_cache, ck);
		bkey_cached_free_fast(&c->btree_key_cache, ck);
	}
out:
	bch2_trans_iter_exit(trans, &b_iter);
	bch2_trans_iter_exit(trans, &c_iter);
	return ret;
}

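/*
 * Journal pin flush callback, called by journal reclaim when the sequence
 * number pinned by this entry needs to be released. Verifies the pin still
 * matches @seq (moving it to ck->seq if the entry has since been redirtied
 * at a newer sequence number), then flushes the entry.
 */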
int bch2_btree_key_cache_journal_flush(struct journal *j,
				struct journal_entry_pin *pin, u64 seq)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bkey_cached *ck =
		container_of(pin, struct bkey_cached, journal);
	struct bkey_cached_key key;
	struct btree_trans *trans = bch2_trans_get(c);
	int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
	int ret = 0;

	btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read);
	key = ck->key;

	if (ck->journal.seq != seq ||
	    !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		six_unlock_read(&ck->c.lock);
		goto unlock;
	}

	if (ck->seq != seq) {
		bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal,
					bch2_btree_key_cache_journal_flush);
		six_unlock_read(&ck->c.lock);
		goto unlock;
	}
	six_unlock_read(&ck->c.lock);

	ret = lockrestart_do(trans,
		btree_key_cache_flush_pos(trans, key, seq,
				BCH_TRANS_COMMIT_journal_reclaim, false));
unlock:
	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);

	bch2_trans_put(trans);
	return ret;
}

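/*
 * Commit path helper: overwrite the cached key with @insert and mark the
 * entry dirty, taking a journal pin on the current commit's sequence number
 * so that journal reclaim knows to flush it later.
 */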
bool bch2_btree_insert_key_cached(struct btree_trans *trans,
				  unsigned flags,
				  struct btree_insert_entry *insert_entry)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck = (void *) (trans->paths + insert_entry->path)->l[0].b;
	struct bkey_i *insert = insert_entry->k;
	bool kick_reclaim = false;

	BUG_ON(insert->k.u64s > ck->u64s);

	bkey_copy(ck->k, insert);
	ck->valid = true;

	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
		set_bit(BKEY_CACHED_DIRTY, &ck->flags);
		atomic_long_inc(&c->btree_key_cache.nr_dirty);

		if (bch2_nr_btree_keys_need_flush(c))
			kick_reclaim = true;
	}

	/*
	 * To minimize lock contention, we only add the journal pin here and
	 * defer pin updates to the flush callback via ->seq. Be careful not to
	 * update ->seq on nojournal commits because we don't want to update the
	 * pin to a seq that doesn't include journal updates on disk. Otherwise
	 * we risk losing the update after a crash.
	 *
	 * The only exception is if the pin is not active in the first place. We
	 * have to add the pin because journal reclaim drives key cache
	 * flushing. The flush callback will not proceed unless ->seq matches
	 * the latest pin, so make sure it starts with a consistent value.
	 */
	if (!(insert_entry->flags & BTREE_UPDATE_nojournal) ||
	    !journal_pin_active(&ck->journal)) {
		ck->seq = trans->journal_res.seq;
	}
	bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
			     &ck->journal, bch2_btree_key_cache_journal_flush);

	if (kick_reclaim)
		journal_reclaim_kick(&c->journal);
	return true;
}

void bch2_btree_key_cache_drop(struct btree_trans *trans,
			       struct btree_path *path)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck = (void *) path->l[0].b;

	BUG_ON(!ck->valid);

	/*
	 * We just did an update to the btree, bypassing the key cache: the key
	 * cache key is now stale and must be dropped, even if dirty:
	 */
	if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
		atomic_long_dec(&c->btree_key_cache.nr_dirty);
		bch2_journal_pin_drop(&c->journal, &ck->journal);
	}

	ck->valid = false;
}

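/*
 * Shrinker scan callback: first reap entries on the freed lists whose SRCU
 * grace period has elapsed, then walk the hash table from where the previous
 * scan left off, evicting clean entries that haven't been accessed since the
 * last pass.
 */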
static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
					   struct shrink_control *sc)
{
	struct bch_fs *c = shrink->private_data;
	struct btree_key_cache *bc = &c->btree_key_cache;
	struct bucket_table *tbl;
	struct bkey_cached *ck, *t;
	size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
	unsigned start, flags;
	int srcu_idx;

	mutex_lock(&bc->lock);
	bc->requested_to_free += sc->nr_to_scan;

	srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
	flags = memalloc_nofs_save();

	/*
	 * Newest freed entries are at the end of the list - once we hit one
	 * that's too new to be freed, we can bail out:
	 */
	list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
		if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
						 ck->btree_trans_barrier_seq))
			break;

		list_del(&ck->list);
		six_lock_exit(&ck->c.lock);
		kmem_cache_free(bch2_key_cache, ck);
		atomic_long_dec(&bc->nr_freed);
		bc->nr_freed_nonpcpu--;
		bc->freed++;
	}

	list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
		if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
						 ck->btree_trans_barrier_seq))
			break;

		list_del(&ck->list);
		six_lock_exit(&ck->c.lock);
		kmem_cache_free(bch2_key_cache, ck);
		atomic_long_dec(&bc->nr_freed);
		bc->nr_freed_pcpu--;
		bc->freed++;
	}

	rcu_read_lock();
	tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
	if (bc->shrink_iter >= tbl->size)
		bc->shrink_iter = 0;
	start = bc->shrink_iter;

	do {
		struct rhash_head *pos, *next;

		pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter));

		while (!rht_is_a_nulls(pos)) {
			next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
			ck = container_of(pos, struct bkey_cached, hash);

			if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
				bc->skipped_dirty++;
			} else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) {
				clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
				bc->skipped_accessed++;
			} else if (!bkey_cached_lock_for_evict(ck)) {
				bc->skipped_lock_fail++;
			} else {
				bkey_cached_evict(bc, ck);
				bkey_cached_free(bc, ck);
				bc->moved_to_freelist++;
				freed++;
			}

			scanned++;
			if (scanned >= nr)
				break;

			pos = next;
		}

		bc->shrink_iter++;
		if (bc->shrink_iter >= tbl->size)
			bc->shrink_iter = 0;
	} while (scanned < nr && bc->shrink_iter != start);

	rcu_read_unlock();
	memalloc_nofs_restore(flags);
	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
	mutex_unlock(&bc->lock);

	return freed;
}

static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
					    struct shrink_control *sc)
{
	struct bch_fs *c = shrink->private_data;
	struct btree_key_cache *bc = &c->btree_key_cache;
	long nr = atomic_long_read(&bc->nr_keys) -
		atomic_long_read(&bc->nr_dirty);

	/*
	 * Avoid hammering our shrinker too much if it's nearly empty - the
	 * shrinker code doesn't take into account how big our cache is, if it's
	 * mostly empty but the system is under memory pressure it causes nasty
	 * lock contention:
	 */
	nr -= 128;

	return max(0L, nr);
}

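/*
 * Filesystem shutdown: evict everything remaining in the hash table, drain
 * the per-cpu and shared freelists, free the entries, and tear down the hash
 * table. By this point there should be no dirty entries left.
 */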
void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
	struct bucket_table *tbl;
	struct bkey_cached *ck, *n;
	struct rhash_head *pos;
	LIST_HEAD(items);
	unsigned i;
#ifdef __KERNEL__
	int cpu;
#endif

	shrinker_free(bc->shrink);

	mutex_lock(&bc->lock);

	/*
	 * The loop is needed to guard against racing with rehash:
	 */
	while (atomic_long_read(&bc->nr_keys)) {
		rcu_read_lock();
		tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
		if (tbl)
			for (i = 0; i < tbl->size; i++)
				rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
					bkey_cached_evict(bc, ck);
					list_add(&ck->list, &items);
				}
		rcu_read_unlock();
	}

#ifdef __KERNEL__
	if (bc->pcpu_freed) {
		for_each_possible_cpu(cpu) {
			struct btree_key_cache_freelist *f =
				per_cpu_ptr(bc->pcpu_freed, cpu);

			for (i = 0; i < f->nr; i++) {
				ck = f->objs[i];
				list_add(&ck->list, &items);
			}
		}
	}
#endif

	BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu);
	BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu);

	list_splice(&bc->freed_pcpu,	&items);
	list_splice(&bc->freed_nonpcpu,	&items);

	mutex_unlock(&bc->lock);

	list_for_each_entry_safe(ck, n, &items, list) {
		cond_resched();

		list_del(&ck->list);
		kfree(ck->k);
		six_lock_exit(&ck->c.lock);
		kmem_cache_free(bch2_key_cache, ck);
	}

	if (atomic_long_read(&bc->nr_dirty) &&
	    !bch2_journal_error(&c->journal) &&
	    test_bit(BCH_FS_was_rw, &c->flags))
		panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n",
		      atomic_long_read(&bc->nr_dirty));

	if (atomic_long_read(&bc->nr_keys))
		panic("btree key cache shutdown error: nr_keys nonzero (%li)\n",
		      atomic_long_read(&bc->nr_keys));

	if (bc->table_init_done)
		rhashtable_destroy(&bc->table);

	free_percpu(bc->pcpu_freed);
}

void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
{
	mutex_init(&c->lock);
	INIT_LIST_HEAD(&c->freed_pcpu);
	INIT_LIST_HEAD(&c->freed_nonpcpu);
}

int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
	struct shrinker *shrink;

#ifdef __KERNEL__
	bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
	if (!bc->pcpu_freed)
		return -BCH_ERR_ENOMEM_fs_btree_cache_init;
#endif

	if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
		return -BCH_ERR_ENOMEM_fs_btree_cache_init;

	bc->table_init_done = true;

	shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name);
	if (!shrink)
		return -BCH_ERR_ENOMEM_fs_btree_cache_init;
	bc->shrink = shrink;
	shrink->count_objects	= bch2_btree_key_cache_count;
	shrink->scan_objects	= bch2_btree_key_cache_scan;
	shrink->batch		= 1 << 14;
	shrink->seeks		= 0;
	shrink->private_data	= c;
	shrinker_register(shrink);
	return 0;
}

void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);

	printbuf_tabstop_push(out, 24);
	printbuf_tabstop_push(out, 12);

	unsigned flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);
	prt_printf(out, "keys:\t%lu\r\n",		atomic_long_read(&bc->nr_keys));
	prt_printf(out, "dirty:\t%lu\r\n",		atomic_long_read(&bc->nr_dirty));
	prt_printf(out, "freelist:\t%lu\r\n",		atomic_long_read(&bc->nr_freed));
	prt_printf(out, "nonpcpu freelist:\t%zu\r\n",	bc->nr_freed_nonpcpu);
	prt_printf(out, "pcpu freelist:\t%zu\r\n",	bc->nr_freed_pcpu);

	prt_printf(out, "\nshrinker:\n");
	prt_printf(out, "requested_to_free:\t%lu\r\n",	bc->requested_to_free);
	prt_printf(out, "freed:\t%lu\r\n",		bc->freed);
	prt_printf(out, "moved_to_freelist:\t%lu\r\n",	bc->moved_to_freelist);
	prt_printf(out, "skipped_dirty:\t%lu\r\n",	bc->skipped_dirty);
	prt_printf(out, "skipped_accessed:\t%lu\r\n",	bc->skipped_accessed);
	prt_printf(out, "skipped_lock_fail:\t%lu\r\n",	bc->skipped_lock_fail);

	prt_printf(out, "srcu seq:\t%lu\r\n",		get_state_synchronize_srcu(&c->btree_trans_barrier));

	struct bkey_cached *ck;
	unsigned iter = 0;
	list_for_each_entry(ck, &bc->freed_nonpcpu, list) {
		prt_printf(out, "freed_nonpcpu:\t%lu\r\n", ck->btree_trans_barrier_seq);
		if (++iter > 10)
			break;
	}

	iter = 0;
	list_for_each_entry(ck, &bc->freed_pcpu, list) {
		prt_printf(out, "freed_pcpu:\t%lu\r\n", ck->btree_trans_barrier_seq);
		if (++iter > 10)
			break;
	}
	mutex_unlock(&bc->lock);
	memalloc_flags_restore(flags);
}

void bch2_btree_key_cache_exit(void)
{
	kmem_cache_destroy(bch2_key_cache);
}

int __init bch2_btree_key_cache_init(void)
{
	bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT);
	if (!bch2_key_cache)
		return -ENOMEM;

	return 0;
}