xref: /linux/fs/bcachefs/btree_cache.c (revision 566ab427f827b0256d3e8ce0235d088e6a9c28bd)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "bcachefs.h"
4 #include "bbpos.h"
5 #include "bkey_buf.h"
6 #include "btree_cache.h"
7 #include "btree_io.h"
8 #include "btree_iter.h"
9 #include "btree_locking.h"
10 #include "debug.h"
11 #include "errcode.h"
12 #include "error.h"
13 #include "journal.h"
14 #include "trace.h"
15 
16 #include <linux/prefetch.h>
17 #include <linux/sched/mm.h>
18 #include <linux/swap.h>
19 
20 #define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
21 do {						 \
22 	if (shrinker_counter)			 \
23 		bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_##counter]++;	 \
24 } while (0)
25 
26 const char * const bch2_btree_node_flags[] = {
27 #define x(f)	#f,
28 	BTREE_FLAGS()
29 #undef x
30 	NULL
31 };
32 
33 void bch2_recalc_btree_reserve(struct bch_fs *c)
34 {
35 	unsigned reserve = 16;
36 
37 	if (!c->btree_roots_known[0].b)
38 		reserve += 8;
39 
40 	for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
41 		struct btree_root *r = bch2_btree_id_root(c, i);
42 
43 		if (r->b)
44 			reserve += min_t(unsigned, 1, r->b->c.level) * 8;
45 	}
46 
47 	c->btree_cache.nr_reserve = reserve;
48 }
49 
50 static inline size_t btree_cache_can_free(struct btree_cache_list *list)
51 {
52 	struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
53 
54 	size_t can_free = list->nr;
55 	if (!list->idx)
56 		can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve);
57 	return can_free;
58 }
59 
60 static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
61 {
62 	if (b->c.lock.readers)
63 		list_move(&b->list, &bc->freed_pcpu);
64 	else
65 		list_move(&b->list, &bc->freed_nonpcpu);
66 }
67 
68 static void btree_node_data_free(struct bch_fs *c, struct btree *b)
69 {
70 	struct btree_cache *bc = &c->btree_cache;
71 
72 	BUG_ON(btree_node_hashed(b));
73 
74 	/*
75 	 * This should really be done in slub/vmalloc, but we're using the
76 	 * kmalloc_large() path, so we're working around a slub bug by doing
77 	 * this here:
78 	 */
79 	if (b->data)
80 		mm_account_reclaimed_pages(btree_buf_bytes(b) / PAGE_SIZE);
81 	if (b->aux_data)
82 		mm_account_reclaimed_pages(btree_aux_data_bytes(b) / PAGE_SIZE);
83 
84 	EBUG_ON(btree_node_write_in_flight(b));
85 
86 	clear_btree_node_just_written(b);
87 
88 	kvfree(b->data);
89 	b->data = NULL;
90 #ifdef __KERNEL__
91 	kvfree(b->aux_data);
92 #else
93 	munmap(b->aux_data, btree_aux_data_bytes(b));
94 #endif
95 	b->aux_data = NULL;
96 
97 	bc->nr_freeable--;
98 
99 	btree_node_to_freedlist(bc, b);
100 }
101 
102 static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
103 				   const void *obj)
104 {
105 	const struct btree *b = obj;
106 	const u64 *v = arg->key;
107 
108 	return b->hash_val == *v ? 0 : 1;
109 }
110 
111 static const struct rhashtable_params bch_btree_cache_params = {
112 	.head_offset		= offsetof(struct btree, hash),
113 	.key_offset		= offsetof(struct btree, hash_val),
114 	.key_len		= sizeof(u64),
115 	.obj_cmpfn		= bch2_btree_cache_cmp_fn,
116 	.automatic_shrinking	= true,
117 };
118 
119 static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
120 {
121 	BUG_ON(b->data || b->aux_data);
122 
123 	gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;
124 
125 	b->data = kvmalloc(btree_buf_bytes(b), gfp);
126 	if (!b->data)
127 		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
128 #ifdef __KERNEL__
129 	b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
130 #else
131 	b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
132 			   PROT_READ|PROT_WRITE|PROT_EXEC,
133 			   MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
134 	if (b->aux_data == MAP_FAILED)
135 		b->aux_data = NULL;
136 #endif
137 	if (!b->aux_data) {
138 		kvfree(b->data);
139 		b->data = NULL;
140 		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
141 	}
142 
143 	return 0;
144 }
145 
146 static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
147 {
148 	struct btree *b;
149 
150 	b = kzalloc(sizeof(struct btree), gfp);
151 	if (!b)
152 		return NULL;
153 
154 	bkey_btree_ptr_init(&b->key);
155 	INIT_LIST_HEAD(&b->list);
156 	INIT_LIST_HEAD(&b->write_blocked);
157 	b->byte_order = ilog2(c->opts.btree_node_size);
158 	return b;
159 }
160 
161 struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
162 {
163 	struct btree_cache *bc = &c->btree_cache;
164 	struct btree *b;
165 
166 	b = __btree_node_mem_alloc(c, GFP_KERNEL);
167 	if (!b)
168 		return NULL;
169 
170 	if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
171 		kfree(b);
172 		return NULL;
173 	}
174 
175 	bch2_btree_lock_init(&b->c, 0);
176 
177 	bc->nr_freeable++;
178 	list_add(&b->list, &bc->freeable);
179 	return b;
180 }
181 
182 void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
183 {
184 	mutex_lock(&c->btree_cache.lock);
185 	list_move(&b->list, &c->btree_cache.freeable);
186 	mutex_unlock(&c->btree_cache.lock);
187 
188 	six_unlock_write(&b->c.lock);
189 	six_unlock_intent(&b->c.lock);
190 }
191 
192 static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b)
193 {
194 	struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
195 
196 	u64 mask = bc->pinned_nodes_mask[!!b->c.level];
197 
198 	return ((mask & BIT_ULL(b->c.btree_id)) &&
199 		bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
200 		bbpos_cmp(bc->pinned_nodes_end, pos) >= 0);
201 }
202 
203 void bch2_node_pin(struct bch_fs *c, struct btree *b)
204 {
205 	struct btree_cache *bc = &c->btree_cache;
206 
207 	mutex_lock(&bc->lock);
208 	BUG_ON(!__btree_node_pinned(bc, b));
209 	if (b != btree_node_root(c, b) && !btree_node_pinned(b)) {
210 		set_btree_node_pinned(b);
211 		list_move(&b->list, &bc->live[1].list);
212 		bc->live[0].nr--;
213 		bc->live[1].nr++;
214 	}
215 	mutex_unlock(&bc->lock);
216 }
217 
218 void bch2_btree_cache_unpin(struct bch_fs *c)
219 {
220 	struct btree_cache *bc = &c->btree_cache;
221 	struct btree *b, *n;
222 
223 	mutex_lock(&bc->lock);
224 	c->btree_cache.pinned_nodes_mask[0] = 0;
225 	c->btree_cache.pinned_nodes_mask[1] = 0;
226 
227 	list_for_each_entry_safe(b, n, &bc->live[1].list, list) {
228 		clear_btree_node_pinned(b);
229 		list_move(&b->list, &bc->live[0].list);
230 		bc->live[0].nr++;
231 		bc->live[1].nr--;
232 	}
233 
234 	mutex_unlock(&bc->lock);
235 }
236 
237 /* Btree in memory cache - hash table */
238 
239 void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
240 {
241 	lockdep_assert_held(&bc->lock);
242 	int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
243 
244 	BUG_ON(ret);
245 
246 	/* Cause future lookups for this node to fail: */
247 	b->hash_val = 0;
248 
249 	if (b->c.btree_id < BTREE_ID_NR)
250 		--bc->nr_by_btree[b->c.btree_id];
251 
252 	bc->live[btree_node_pinned(b)].nr--;
253 	bc->nr_freeable++;
254 	list_move(&b->list, &bc->freeable);
255 }
256 
257 int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
258 {
259 	BUG_ON(b->hash_val);
260 	b->hash_val = btree_ptr_hash_val(&b->key);
261 
262 	int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash,
263 						bch_btree_cache_params);
264 	if (ret)
265 		return ret;
266 
267 	if (b->c.btree_id < BTREE_ID_NR)
268 		bc->nr_by_btree[b->c.btree_id]++;
269 
270 	bool p = __btree_node_pinned(bc, b);
271 	mod_bit(BTREE_NODE_pinned, &b->flags, p);
272 
273 	list_move_tail(&b->list, &bc->live[p].list);
274 	bc->live[p].nr++;
275 
276 	bc->nr_freeable--;
277 	return 0;
278 }
279 
280 int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
281 				unsigned level, enum btree_id id)
282 {
283 	b->c.level	= level;
284 	b->c.btree_id	= id;
285 
286 	mutex_lock(&bc->lock);
287 	int ret = __bch2_btree_node_hash_insert(bc, b);
288 	mutex_unlock(&bc->lock);
289 
290 	return ret;
291 }
292 
293 void bch2_btree_node_update_key_early(struct btree_trans *trans,
294 				      enum btree_id btree, unsigned level,
295 				      struct bkey_s_c old, struct bkey_i *new)
296 {
297 	struct bch_fs *c = trans->c;
298 	struct btree *b;
299 	struct bkey_buf tmp;
300 	int ret;
301 
302 	bch2_bkey_buf_init(&tmp);
303 	bch2_bkey_buf_reassemble(&tmp, c, old);
304 
305 	b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
306 	if (!IS_ERR_OR_NULL(b)) {
307 		mutex_lock(&c->btree_cache.lock);
308 
309 		bch2_btree_node_hash_remove(&c->btree_cache, b);
310 
311 		bkey_copy(&b->key, new);
312 		ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
313 		BUG_ON(ret);
314 
315 		mutex_unlock(&c->btree_cache.lock);
316 		six_unlock_read(&b->c.lock);
317 	}
318 
319 	bch2_bkey_buf_exit(&tmp, c);
320 }
321 
322 __flatten
323 static inline struct btree *btree_cache_find(struct btree_cache *bc,
324 				     const struct bkey_i *k)
325 {
326 	u64 v = btree_ptr_hash_val(k);
327 
328 	return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
329 }
330 
331 /*
332  * this version is for btree nodes that have already been freed (we're not
333  * reaping a real btree node)
334  */
335 static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
336 {
337 	struct btree_cache *bc = &c->btree_cache;
338 	int ret = 0;
339 
340 	lockdep_assert_held(&bc->lock);
341 wait_on_io:
342 	if (b->flags & ((1U << BTREE_NODE_dirty)|
343 			(1U << BTREE_NODE_read_in_flight)|
344 			(1U << BTREE_NODE_write_in_flight))) {
345 		if (!flush) {
346 			if (btree_node_dirty(b))
347 				BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
348 			else if (btree_node_read_in_flight(b))
349 				BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
350 			else if (btree_node_write_in_flight(b))
351 				BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
352 			return -BCH_ERR_ENOMEM_btree_node_reclaim;
353 		}
354 
355 		/* XXX: waiting on IO with btree cache lock held */
356 		bch2_btree_node_wait_on_read(b);
357 		bch2_btree_node_wait_on_write(b);
358 	}
359 
360 	if (!six_trylock_intent(&b->c.lock)) {
361 		BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
362 		return -BCH_ERR_ENOMEM_btree_node_reclaim;
363 	}
364 
365 	if (!six_trylock_write(&b->c.lock)) {
366 		BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
367 		goto out_unlock_intent;
368 	}
369 
370 	/* recheck under lock */
371 	if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
372 			(1U << BTREE_NODE_write_in_flight))) {
373 		if (!flush) {
374 			if (btree_node_read_in_flight(b))
375 				BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
376 			else if (btree_node_write_in_flight(b))
377 				BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
378 			goto out_unlock;
379 		}
380 		six_unlock_write(&b->c.lock);
381 		six_unlock_intent(&b->c.lock);
382 		goto wait_on_io;
383 	}
384 
385 	if (btree_node_noevict(b)) {
386 		BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
387 		goto out_unlock;
388 	}
389 	if (btree_node_write_blocked(b)) {
390 		BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
391 		goto out_unlock;
392 	}
393 	if (btree_node_will_make_reachable(b)) {
394 		BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
395 		goto out_unlock;
396 	}
397 
398 	if (btree_node_dirty(b)) {
399 		if (!flush) {
400 			BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
401 			goto out_unlock;
402 		}
403 		/*
404 		 * Using the underscore version because we don't want to compact
405 		 * bsets after the write, since this node is about to be evicted
406 		 * - unless btree verify mode is enabled, since it runs out of
407 		 * the post write cleanup:
408 		 */
409 		if (bch2_verify_btree_ondisk)
410 			bch2_btree_node_write(c, b, SIX_LOCK_intent,
411 					      BTREE_WRITE_cache_reclaim);
412 		else
413 			__bch2_btree_node_write(c, b,
414 						BTREE_WRITE_cache_reclaim);
415 
416 		six_unlock_write(&b->c.lock);
417 		six_unlock_intent(&b->c.lock);
418 		goto wait_on_io;
419 	}
420 out:
421 	if (b->hash_val && !ret)
422 		trace_and_count(c, btree_cache_reap, c, b);
423 	return ret;
424 out_unlock:
425 	six_unlock_write(&b->c.lock);
426 out_unlock_intent:
427 	six_unlock_intent(&b->c.lock);
428 	ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
429 	goto out;
430 }
431 
432 static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
433 {
434 	return __btree_node_reclaim(c, b, false, shrinker_counter);
435 }
436 
437 static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
438 {
439 	return __btree_node_reclaim(c, b, true, false);
440 }
441 
442 static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
443 					   struct shrink_control *sc)
444 {
445 	struct btree_cache_list *list = shrink->private_data;
446 	struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
447 	struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
448 	struct btree *b, *t;
449 	unsigned long nr = sc->nr_to_scan;
450 	unsigned long can_free = 0;
451 	unsigned long freed = 0;
452 	unsigned long touched = 0;
453 	unsigned i, flags;
454 	unsigned long ret = SHRINK_STOP;
455 	bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4;
456 
457 	if (bch2_btree_shrinker_disabled)
458 		return SHRINK_STOP;
459 
460 	mutex_lock(&bc->lock);
461 	flags = memalloc_nofs_save();
462 
463 	/*
464 	 * It's _really_ critical that we don't free too many btree nodes - we
465 	 * have to always leave ourselves a reserve. The reserve is how we
466 	 * guarantee that allocating memory for a new btree node can always
467 	 * succeed, so that inserting keys into the btree can always succeed and
468 	 * IO can always make forward progress:
469 	 */
470 	can_free = btree_cache_can_free(list);
471 	nr = min_t(unsigned long, nr, can_free);
472 
473 	i = 0;
474 	list_for_each_entry_safe(b, t, &bc->freeable, list) {
475 		/*
476 		 * Leave a few nodes on the freeable list, so that a btree split
477 		 * won't have to hit the system allocator:
478 		 */
479 		if (++i <= 3)
480 			continue;
481 
482 		touched++;
483 
484 		if (touched >= nr)
485 			goto out;
486 
487 		if (!btree_node_reclaim(c, b, true)) {
488 			btree_node_data_free(c, b);
489 			six_unlock_write(&b->c.lock);
490 			six_unlock_intent(&b->c.lock);
491 			freed++;
492 			bc->nr_freed++;
493 		}
494 	}
495 restart:
496 	list_for_each_entry_safe(b, t, &list->list, list) {
497 		touched++;
498 
499 		if (btree_node_accessed(b)) {
500 			clear_btree_node_accessed(b);
501 			bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++;
502 			--touched;;
503 		} else if (!btree_node_reclaim(c, b, true)) {
504 			bch2_btree_node_hash_remove(bc, b);
505 
506 			freed++;
507 			btree_node_data_free(c, b);
508 			bc->nr_freed++;
509 
510 			six_unlock_write(&b->c.lock);
511 			six_unlock_intent(&b->c.lock);
512 
513 			if (freed == nr)
514 				goto out_rotate;
515 		} else if (trigger_writes &&
516 			   btree_node_dirty(b) &&
517 			   !btree_node_will_make_reachable(b) &&
518 			   !btree_node_write_blocked(b) &&
519 			   six_trylock_read(&b->c.lock)) {
520 			list_move(&list->list, &b->list);
521 			mutex_unlock(&bc->lock);
522 			__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
523 			six_unlock_read(&b->c.lock);
524 			if (touched >= nr)
525 				goto out_nounlock;
526 			mutex_lock(&bc->lock);
527 			goto restart;
528 		}
529 
530 		if (touched >= nr)
531 			break;
532 	}
533 out_rotate:
534 	if (&t->list != &list->list)
535 		list_move_tail(&list->list, &t->list);
536 out:
537 	mutex_unlock(&bc->lock);
538 out_nounlock:
539 	ret = freed;
540 	memalloc_nofs_restore(flags);
541 	trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
542 	return ret;
543 }
544 
545 static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
546 					    struct shrink_control *sc)
547 {
548 	struct btree_cache_list *list = shrink->private_data;
549 
550 	if (bch2_btree_shrinker_disabled)
551 		return 0;
552 
553 	return btree_cache_can_free(list);
554 }
555 
556 void bch2_fs_btree_cache_exit(struct bch_fs *c)
557 {
558 	struct btree_cache *bc = &c->btree_cache;
559 	struct btree *b, *t;
560 	unsigned long flags;
561 
562 	shrinker_free(bc->live[1].shrink);
563 	shrinker_free(bc->live[0].shrink);
564 
565 	/* vfree() can allocate memory: */
566 	flags = memalloc_nofs_save();
567 	mutex_lock(&bc->lock);
568 
569 	if (c->verify_data)
570 		list_move(&c->verify_data->list, &bc->live[0].list);
571 
572 	kvfree(c->verify_ondisk);
573 
574 	for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
575 		struct btree_root *r = bch2_btree_id_root(c, i);
576 
577 		if (r->b)
578 			list_add(&r->b->list, &bc->live[0].list);
579 	}
580 
581 	list_for_each_entry_safe(b, t, &bc->live[1].list, list)
582 		bch2_btree_node_hash_remove(bc, b);
583 	list_for_each_entry_safe(b, t, &bc->live[0].list, list)
584 		bch2_btree_node_hash_remove(bc, b);
585 
586 	list_for_each_entry_safe(b, t, &bc->freeable, list) {
587 		BUG_ON(btree_node_read_in_flight(b) ||
588 		       btree_node_write_in_flight(b));
589 
590 		btree_node_data_free(c, b);
591 	}
592 
593 	BUG_ON(!bch2_journal_error(&c->journal) &&
594 	       atomic_long_read(&c->btree_cache.nr_dirty));
595 
596 	list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
597 
598 	list_for_each_entry_safe(b, t, &bc->freed_nonpcpu, list) {
599 		list_del(&b->list);
600 		six_lock_exit(&b->c.lock);
601 		kfree(b);
602 	}
603 
604 	mutex_unlock(&bc->lock);
605 	memalloc_nofs_restore(flags);
606 
607 	for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
608 		BUG_ON(bc->nr_by_btree[i]);
609 	BUG_ON(bc->live[0].nr);
610 	BUG_ON(bc->live[1].nr);
611 	BUG_ON(bc->nr_freeable);
612 
613 	if (bc->table_init_done)
614 		rhashtable_destroy(&bc->table);
615 }
616 
617 int bch2_fs_btree_cache_init(struct bch_fs *c)
618 {
619 	struct btree_cache *bc = &c->btree_cache;
620 	struct shrinker *shrink;
621 	unsigned i;
622 	int ret = 0;
623 
624 	ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
625 	if (ret)
626 		goto err;
627 
628 	bc->table_init_done = true;
629 
630 	bch2_recalc_btree_reserve(c);
631 
632 	for (i = 0; i < bc->nr_reserve; i++)
633 		if (!__bch2_btree_node_mem_alloc(c))
634 			goto err;
635 
636 	list_splice_init(&bc->live[0].list, &bc->freeable);
637 
638 	mutex_init(&c->verify_lock);
639 
640 	shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
641 	if (!shrink)
642 		goto err;
643 	bc->live[0].shrink	= shrink;
644 	shrink->count_objects	= bch2_btree_cache_count;
645 	shrink->scan_objects	= bch2_btree_cache_scan;
646 	shrink->seeks		= 2;
647 	shrink->private_data	= &bc->live[0];
648 	shrinker_register(shrink);
649 
650 	shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name);
651 	if (!shrink)
652 		goto err;
653 	bc->live[1].shrink	= shrink;
654 	shrink->count_objects	= bch2_btree_cache_count;
655 	shrink->scan_objects	= bch2_btree_cache_scan;
656 	shrink->seeks		= 8;
657 	shrink->private_data	= &bc->live[1];
658 	shrinker_register(shrink);
659 
660 	return 0;
661 err:
662 	return -BCH_ERR_ENOMEM_fs_btree_cache_init;
663 }
664 
665 void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
666 {
667 	mutex_init(&bc->lock);
668 	for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) {
669 		bc->live[i].idx = i;
670 		INIT_LIST_HEAD(&bc->live[i].list);
671 	}
672 	INIT_LIST_HEAD(&bc->freeable);
673 	INIT_LIST_HEAD(&bc->freed_pcpu);
674 	INIT_LIST_HEAD(&bc->freed_nonpcpu);
675 }
676 
677 /*
678  * We can only have one thread cannibalizing other cached btree nodes at a time,
679  * or we'll deadlock. We use an open coded mutex to ensure that, which a
680  * cannibalize_bucket() will take. This means every time we unlock the root of
681  * the btree, we need to release this lock if we have it held.
682  */
683 void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans)
684 {
685 	struct bch_fs *c = trans->c;
686 	struct btree_cache *bc = &c->btree_cache;
687 
688 	if (bc->alloc_lock == current) {
689 		trace_and_count(c, btree_cache_cannibalize_unlock, trans);
690 		bc->alloc_lock = NULL;
691 		closure_wake_up(&bc->alloc_wait);
692 	}
693 }
694 
695 int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl)
696 {
697 	struct bch_fs *c = trans->c;
698 	struct btree_cache *bc = &c->btree_cache;
699 	struct task_struct *old;
700 
701 	old = NULL;
702 	if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current)
703 		goto success;
704 
705 	if (!cl) {
706 		trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
707 		return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
708 	}
709 
710 	closure_wait(&bc->alloc_wait, cl);
711 
712 	/* Try again, after adding ourselves to waitlist */
713 	old = NULL;
714 	if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) {
715 		/* We raced */
716 		closure_wake_up(&bc->alloc_wait);
717 		goto success;
718 	}
719 
720 	trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
721 	return -BCH_ERR_btree_cache_cannibalize_lock_blocked;
722 
723 success:
724 	trace_and_count(c, btree_cache_cannibalize_lock, trans);
725 	return 0;
726 }
727 
728 static struct btree *btree_node_cannibalize(struct bch_fs *c)
729 {
730 	struct btree_cache *bc = &c->btree_cache;
731 	struct btree *b;
732 
733 	for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
734 		list_for_each_entry_reverse(b, &bc->live[i].list, list)
735 			if (!btree_node_reclaim(c, b, false))
736 				return b;
737 
738 	while (1) {
739 		for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
740 			list_for_each_entry_reverse(b, &bc->live[i].list, list)
741 				if (!btree_node_write_and_reclaim(c, b))
742 					return b;
743 
744 		/*
745 		 * Rare case: all nodes were intent-locked.
746 		 * Just busy-wait.
747 		 */
748 		WARN_ONCE(1, "btree cache cannibalize failed\n");
749 		cond_resched();
750 	}
751 }
752 
753 struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
754 {
755 	struct bch_fs *c = trans->c;
756 	struct btree_cache *bc = &c->btree_cache;
757 	struct list_head *freed = pcpu_read_locks
758 		? &bc->freed_pcpu
759 		: &bc->freed_nonpcpu;
760 	struct btree *b, *b2;
761 	u64 start_time = local_clock();
762 
763 	mutex_lock(&bc->lock);
764 
765 	/*
766 	 * We never free struct btree itself, just the memory that holds the on
767 	 * disk node. Check the freed list before allocating a new one:
768 	 */
769 	list_for_each_entry(b, freed, list)
770 		if (!btree_node_reclaim(c, b, false)) {
771 			list_del_init(&b->list);
772 			goto got_node;
773 		}
774 
775 	b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
776 	if (!b) {
777 		mutex_unlock(&bc->lock);
778 		bch2_trans_unlock(trans);
779 		b = __btree_node_mem_alloc(c, GFP_KERNEL);
780 		if (!b)
781 			goto err;
782 		mutex_lock(&bc->lock);
783 	}
784 
785 	bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);
786 
787 	BUG_ON(!six_trylock_intent(&b->c.lock));
788 	BUG_ON(!six_trylock_write(&b->c.lock));
789 got_node:
790 
791 	/*
792 	 * btree_free() doesn't free memory; it sticks the node on the end of
793 	 * the list. Check if there's any freed nodes there:
794 	 */
795 	list_for_each_entry(b2, &bc->freeable, list)
796 		if (!btree_node_reclaim(c, b2, false)) {
797 			swap(b->data, b2->data);
798 			swap(b->aux_data, b2->aux_data);
799 			btree_node_to_freedlist(bc, b2);
800 			six_unlock_write(&b2->c.lock);
801 			six_unlock_intent(&b2->c.lock);
802 			goto got_mem;
803 		}
804 
805 	mutex_unlock(&bc->lock);
806 
807 	if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
808 		bch2_trans_unlock(trans);
809 		if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
810 			goto err;
811 	}
812 
813 	mutex_lock(&bc->lock);
814 	bc->nr_freeable++;
815 got_mem:
816 	mutex_unlock(&bc->lock);
817 
818 	BUG_ON(btree_node_hashed(b));
819 	BUG_ON(btree_node_dirty(b));
820 	BUG_ON(btree_node_write_in_flight(b));
821 out:
822 	b->flags		= 0;
823 	b->written		= 0;
824 	b->nsets		= 0;
825 	b->sib_u64s[0]		= 0;
826 	b->sib_u64s[1]		= 0;
827 	b->whiteout_u64s	= 0;
828 	bch2_btree_keys_init(b);
829 	set_btree_node_accessed(b);
830 
831 	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
832 			       start_time);
833 
834 	int ret = bch2_trans_relock(trans);
835 	if (unlikely(ret)) {
836 		bch2_btree_node_to_freelist(c, b);
837 		return ERR_PTR(ret);
838 	}
839 
840 	return b;
841 err:
842 	mutex_lock(&bc->lock);
843 
844 	/* Try to cannibalize another cached btree node: */
845 	if (bc->alloc_lock == current) {
846 		b2 = btree_node_cannibalize(c);
847 		clear_btree_node_just_written(b2);
848 		bch2_btree_node_hash_remove(bc, b2);
849 
850 		if (b) {
851 			swap(b->data, b2->data);
852 			swap(b->aux_data, b2->aux_data);
853 			btree_node_to_freedlist(bc, b2);
854 			six_unlock_write(&b2->c.lock);
855 			six_unlock_intent(&b2->c.lock);
856 		} else {
857 			b = b2;
858 			list_del_init(&b->list);
859 		}
860 
861 		mutex_unlock(&bc->lock);
862 
863 		trace_and_count(c, btree_cache_cannibalize, trans);
864 		goto out;
865 	}
866 
867 	mutex_unlock(&bc->lock);
868 	return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
869 }
870 
871 /* Slowpath, don't want it inlined into btree_iter_traverse() */
872 static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
873 				struct btree_path *path,
874 				const struct bkey_i *k,
875 				enum btree_id btree_id,
876 				unsigned level,
877 				enum six_lock_type lock_type,
878 				bool sync)
879 {
880 	struct bch_fs *c = trans->c;
881 	struct btree_cache *bc = &c->btree_cache;
882 	struct btree *b;
883 
884 	if (unlikely(level >= BTREE_MAX_DEPTH)) {
885 		int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u",
886 						 level, BTREE_MAX_DEPTH);
887 		return ERR_PTR(ret);
888 	}
889 
890 	if (unlikely(!bkey_is_btree_ptr(&k->k))) {
891 		struct printbuf buf = PRINTBUF;
892 		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
893 
894 		int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf);
895 		printbuf_exit(&buf);
896 		return ERR_PTR(ret);
897 	}
898 
899 	if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) {
900 		struct printbuf buf = PRINTBUF;
901 		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
902 
903 		int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf);
904 		printbuf_exit(&buf);
905 		return ERR_PTR(ret);
906 	}
907 
908 	/*
909 	 * Parent node must be locked, else we could read in a btree node that's
910 	 * been freed:
911 	 */
912 	if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
913 		trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
914 		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
915 	}
916 
917 	b = bch2_btree_node_mem_alloc(trans, level != 0);
918 
919 	if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
920 		if (!path)
921 			return b;
922 
923 		trans->memory_allocation_failure = true;
924 		trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
925 		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
926 	}
927 
928 	if (IS_ERR(b))
929 		return b;
930 
931 	bkey_copy(&b->key, k);
932 	if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
933 		/* raced with another fill: */
934 
935 		/* mark as unhashed... */
936 		b->hash_val = 0;
937 
938 		mutex_lock(&bc->lock);
939 		list_add(&b->list, &bc->freeable);
940 		mutex_unlock(&bc->lock);
941 
942 		six_unlock_write(&b->c.lock);
943 		six_unlock_intent(&b->c.lock);
944 		return NULL;
945 	}
946 
947 	set_btree_node_read_in_flight(b);
948 	six_unlock_write(&b->c.lock);
949 
950 	if (path) {
951 		u32 seq = six_lock_seq(&b->c.lock);
952 
953 		/* Unlock before doing IO: */
954 		six_unlock_intent(&b->c.lock);
955 		bch2_trans_unlock_noassert(trans);
956 
957 		bch2_btree_node_read(trans, b, sync);
958 
959 		int ret = bch2_trans_relock(trans);
960 		if (ret)
961 			return ERR_PTR(ret);
962 
963 		if (!sync)
964 			return NULL;
965 
966 		if (!six_relock_type(&b->c.lock, lock_type, seq))
967 			b = NULL;
968 	} else {
969 		bch2_btree_node_read(trans, b, sync);
970 		if (lock_type == SIX_LOCK_read)
971 			six_lock_downgrade(&b->c.lock);
972 	}
973 
974 	return b;
975 }
976 
977 static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
978 {
979 	struct printbuf buf = PRINTBUF;
980 
981 	if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
982 		return;
983 
984 	prt_printf(&buf,
985 	       "btree node header doesn't match ptr\n"
986 	       "btree %s level %u\n"
987 	       "ptr: ",
988 	       bch2_btree_id_str(b->c.btree_id), b->c.level);
989 	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
990 
991 	prt_printf(&buf, "\nheader: btree %s level %llu\n"
992 	       "min ",
993 	       bch2_btree_id_str(BTREE_NODE_ID(b->data)),
994 	       BTREE_NODE_LEVEL(b->data));
995 	bch2_bpos_to_text(&buf, b->data->min_key);
996 
997 	prt_printf(&buf, "\nmax ");
998 	bch2_bpos_to_text(&buf, b->data->max_key);
999 
1000 	bch2_fs_topology_error(c, "%s", buf.buf);
1001 
1002 	printbuf_exit(&buf);
1003 }
1004 
1005 static inline void btree_check_header(struct bch_fs *c, struct btree *b)
1006 {
1007 	if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
1008 	    b->c.level != BTREE_NODE_LEVEL(b->data) ||
1009 	    !bpos_eq(b->data->max_key, b->key.k.p) ||
1010 	    (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
1011 	     !bpos_eq(b->data->min_key,
1012 		      bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
1013 		btree_bad_header(c, b);
1014 }
1015 
1016 static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
1017 					   const struct bkey_i *k, unsigned level,
1018 					   enum six_lock_type lock_type,
1019 					   unsigned long trace_ip)
1020 {
1021 	struct bch_fs *c = trans->c;
1022 	struct btree_cache *bc = &c->btree_cache;
1023 	struct btree *b;
1024 	bool need_relock = false;
1025 	int ret;
1026 
1027 	EBUG_ON(level >= BTREE_MAX_DEPTH);
1028 retry:
1029 	b = btree_cache_find(bc, k);
1030 	if (unlikely(!b)) {
1031 		/*
1032 		 * We must have the parent locked to call bch2_btree_node_fill(),
1033 		 * else we could read in a btree node from disk that's been
1034 		 * freed:
1035 		 */
1036 		b = bch2_btree_node_fill(trans, path, k, path->btree_id,
1037 					 level, lock_type, true);
1038 		need_relock = true;
1039 
1040 		/* We raced and found the btree node in the cache */
1041 		if (!b)
1042 			goto retry;
1043 
1044 		if (IS_ERR(b))
1045 			return b;
1046 	} else {
1047 		if (btree_node_read_locked(path, level + 1))
1048 			btree_node_unlock(trans, path, level + 1);
1049 
1050 		ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
1051 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1052 			return ERR_PTR(ret);
1053 
1054 		BUG_ON(ret);
1055 
1056 		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
1057 			     b->c.level != level ||
1058 			     race_fault())) {
1059 			six_unlock_type(&b->c.lock, lock_type);
1060 			if (bch2_btree_node_relock(trans, path, level + 1))
1061 				goto retry;
1062 
1063 			trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
1064 			return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
1065 		}
1066 
1067 		/* avoid atomic set bit if it's not needed: */
1068 		if (!btree_node_accessed(b))
1069 			set_btree_node_accessed(b);
1070 	}
1071 
1072 	if (unlikely(btree_node_read_in_flight(b))) {
1073 		u32 seq = six_lock_seq(&b->c.lock);
1074 
1075 		six_unlock_type(&b->c.lock, lock_type);
1076 		bch2_trans_unlock(trans);
1077 		need_relock = true;
1078 
1079 		bch2_btree_node_wait_on_read(b);
1080 
1081 		ret = bch2_trans_relock(trans);
1082 		if (ret)
1083 			return ERR_PTR(ret);
1084 
1085 		/*
1086 		 * should_be_locked is not set on this path yet, so we need to
1087 		 * relock it specifically:
1088 		 */
1089 		if (!six_relock_type(&b->c.lock, lock_type, seq))
1090 			goto retry;
1091 	}
1092 
1093 	if (unlikely(need_relock)) {
1094 		ret = bch2_trans_relock(trans) ?:
1095 			bch2_btree_path_relock_intent(trans, path);
1096 		if (ret) {
1097 			six_unlock_type(&b->c.lock, lock_type);
1098 			return ERR_PTR(ret);
1099 		}
1100 	}
1101 
1102 	prefetch(b->aux_data);
1103 
1104 	for_each_bset(b, t) {
1105 		void *p = (u64 *) b->aux_data + t->aux_data_offset;
1106 
1107 		prefetch(p + L1_CACHE_BYTES * 0);
1108 		prefetch(p + L1_CACHE_BYTES * 1);
1109 		prefetch(p + L1_CACHE_BYTES * 2);
1110 	}
1111 
1112 	if (unlikely(btree_node_read_error(b))) {
1113 		six_unlock_type(&b->c.lock, lock_type);
1114 		return ERR_PTR(-BCH_ERR_btree_node_read_error);
1115 	}
1116 
1117 	EBUG_ON(b->c.btree_id != path->btree_id);
1118 	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
1119 	btree_check_header(c, b);
1120 
1121 	return b;
1122 }
1123 
1124 /**
1125  * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
1126  * in from disk if necessary.
1127  *
1128  * @trans:	btree transaction object
1129  * @path:	btree_path being traversed
1130  * @k:		pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
1131  * @level:	level of btree node being looked up (0 == leaf node)
1132  * @lock_type:	SIX_LOCK_read or SIX_LOCK_intent
1133  * @trace_ip:	ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
1134  *
1135  * The btree node will have either a read or a write lock held, depending on
1136  * the @write parameter.
1137  *
1138  * Returns: btree node or ERR_PTR()
1139  */
1140 struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
1141 				  const struct bkey_i *k, unsigned level,
1142 				  enum six_lock_type lock_type,
1143 				  unsigned long trace_ip)
1144 {
1145 	struct bch_fs *c = trans->c;
1146 	struct btree *b;
1147 	int ret;
1148 
1149 	EBUG_ON(level >= BTREE_MAX_DEPTH);
1150 
1151 	b = btree_node_mem_ptr(k);
1152 
1153 	/*
1154 	 * Check b->hash_val _before_ calling btree_node_lock() - this might not
1155 	 * be the node we want anymore, and trying to lock the wrong node could
1156 	 * cause an unneccessary transaction restart:
1157 	 */
1158 	if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
1159 		     !b ||
1160 		     b->hash_val != btree_ptr_hash_val(k)))
1161 		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
1162 
1163 	if (btree_node_read_locked(path, level + 1))
1164 		btree_node_unlock(trans, path, level + 1);
1165 
1166 	ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
1167 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1168 		return ERR_PTR(ret);
1169 
1170 	BUG_ON(ret);
1171 
1172 	if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
1173 		     b->c.level != level ||
1174 		     race_fault())) {
1175 		six_unlock_type(&b->c.lock, lock_type);
1176 		if (bch2_btree_node_relock(trans, path, level + 1))
1177 			return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
1178 
1179 		trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
1180 		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
1181 	}
1182 
1183 	if (unlikely(btree_node_read_in_flight(b))) {
1184 		six_unlock_type(&b->c.lock, lock_type);
1185 		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
1186 	}
1187 
1188 	prefetch(b->aux_data);
1189 
1190 	for_each_bset(b, t) {
1191 		void *p = (u64 *) b->aux_data + t->aux_data_offset;
1192 
1193 		prefetch(p + L1_CACHE_BYTES * 0);
1194 		prefetch(p + L1_CACHE_BYTES * 1);
1195 		prefetch(p + L1_CACHE_BYTES * 2);
1196 	}
1197 
1198 	/* avoid atomic set bit if it's not needed: */
1199 	if (!btree_node_accessed(b))
1200 		set_btree_node_accessed(b);
1201 
1202 	if (unlikely(btree_node_read_error(b))) {
1203 		six_unlock_type(&b->c.lock, lock_type);
1204 		return ERR_PTR(-BCH_ERR_btree_node_read_error);
1205 	}
1206 
1207 	EBUG_ON(b->c.btree_id != path->btree_id);
1208 	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
1209 	btree_check_header(c, b);
1210 
1211 	return b;
1212 }
1213 
1214 struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
1215 					 const struct bkey_i *k,
1216 					 enum btree_id btree_id,
1217 					 unsigned level,
1218 					 bool nofill)
1219 {
1220 	struct bch_fs *c = trans->c;
1221 	struct btree_cache *bc = &c->btree_cache;
1222 	struct btree *b;
1223 	int ret;
1224 
1225 	EBUG_ON(level >= BTREE_MAX_DEPTH);
1226 
1227 	if (c->opts.btree_node_mem_ptr_optimization) {
1228 		b = btree_node_mem_ptr(k);
1229 		if (b)
1230 			goto lock_node;
1231 	}
1232 retry:
1233 	b = btree_cache_find(bc, k);
1234 	if (unlikely(!b)) {
1235 		if (nofill)
1236 			goto out;
1237 
1238 		b = bch2_btree_node_fill(trans, NULL, k, btree_id,
1239 					 level, SIX_LOCK_read, true);
1240 
1241 		/* We raced and found the btree node in the cache */
1242 		if (!b)
1243 			goto retry;
1244 
1245 		if (IS_ERR(b) &&
1246 		    !bch2_btree_cache_cannibalize_lock(trans, NULL))
1247 			goto retry;
1248 
1249 		if (IS_ERR(b))
1250 			goto out;
1251 	} else {
1252 lock_node:
1253 		ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
1254 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1255 			return ERR_PTR(ret);
1256 
1257 		BUG_ON(ret);
1258 
1259 		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
1260 			     b->c.btree_id != btree_id ||
1261 			     b->c.level != level)) {
1262 			six_unlock_read(&b->c.lock);
1263 			goto retry;
1264 		}
1265 	}
1266 
1267 	/* XXX: waiting on IO with btree locks held: */
1268 	__bch2_btree_node_wait_on_read(b);
1269 
1270 	prefetch(b->aux_data);
1271 
1272 	for_each_bset(b, t) {
1273 		void *p = (u64 *) b->aux_data + t->aux_data_offset;
1274 
1275 		prefetch(p + L1_CACHE_BYTES * 0);
1276 		prefetch(p + L1_CACHE_BYTES * 1);
1277 		prefetch(p + L1_CACHE_BYTES * 2);
1278 	}
1279 
1280 	/* avoid atomic set bit if it's not needed: */
1281 	if (!btree_node_accessed(b))
1282 		set_btree_node_accessed(b);
1283 
1284 	if (unlikely(btree_node_read_error(b))) {
1285 		six_unlock_read(&b->c.lock);
1286 		b = ERR_PTR(-BCH_ERR_btree_node_read_error);
1287 		goto out;
1288 	}
1289 
1290 	EBUG_ON(b->c.btree_id != btree_id);
1291 	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
1292 	btree_check_header(c, b);
1293 out:
1294 	bch2_btree_cache_cannibalize_unlock(trans);
1295 	return b;
1296 }
1297 
1298 int bch2_btree_node_prefetch(struct btree_trans *trans,
1299 			     struct btree_path *path,
1300 			     const struct bkey_i *k,
1301 			     enum btree_id btree_id, unsigned level)
1302 {
1303 	struct bch_fs *c = trans->c;
1304 	struct btree_cache *bc = &c->btree_cache;
1305 
1306 	BUG_ON(path && !btree_node_locked(path, level + 1));
1307 	BUG_ON(level >= BTREE_MAX_DEPTH);
1308 
1309 	struct btree *b = btree_cache_find(bc, k);
1310 	if (b)
1311 		return 0;
1312 
1313 	b = bch2_btree_node_fill(trans, path, k, btree_id,
1314 				 level, SIX_LOCK_read, false);
1315 	if (!IS_ERR_OR_NULL(b))
1316 		six_unlock_read(&b->c.lock);
1317 	return bch2_trans_relock(trans) ?: PTR_ERR_OR_ZERO(b);
1318 }
1319 
1320 void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
1321 {
1322 	struct bch_fs *c = trans->c;
1323 	struct btree_cache *bc = &c->btree_cache;
1324 	struct btree *b;
1325 
1326 	b = btree_cache_find(bc, k);
1327 	if (!b)
1328 		return;
1329 
1330 	BUG_ON(b == btree_node_root(trans->c, b));
1331 wait_on_io:
1332 	/* not allowed to wait on io with btree locks held: */
1333 
1334 	/* XXX we're called from btree_gc which will be holding other btree
1335 	 * nodes locked
1336 	 */
1337 	__bch2_btree_node_wait_on_read(b);
1338 	__bch2_btree_node_wait_on_write(b);
1339 
1340 	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
1341 	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
1342 	if (unlikely(b->hash_val != btree_ptr_hash_val(k)))
1343 		goto out;
1344 
1345 	if (btree_node_dirty(b)) {
1346 		__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
1347 		six_unlock_write(&b->c.lock);
1348 		six_unlock_intent(&b->c.lock);
1349 		goto wait_on_io;
1350 	}
1351 
1352 	BUG_ON(btree_node_dirty(b));
1353 
1354 	mutex_lock(&bc->lock);
1355 	bch2_btree_node_hash_remove(bc, b);
1356 	btree_node_data_free(c, b);
1357 	mutex_unlock(&bc->lock);
1358 out:
1359 	six_unlock_write(&b->c.lock);
1360 	six_unlock_intent(&b->c.lock);
1361 }
1362 
1363 const char *bch2_btree_id_str(enum btree_id btree)
1364 {
1365 	return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)";
1366 }
1367 
1368 void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree)
1369 {
1370 	if (btree < BTREE_ID_NR)
1371 		prt_str(out, __bch2_btree_ids[btree]);
1372 	else
1373 		prt_printf(out, "(unknown btree %u)", btree);
1374 }
1375 
1376 void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
1377 {
1378 	prt_printf(out, "%s level %u/%u\n  ",
1379 	       bch2_btree_id_str(b->c.btree_id),
1380 	       b->c.level,
1381 	       bch2_btree_id_root(c, b->c.btree_id)->level);
1382 	bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
1383 }
1384 
1385 void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
1386 {
1387 	struct bset_stats stats;
1388 
1389 	memset(&stats, 0, sizeof(stats));
1390 
1391 	bch2_btree_keys_stats(b, &stats);
1392 
1393 	prt_printf(out, "l %u ", b->c.level);
1394 	bch2_bpos_to_text(out, b->data->min_key);
1395 	prt_printf(out, " - ");
1396 	bch2_bpos_to_text(out, b->data->max_key);
1397 	prt_printf(out, ":\n"
1398 	       "    ptrs: ");
1399 	bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
1400 	prt_newline(out);
1401 
1402 	prt_printf(out,
1403 	       "    format: ");
1404 	bch2_bkey_format_to_text(out, &b->format);
1405 
1406 	prt_printf(out,
1407 	       "    unpack fn len: %u\n"
1408 	       "    bytes used %zu/%zu (%zu%% full)\n"
1409 	       "    sib u64s: %u, %u (merge threshold %u)\n"
1410 	       "    nr packed keys %u\n"
1411 	       "    nr unpacked keys %u\n"
1412 	       "    floats %zu\n"
1413 	       "    failed unpacked %zu\n",
1414 	       b->unpack_fn_len,
1415 	       b->nr.live_u64s * sizeof(u64),
1416 	       btree_buf_bytes(b) - sizeof(struct btree_node),
1417 	       b->nr.live_u64s * 100 / btree_max_u64s(c),
1418 	       b->sib_u64s[0],
1419 	       b->sib_u64s[1],
1420 	       c->btree_foreground_merge_threshold,
1421 	       b->nr.packed_keys,
1422 	       b->nr.unpacked_keys,
1423 	       stats.floats,
1424 	       stats.failed);
1425 }
1426 
1427 static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c,
1428 				 const char *label, size_t nr)
1429 {
1430 	prt_printf(out, "%s\t", label);
1431 	prt_human_readable_u64(out, nr * c->opts.btree_node_size);
1432 	prt_printf(out, " (%zu)\n", nr);
1433 }
1434 
1435 static const char * const bch2_btree_cache_not_freed_reasons_strs[] = {
1436 #define x(n) #n,
1437 	BCH_BTREE_CACHE_NOT_FREED_REASONS()
1438 #undef x
1439 	NULL
1440 };
1441 
1442 void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc)
1443 {
1444 	struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
1445 
1446 	if (!out->nr_tabstops)
1447 		printbuf_tabstop_push(out, 32);
1448 
1449 	prt_btree_cache_line(out, c, "live:",		bc->live[0].nr);
1450 	prt_btree_cache_line(out, c, "pinned:",		bc->live[1].nr);
1451 	prt_btree_cache_line(out, c, "freeable:",	bc->nr_freeable);
1452 	prt_btree_cache_line(out, c, "dirty:",		atomic_long_read(&bc->nr_dirty));
1453 	prt_printf(out, "cannibalize lock:\t%p\n",	bc->alloc_lock);
1454 	prt_newline(out);
1455 
1456 	for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
1457 		prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->nr_by_btree[i]);
1458 
1459 	prt_newline(out);
1460 	prt_printf(out, "freed:\t%zu\n", bc->nr_freed);
1461 	prt_printf(out, "not freed:\n");
1462 
1463 	for (unsigned i = 0; i < ARRAY_SIZE(bc->not_freed); i++)
1464 		prt_printf(out, "  %s\t%llu\n",
1465 			   bch2_btree_cache_not_freed_reasons_strs[i], bc->not_freed[i]);
1466 }
1467