xref: /linux/fs/bcachefs/btree_cache.c (revision 001821b0e79716c4e17c71d8e053a23599a7a508)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "bcachefs.h"
4 #include "bbpos.h"
5 #include "bkey_buf.h"
6 #include "btree_cache.h"
7 #include "btree_io.h"
8 #include "btree_iter.h"
9 #include "btree_locking.h"
10 #include "debug.h"
11 #include "errcode.h"
12 #include "error.h"
13 #include "journal.h"
14 #include "trace.h"
15 
16 #include <linux/prefetch.h>
17 #include <linux/sched/mm.h>
18 
19 #define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
20 do {						 \
21 	if (shrinker_counter)			 \
22 		bc->not_freed_##counter++;	 \
23 } while (0)
24 
25 const char * const bch2_btree_node_flags[] = {
26 #define x(f)	#f,
27 	BTREE_FLAGS()
28 #undef x
29 	NULL
30 };
31 
32 void bch2_recalc_btree_reserve(struct bch_fs *c)
33 {
34 	unsigned i, reserve = 16;
35 
36 	if (!c->btree_roots_known[0].b)
37 		reserve += 8;
38 
39 	for (i = 0; i < btree_id_nr_alive(c); i++) {
40 		struct btree_root *r = bch2_btree_id_root(c, i);
41 
42 		if (r->b)
43 			reserve += min_t(unsigned, 1, r->b->c.level) * 8;
44 	}
45 
46 	c->btree_cache.reserve = reserve;
47 }
48 
49 static inline unsigned btree_cache_can_free(struct btree_cache *bc)
50 {
51 	return max_t(int, 0, bc->used - bc->reserve);
52 }
53 
54 static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
55 {
56 	if (b->c.lock.readers)
57 		list_move(&b->list, &bc->freed_pcpu);
58 	else
59 		list_move(&b->list, &bc->freed_nonpcpu);
60 }
61 
62 static void btree_node_data_free(struct bch_fs *c, struct btree *b)
63 {
64 	struct btree_cache *bc = &c->btree_cache;
65 
66 	EBUG_ON(btree_node_write_in_flight(b));
67 
68 	clear_btree_node_just_written(b);
69 
70 	kvfree(b->data);
71 	b->data = NULL;
72 #ifdef __KERNEL__
73 	kvfree(b->aux_data);
74 #else
75 	munmap(b->aux_data, btree_aux_data_bytes(b));
76 #endif
77 	b->aux_data = NULL;
78 
79 	bc->used--;
80 
81 	btree_node_to_freedlist(bc, b);
82 }
83 
84 static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
85 				   const void *obj)
86 {
87 	const struct btree *b = obj;
88 	const u64 *v = arg->key;
89 
90 	return b->hash_val == *v ? 0 : 1;
91 }
92 
93 static const struct rhashtable_params bch_btree_cache_params = {
94 	.head_offset	= offsetof(struct btree, hash),
95 	.key_offset	= offsetof(struct btree, hash_val),
96 	.key_len	= sizeof(u64),
97 	.obj_cmpfn	= bch2_btree_cache_cmp_fn,
98 };
99 
100 static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
101 {
102 	BUG_ON(b->data || b->aux_data);
103 
104 	b->data = kvmalloc(btree_buf_bytes(b), gfp);
105 	if (!b->data)
106 		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
107 #ifdef __KERNEL__
108 	b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
109 #else
110 	b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
111 			   PROT_READ|PROT_WRITE|PROT_EXEC,
112 			   MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
113 	if (b->aux_data == MAP_FAILED)
114 		b->aux_data = NULL;
115 #endif
116 	if (!b->aux_data) {
117 		kvfree(b->data);
118 		b->data = NULL;
119 		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
120 	}
121 
122 	return 0;
123 }
124 
125 static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
126 {
127 	struct btree *b;
128 
129 	b = kzalloc(sizeof(struct btree), gfp);
130 	if (!b)
131 		return NULL;
132 
133 	bkey_btree_ptr_init(&b->key);
134 	INIT_LIST_HEAD(&b->list);
135 	INIT_LIST_HEAD(&b->write_blocked);
136 	b->byte_order = ilog2(c->opts.btree_node_size);
137 	return b;
138 }
139 
140 struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
141 {
142 	struct btree_cache *bc = &c->btree_cache;
143 	struct btree *b;
144 
145 	b = __btree_node_mem_alloc(c, GFP_KERNEL);
146 	if (!b)
147 		return NULL;
148 
149 	if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
150 		kfree(b);
151 		return NULL;
152 	}
153 
154 	bch2_btree_lock_init(&b->c, 0);
155 
156 	bc->used++;
157 	list_add(&b->list, &bc->freeable);
158 	return b;
159 }
160 
161 /* Btree in memory cache - hash table */
162 
163 void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
164 {
165 	int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
166 
167 	BUG_ON(ret);
168 
169 	/* Cause future lookups for this node to fail: */
170 	b->hash_val = 0;
171 
172 	if (b->c.btree_id < BTREE_ID_NR)
173 		--bc->used_by_btree[b->c.btree_id];
174 }
175 
176 int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
177 {
178 	BUG_ON(b->hash_val);
179 	b->hash_val = btree_ptr_hash_val(&b->key);
180 
181 	int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash,
182 						bch_btree_cache_params);
183 	if (!ret && b->c.btree_id < BTREE_ID_NR)
184 		bc->used_by_btree[b->c.btree_id]++;
185 	return ret;
186 }
187 
188 int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
189 				unsigned level, enum btree_id id)
190 {
191 	int ret;
192 
193 	b->c.level	= level;
194 	b->c.btree_id	= id;
195 
196 	mutex_lock(&bc->lock);
197 	ret = __bch2_btree_node_hash_insert(bc, b);
198 	if (!ret)
199 		list_add_tail(&b->list, &bc->live);
200 	mutex_unlock(&bc->lock);
201 
202 	return ret;
203 }
204 
205 void bch2_btree_node_update_key_early(struct btree_trans *trans,
206 				      enum btree_id btree, unsigned level,
207 				      struct bkey_s_c old, struct bkey_i *new)
208 {
209 	struct bch_fs *c = trans->c;
210 	struct btree *b;
211 	struct bkey_buf tmp;
212 	int ret;
213 
214 	bch2_bkey_buf_init(&tmp);
215 	bch2_bkey_buf_reassemble(&tmp, c, old);
216 
217 	b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
218 	if (!IS_ERR_OR_NULL(b)) {
219 		mutex_lock(&c->btree_cache.lock);
220 
221 		bch2_btree_node_hash_remove(&c->btree_cache, b);
222 
223 		bkey_copy(&b->key, new);
224 		ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
225 		BUG_ON(ret);
226 
227 		mutex_unlock(&c->btree_cache.lock);
228 		six_unlock_read(&b->c.lock);
229 	}
230 
231 	bch2_bkey_buf_exit(&tmp, c);
232 }
233 
234 __flatten
235 static inline struct btree *btree_cache_find(struct btree_cache *bc,
236 				     const struct bkey_i *k)
237 {
238 	u64 v = btree_ptr_hash_val(k);
239 
240 	return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
241 }
242 
243 /*
244  * this version is for btree nodes that have already been freed (we're not
245  * reaping a real btree node)
246  */
247 static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
248 {
249 	struct btree_cache *bc = &c->btree_cache;
250 	int ret = 0;
251 
252 	lockdep_assert_held(&bc->lock);
253 
254 	struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
255 
256 	u64 mask = b->c.level
257 		? bc->pinned_nodes_interior_mask
258 		: bc->pinned_nodes_leaf_mask;
259 
260 	if ((mask & BIT_ULL(b->c.btree_id)) &&
261 	    bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
262 	    bbpos_cmp(bc->pinned_nodes_end, pos) >= 0)
263 		return -BCH_ERR_ENOMEM_btree_node_reclaim;
264 
265 wait_on_io:
266 	if (b->flags & ((1U << BTREE_NODE_dirty)|
267 			(1U << BTREE_NODE_read_in_flight)|
268 			(1U << BTREE_NODE_write_in_flight))) {
269 		if (!flush) {
270 			if (btree_node_dirty(b))
271 				BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
272 			else if (btree_node_read_in_flight(b))
273 				BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
274 			else if (btree_node_write_in_flight(b))
275 				BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
276 			return -BCH_ERR_ENOMEM_btree_node_reclaim;
277 		}
278 
279 		/* XXX: waiting on IO with btree cache lock held */
280 		bch2_btree_node_wait_on_read(b);
281 		bch2_btree_node_wait_on_write(b);
282 	}
283 
284 	if (!six_trylock_intent(&b->c.lock)) {
285 		BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
286 		return -BCH_ERR_ENOMEM_btree_node_reclaim;
287 	}
288 
289 	if (!six_trylock_write(&b->c.lock)) {
290 		BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
291 		goto out_unlock_intent;
292 	}
293 
294 	/* recheck under lock */
295 	if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
296 			(1U << BTREE_NODE_write_in_flight))) {
297 		if (!flush) {
298 			if (btree_node_read_in_flight(b))
299 				BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
300 			else if (btree_node_write_in_flight(b))
301 				BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
302 			goto out_unlock;
303 		}
304 		six_unlock_write(&b->c.lock);
305 		six_unlock_intent(&b->c.lock);
306 		goto wait_on_io;
307 	}
308 
309 	if (btree_node_noevict(b)) {
310 		BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
311 		goto out_unlock;
312 	}
313 	if (btree_node_write_blocked(b)) {
314 		BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
315 		goto out_unlock;
316 	}
317 	if (btree_node_will_make_reachable(b)) {
318 		BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
319 		goto out_unlock;
320 	}
321 
322 	if (btree_node_dirty(b)) {
323 		if (!flush) {
324 			BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
325 			goto out_unlock;
326 		}
327 		/*
328 		 * Using the underscore version because we don't want to compact
329 		 * bsets after the write, since this node is about to be evicted
330 		 * - unless btree verify mode is enabled, since it runs out of
331 		 * the post write cleanup:
332 		 */
333 		if (bch2_verify_btree_ondisk)
334 			bch2_btree_node_write(c, b, SIX_LOCK_intent,
335 					      BTREE_WRITE_cache_reclaim);
336 		else
337 			__bch2_btree_node_write(c, b,
338 						BTREE_WRITE_cache_reclaim);
339 
340 		six_unlock_write(&b->c.lock);
341 		six_unlock_intent(&b->c.lock);
342 		goto wait_on_io;
343 	}
344 out:
345 	if (b->hash_val && !ret)
346 		trace_and_count(c, btree_cache_reap, c, b);
347 	return ret;
348 out_unlock:
349 	six_unlock_write(&b->c.lock);
350 out_unlock_intent:
351 	six_unlock_intent(&b->c.lock);
352 	ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
353 	goto out;
354 }
355 
356 static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
357 {
358 	return __btree_node_reclaim(c, b, false, shrinker_counter);
359 }
360 
361 static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
362 {
363 	return __btree_node_reclaim(c, b, true, false);
364 }
365 
366 static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
367 					   struct shrink_control *sc)
368 {
369 	struct bch_fs *c = shrink->private_data;
370 	struct btree_cache *bc = &c->btree_cache;
371 	struct btree *b, *t;
372 	unsigned long nr = sc->nr_to_scan;
373 	unsigned long can_free = 0;
374 	unsigned long freed = 0;
375 	unsigned long touched = 0;
376 	unsigned i, flags;
377 	unsigned long ret = SHRINK_STOP;
378 	bool trigger_writes = atomic_read(&bc->dirty) + nr >=
379 		bc->used * 3 / 4;
380 
381 	if (bch2_btree_shrinker_disabled)
382 		return SHRINK_STOP;
383 
384 	mutex_lock(&bc->lock);
385 	flags = memalloc_nofs_save();
386 
387 	/*
388 	 * It's _really_ critical that we don't free too many btree nodes - we
389 	 * have to always leave ourselves a reserve. The reserve is how we
390 	 * guarantee that allocating memory for a new btree node can always
391 	 * succeed, so that inserting keys into the btree can always succeed and
392 	 * IO can always make forward progress:
393 	 */
394 	can_free = btree_cache_can_free(bc);
395 	nr = min_t(unsigned long, nr, can_free);
396 
397 	i = 0;
398 	list_for_each_entry_safe(b, t, &bc->freeable, list) {
399 		/*
400 		 * Leave a few nodes on the freeable list, so that a btree split
401 		 * won't have to hit the system allocator:
402 		 */
403 		if (++i <= 3)
404 			continue;
405 
406 		touched++;
407 
408 		if (touched >= nr)
409 			goto out;
410 
411 		if (!btree_node_reclaim(c, b, true)) {
412 			btree_node_data_free(c, b);
413 			six_unlock_write(&b->c.lock);
414 			six_unlock_intent(&b->c.lock);
415 			freed++;
416 			bc->freed++;
417 		}
418 	}
419 restart:
420 	list_for_each_entry_safe(b, t, &bc->live, list) {
421 		touched++;
422 
423 		if (btree_node_accessed(b)) {
424 			clear_btree_node_accessed(b);
425 			bc->not_freed_access_bit++;
426 		} else if (!btree_node_reclaim(c, b, true)) {
427 			freed++;
428 			btree_node_data_free(c, b);
429 			bc->freed++;
430 
431 			bch2_btree_node_hash_remove(bc, b);
432 			six_unlock_write(&b->c.lock);
433 			six_unlock_intent(&b->c.lock);
434 
435 			if (freed == nr)
436 				goto out_rotate;
437 		} else if (trigger_writes &&
438 			   btree_node_dirty(b) &&
439 			   !btree_node_will_make_reachable(b) &&
440 			   !btree_node_write_blocked(b) &&
441 			   six_trylock_read(&b->c.lock)) {
442 			list_move(&bc->live, &b->list);
443 			mutex_unlock(&bc->lock);
444 			__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
445 			six_unlock_read(&b->c.lock);
446 			if (touched >= nr)
447 				goto out_nounlock;
448 			mutex_lock(&bc->lock);
449 			goto restart;
450 		}
451 
452 		if (touched >= nr)
453 			break;
454 	}
455 out_rotate:
456 	if (&t->list != &bc->live)
457 		list_move_tail(&bc->live, &t->list);
458 out:
459 	mutex_unlock(&bc->lock);
460 out_nounlock:
461 	ret = freed;
462 	memalloc_nofs_restore(flags);
463 	trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
464 	return ret;
465 }
466 
467 static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
468 					    struct shrink_control *sc)
469 {
470 	struct bch_fs *c = shrink->private_data;
471 	struct btree_cache *bc = &c->btree_cache;
472 
473 	if (bch2_btree_shrinker_disabled)
474 		return 0;
475 
476 	return btree_cache_can_free(bc);
477 }
478 
479 void bch2_fs_btree_cache_exit(struct bch_fs *c)
480 {
481 	struct btree_cache *bc = &c->btree_cache;
482 	struct btree *b;
483 	unsigned i, flags;
484 
485 	shrinker_free(bc->shrink);
486 
487 	/* vfree() can allocate memory: */
488 	flags = memalloc_nofs_save();
489 	mutex_lock(&bc->lock);
490 
491 	if (c->verify_data)
492 		list_move(&c->verify_data->list, &bc->live);
493 
494 	kvfree(c->verify_ondisk);
495 
496 	for (i = 0; i < btree_id_nr_alive(c); i++) {
497 		struct btree_root *r = bch2_btree_id_root(c, i);
498 
499 		if (r->b)
500 			list_add(&r->b->list, &bc->live);
501 	}
502 
503 	list_splice(&bc->freeable, &bc->live);
504 
505 	while (!list_empty(&bc->live)) {
506 		b = list_first_entry(&bc->live, struct btree, list);
507 
508 		BUG_ON(btree_node_read_in_flight(b) ||
509 		       btree_node_write_in_flight(b));
510 
511 		btree_node_data_free(c, b);
512 	}
513 
514 	BUG_ON(!bch2_journal_error(&c->journal) &&
515 	       atomic_read(&c->btree_cache.dirty));
516 
517 	list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
518 
519 	while (!list_empty(&bc->freed_nonpcpu)) {
520 		b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
521 		list_del(&b->list);
522 		six_lock_exit(&b->c.lock);
523 		kfree(b);
524 	}
525 
526 	mutex_unlock(&bc->lock);
527 	memalloc_nofs_restore(flags);
528 
529 	if (bc->table_init_done)
530 		rhashtable_destroy(&bc->table);
531 }
532 
533 int bch2_fs_btree_cache_init(struct bch_fs *c)
534 {
535 	struct btree_cache *bc = &c->btree_cache;
536 	struct shrinker *shrink;
537 	unsigned i;
538 	int ret = 0;
539 
540 	ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
541 	if (ret)
542 		goto err;
543 
544 	bc->table_init_done = true;
545 
546 	bch2_recalc_btree_reserve(c);
547 
548 	for (i = 0; i < bc->reserve; i++)
549 		if (!__bch2_btree_node_mem_alloc(c))
550 			goto err;
551 
552 	list_splice_init(&bc->live, &bc->freeable);
553 
554 	mutex_init(&c->verify_lock);
555 
556 	shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
557 	if (!shrink)
558 		goto err;
559 	bc->shrink = shrink;
560 	shrink->count_objects	= bch2_btree_cache_count;
561 	shrink->scan_objects	= bch2_btree_cache_scan;
562 	shrink->seeks		= 4;
563 	shrink->private_data	= c;
564 	shrinker_register(shrink);
565 
566 	return 0;
567 err:
568 	return -BCH_ERR_ENOMEM_fs_btree_cache_init;
569 }
570 
571 void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
572 {
573 	mutex_init(&bc->lock);
574 	INIT_LIST_HEAD(&bc->live);
575 	INIT_LIST_HEAD(&bc->freeable);
576 	INIT_LIST_HEAD(&bc->freed_pcpu);
577 	INIT_LIST_HEAD(&bc->freed_nonpcpu);
578 }
579 
580 /*
581  * We can only have one thread cannibalizing other cached btree nodes at a time,
582  * or we'll deadlock. We use an open coded mutex to ensure that, which a
583  * cannibalize_bucket() will take. This means every time we unlock the root of
584  * the btree, we need to release this lock if we have it held.
585  */
586 void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans)
587 {
588 	struct bch_fs *c = trans->c;
589 	struct btree_cache *bc = &c->btree_cache;
590 
591 	if (bc->alloc_lock == current) {
592 		trace_and_count(c, btree_cache_cannibalize_unlock, trans);
593 		bc->alloc_lock = NULL;
594 		closure_wake_up(&bc->alloc_wait);
595 	}
596 }
597 
598 int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl)
599 {
600 	struct bch_fs *c = trans->c;
601 	struct btree_cache *bc = &c->btree_cache;
602 	struct task_struct *old;
603 
604 	old = cmpxchg(&bc->alloc_lock, NULL, current);
605 	if (old == NULL || old == current)
606 		goto success;
607 
608 	if (!cl) {
609 		trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
610 		return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
611 	}
612 
613 	closure_wait(&bc->alloc_wait, cl);
614 
615 	/* Try again, after adding ourselves to waitlist */
616 	old = cmpxchg(&bc->alloc_lock, NULL, current);
617 	if (old == NULL || old == current) {
618 		/* We raced */
619 		closure_wake_up(&bc->alloc_wait);
620 		goto success;
621 	}
622 
623 	trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
624 	return -BCH_ERR_btree_cache_cannibalize_lock_blocked;
625 
626 success:
627 	trace_and_count(c, btree_cache_cannibalize_lock, trans);
628 	return 0;
629 }
630 
631 static struct btree *btree_node_cannibalize(struct bch_fs *c)
632 {
633 	struct btree_cache *bc = &c->btree_cache;
634 	struct btree *b;
635 
636 	list_for_each_entry_reverse(b, &bc->live, list)
637 		if (!btree_node_reclaim(c, b, false))
638 			return b;
639 
640 	while (1) {
641 		list_for_each_entry_reverse(b, &bc->live, list)
642 			if (!btree_node_write_and_reclaim(c, b))
643 				return b;
644 
645 		/*
646 		 * Rare case: all nodes were intent-locked.
647 		 * Just busy-wait.
648 		 */
649 		WARN_ONCE(1, "btree cache cannibalize failed\n");
650 		cond_resched();
651 	}
652 }
653 
654 struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
655 {
656 	struct bch_fs *c = trans->c;
657 	struct btree_cache *bc = &c->btree_cache;
658 	struct list_head *freed = pcpu_read_locks
659 		? &bc->freed_pcpu
660 		: &bc->freed_nonpcpu;
661 	struct btree *b, *b2;
662 	u64 start_time = local_clock();
663 	unsigned flags;
664 
665 	flags = memalloc_nofs_save();
666 	mutex_lock(&bc->lock);
667 
668 	/*
669 	 * We never free struct btree itself, just the memory that holds the on
670 	 * disk node. Check the freed list before allocating a new one:
671 	 */
672 	list_for_each_entry(b, freed, list)
673 		if (!btree_node_reclaim(c, b, false)) {
674 			list_del_init(&b->list);
675 			goto got_node;
676 		}
677 
678 	b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
679 	if (!b) {
680 		mutex_unlock(&bc->lock);
681 		bch2_trans_unlock(trans);
682 		b = __btree_node_mem_alloc(c, GFP_KERNEL);
683 		if (!b)
684 			goto err;
685 		mutex_lock(&bc->lock);
686 	}
687 
688 	bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);
689 
690 	BUG_ON(!six_trylock_intent(&b->c.lock));
691 	BUG_ON(!six_trylock_write(&b->c.lock));
692 got_node:
693 
694 	/*
695 	 * btree_free() doesn't free memory; it sticks the node on the end of
696 	 * the list. Check if there's any freed nodes there:
697 	 */
698 	list_for_each_entry(b2, &bc->freeable, list)
699 		if (!btree_node_reclaim(c, b2, false)) {
700 			swap(b->data, b2->data);
701 			swap(b->aux_data, b2->aux_data);
702 			btree_node_to_freedlist(bc, b2);
703 			six_unlock_write(&b2->c.lock);
704 			six_unlock_intent(&b2->c.lock);
705 			goto got_mem;
706 		}
707 
708 	mutex_unlock(&bc->lock);
709 
710 	if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
711 		bch2_trans_unlock(trans);
712 		if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
713 			goto err;
714 	}
715 
716 	mutex_lock(&bc->lock);
717 	bc->used++;
718 got_mem:
719 	mutex_unlock(&bc->lock);
720 
721 	BUG_ON(btree_node_hashed(b));
722 	BUG_ON(btree_node_dirty(b));
723 	BUG_ON(btree_node_write_in_flight(b));
724 out:
725 	b->flags		= 0;
726 	b->written		= 0;
727 	b->nsets		= 0;
728 	b->sib_u64s[0]		= 0;
729 	b->sib_u64s[1]		= 0;
730 	b->whiteout_u64s	= 0;
731 	bch2_btree_keys_init(b);
732 	set_btree_node_accessed(b);
733 
734 	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
735 			       start_time);
736 
737 	memalloc_nofs_restore(flags);
738 	return b;
739 err:
740 	mutex_lock(&bc->lock);
741 
742 	/* Try to cannibalize another cached btree node: */
743 	if (bc->alloc_lock == current) {
744 		b2 = btree_node_cannibalize(c);
745 		clear_btree_node_just_written(b2);
746 		bch2_btree_node_hash_remove(bc, b2);
747 
748 		if (b) {
749 			swap(b->data, b2->data);
750 			swap(b->aux_data, b2->aux_data);
751 			btree_node_to_freedlist(bc, b2);
752 			six_unlock_write(&b2->c.lock);
753 			six_unlock_intent(&b2->c.lock);
754 		} else {
755 			b = b2;
756 			list_del_init(&b->list);
757 		}
758 
759 		mutex_unlock(&bc->lock);
760 
761 		trace_and_count(c, btree_cache_cannibalize, trans);
762 		goto out;
763 	}
764 
765 	mutex_unlock(&bc->lock);
766 	memalloc_nofs_restore(flags);
767 	return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
768 }
769 
770 /* Slowpath, don't want it inlined into btree_iter_traverse() */
771 static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
772 				struct btree_path *path,
773 				const struct bkey_i *k,
774 				enum btree_id btree_id,
775 				unsigned level,
776 				enum six_lock_type lock_type,
777 				bool sync)
778 {
779 	struct bch_fs *c = trans->c;
780 	struct btree_cache *bc = &c->btree_cache;
781 	struct btree *b;
782 
783 	if (unlikely(level >= BTREE_MAX_DEPTH)) {
784 		int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u",
785 						 level, BTREE_MAX_DEPTH);
786 		return ERR_PTR(ret);
787 	}
788 
789 	if (unlikely(!bkey_is_btree_ptr(&k->k))) {
790 		struct printbuf buf = PRINTBUF;
791 		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
792 
793 		int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf);
794 		printbuf_exit(&buf);
795 		return ERR_PTR(ret);
796 	}
797 
798 	if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) {
799 		struct printbuf buf = PRINTBUF;
800 		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
801 
802 		int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf);
803 		printbuf_exit(&buf);
804 		return ERR_PTR(ret);
805 	}
806 
807 	/*
808 	 * Parent node must be locked, else we could read in a btree node that's
809 	 * been freed:
810 	 */
811 	if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
812 		trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
813 		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
814 	}
815 
816 	b = bch2_btree_node_mem_alloc(trans, level != 0);
817 
818 	if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
819 		if (!path)
820 			return b;
821 
822 		trans->memory_allocation_failure = true;
823 		trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
824 		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
825 	}
826 
827 	if (IS_ERR(b))
828 		return b;
829 
830 	bkey_copy(&b->key, k);
831 	if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
832 		/* raced with another fill: */
833 
834 		/* mark as unhashed... */
835 		b->hash_val = 0;
836 
837 		mutex_lock(&bc->lock);
838 		list_add(&b->list, &bc->freeable);
839 		mutex_unlock(&bc->lock);
840 
841 		six_unlock_write(&b->c.lock);
842 		six_unlock_intent(&b->c.lock);
843 		return NULL;
844 	}
845 
846 	set_btree_node_read_in_flight(b);
847 	six_unlock_write(&b->c.lock);
848 
849 	if (path) {
850 		u32 seq = six_lock_seq(&b->c.lock);
851 
852 		/* Unlock before doing IO: */
853 		six_unlock_intent(&b->c.lock);
854 		bch2_trans_unlock_noassert(trans);
855 
856 		bch2_btree_node_read(trans, b, sync);
857 
858 		if (!sync)
859 			return NULL;
860 
861 		if (!six_relock_type(&b->c.lock, lock_type, seq))
862 			b = NULL;
863 	} else {
864 		bch2_btree_node_read(trans, b, sync);
865 		if (lock_type == SIX_LOCK_read)
866 			six_lock_downgrade(&b->c.lock);
867 	}
868 
869 	return b;
870 }
871 
872 static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
873 {
874 	struct printbuf buf = PRINTBUF;
875 
876 	if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
877 		return;
878 
879 	prt_printf(&buf,
880 	       "btree node header doesn't match ptr\n"
881 	       "btree %s level %u\n"
882 	       "ptr: ",
883 	       bch2_btree_id_str(b->c.btree_id), b->c.level);
884 	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
885 
886 	prt_printf(&buf, "\nheader: btree %s level %llu\n"
887 	       "min ",
888 	       bch2_btree_id_str(BTREE_NODE_ID(b->data)),
889 	       BTREE_NODE_LEVEL(b->data));
890 	bch2_bpos_to_text(&buf, b->data->min_key);
891 
892 	prt_printf(&buf, "\nmax ");
893 	bch2_bpos_to_text(&buf, b->data->max_key);
894 
895 	bch2_fs_topology_error(c, "%s", buf.buf);
896 
897 	printbuf_exit(&buf);
898 }
899 
900 static inline void btree_check_header(struct bch_fs *c, struct btree *b)
901 {
902 	if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
903 	    b->c.level != BTREE_NODE_LEVEL(b->data) ||
904 	    !bpos_eq(b->data->max_key, b->key.k.p) ||
905 	    (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
906 	     !bpos_eq(b->data->min_key,
907 		      bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
908 		btree_bad_header(c, b);
909 }
910 
911 static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
912 					   const struct bkey_i *k, unsigned level,
913 					   enum six_lock_type lock_type,
914 					   unsigned long trace_ip)
915 {
916 	struct bch_fs *c = trans->c;
917 	struct btree_cache *bc = &c->btree_cache;
918 	struct btree *b;
919 	bool need_relock = false;
920 	int ret;
921 
922 	EBUG_ON(level >= BTREE_MAX_DEPTH);
923 retry:
924 	b = btree_cache_find(bc, k);
925 	if (unlikely(!b)) {
926 		/*
927 		 * We must have the parent locked to call bch2_btree_node_fill(),
928 		 * else we could read in a btree node from disk that's been
929 		 * freed:
930 		 */
931 		b = bch2_btree_node_fill(trans, path, k, path->btree_id,
932 					 level, lock_type, true);
933 		need_relock = true;
934 
935 		/* We raced and found the btree node in the cache */
936 		if (!b)
937 			goto retry;
938 
939 		if (IS_ERR(b))
940 			return b;
941 	} else {
942 		if (btree_node_read_locked(path, level + 1))
943 			btree_node_unlock(trans, path, level + 1);
944 
945 		ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
946 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
947 			return ERR_PTR(ret);
948 
949 		BUG_ON(ret);
950 
951 		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
952 			     b->c.level != level ||
953 			     race_fault())) {
954 			six_unlock_type(&b->c.lock, lock_type);
955 			if (bch2_btree_node_relock(trans, path, level + 1))
956 				goto retry;
957 
958 			trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
959 			return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
960 		}
961 
962 		/* avoid atomic set bit if it's not needed: */
963 		if (!btree_node_accessed(b))
964 			set_btree_node_accessed(b);
965 	}
966 
967 	if (unlikely(btree_node_read_in_flight(b))) {
968 		u32 seq = six_lock_seq(&b->c.lock);
969 
970 		six_unlock_type(&b->c.lock, lock_type);
971 		bch2_trans_unlock(trans);
972 		need_relock = true;
973 
974 		bch2_btree_node_wait_on_read(b);
975 
976 		/*
977 		 * should_be_locked is not set on this path yet, so we need to
978 		 * relock it specifically:
979 		 */
980 		if (!six_relock_type(&b->c.lock, lock_type, seq))
981 			goto retry;
982 	}
983 
984 	if (unlikely(need_relock)) {
985 		ret = bch2_trans_relock(trans) ?:
986 			bch2_btree_path_relock_intent(trans, path);
987 		if (ret) {
988 			six_unlock_type(&b->c.lock, lock_type);
989 			return ERR_PTR(ret);
990 		}
991 	}
992 
993 	prefetch(b->aux_data);
994 
995 	for_each_bset(b, t) {
996 		void *p = (u64 *) b->aux_data + t->aux_data_offset;
997 
998 		prefetch(p + L1_CACHE_BYTES * 0);
999 		prefetch(p + L1_CACHE_BYTES * 1);
1000 		prefetch(p + L1_CACHE_BYTES * 2);
1001 	}
1002 
1003 	if (unlikely(btree_node_read_error(b))) {
1004 		six_unlock_type(&b->c.lock, lock_type);
1005 		return ERR_PTR(-BCH_ERR_btree_node_read_error);
1006 	}
1007 
1008 	EBUG_ON(b->c.btree_id != path->btree_id);
1009 	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
1010 	btree_check_header(c, b);
1011 
1012 	return b;
1013 }
1014 
1015 /**
1016  * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
1017  * in from disk if necessary.
1018  *
1019  * @trans:	btree transaction object
1020  * @path:	btree_path being traversed
1021  * @k:		pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
1022  * @level:	level of btree node being looked up (0 == leaf node)
1023  * @lock_type:	SIX_LOCK_read or SIX_LOCK_intent
1024  * @trace_ip:	ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
1025  *
1026  * The btree node will have either a read or a write lock held, depending on
1027  * the @write parameter.
1028  *
1029  * Returns: btree node or ERR_PTR()
1030  */
1031 struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
1032 				  const struct bkey_i *k, unsigned level,
1033 				  enum six_lock_type lock_type,
1034 				  unsigned long trace_ip)
1035 {
1036 	struct bch_fs *c = trans->c;
1037 	struct btree *b;
1038 	int ret;
1039 
1040 	EBUG_ON(level >= BTREE_MAX_DEPTH);
1041 
1042 	b = btree_node_mem_ptr(k);
1043 
1044 	/*
1045 	 * Check b->hash_val _before_ calling btree_node_lock() - this might not
1046 	 * be the node we want anymore, and trying to lock the wrong node could
1047 	 * cause an unneccessary transaction restart:
1048 	 */
1049 	if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
1050 		     !b ||
1051 		     b->hash_val != btree_ptr_hash_val(k)))
1052 		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
1053 
1054 	if (btree_node_read_locked(path, level + 1))
1055 		btree_node_unlock(trans, path, level + 1);
1056 
1057 	ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
1058 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1059 		return ERR_PTR(ret);
1060 
1061 	BUG_ON(ret);
1062 
1063 	if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
1064 		     b->c.level != level ||
1065 		     race_fault())) {
1066 		six_unlock_type(&b->c.lock, lock_type);
1067 		if (bch2_btree_node_relock(trans, path, level + 1))
1068 			return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
1069 
1070 		trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
1071 		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
1072 	}
1073 
1074 	if (unlikely(btree_node_read_in_flight(b))) {
1075 		six_unlock_type(&b->c.lock, lock_type);
1076 		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
1077 	}
1078 
1079 	prefetch(b->aux_data);
1080 
1081 	for_each_bset(b, t) {
1082 		void *p = (u64 *) b->aux_data + t->aux_data_offset;
1083 
1084 		prefetch(p + L1_CACHE_BYTES * 0);
1085 		prefetch(p + L1_CACHE_BYTES * 1);
1086 		prefetch(p + L1_CACHE_BYTES * 2);
1087 	}
1088 
1089 	/* avoid atomic set bit if it's not needed: */
1090 	if (!btree_node_accessed(b))
1091 		set_btree_node_accessed(b);
1092 
1093 	if (unlikely(btree_node_read_error(b))) {
1094 		six_unlock_type(&b->c.lock, lock_type);
1095 		return ERR_PTR(-BCH_ERR_btree_node_read_error);
1096 	}
1097 
1098 	EBUG_ON(b->c.btree_id != path->btree_id);
1099 	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
1100 	btree_check_header(c, b);
1101 
1102 	return b;
1103 }
1104 
1105 struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
1106 					 const struct bkey_i *k,
1107 					 enum btree_id btree_id,
1108 					 unsigned level,
1109 					 bool nofill)
1110 {
1111 	struct bch_fs *c = trans->c;
1112 	struct btree_cache *bc = &c->btree_cache;
1113 	struct btree *b;
1114 	int ret;
1115 
1116 	EBUG_ON(level >= BTREE_MAX_DEPTH);
1117 
1118 	if (c->opts.btree_node_mem_ptr_optimization) {
1119 		b = btree_node_mem_ptr(k);
1120 		if (b)
1121 			goto lock_node;
1122 	}
1123 retry:
1124 	b = btree_cache_find(bc, k);
1125 	if (unlikely(!b)) {
1126 		if (nofill)
1127 			goto out;
1128 
1129 		b = bch2_btree_node_fill(trans, NULL, k, btree_id,
1130 					 level, SIX_LOCK_read, true);
1131 
1132 		/* We raced and found the btree node in the cache */
1133 		if (!b)
1134 			goto retry;
1135 
1136 		if (IS_ERR(b) &&
1137 		    !bch2_btree_cache_cannibalize_lock(trans, NULL))
1138 			goto retry;
1139 
1140 		if (IS_ERR(b))
1141 			goto out;
1142 	} else {
1143 lock_node:
1144 		ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
1145 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1146 			return ERR_PTR(ret);
1147 
1148 		BUG_ON(ret);
1149 
1150 		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
1151 			     b->c.btree_id != btree_id ||
1152 			     b->c.level != level)) {
1153 			six_unlock_read(&b->c.lock);
1154 			goto retry;
1155 		}
1156 	}
1157 
1158 	/* XXX: waiting on IO with btree locks held: */
1159 	__bch2_btree_node_wait_on_read(b);
1160 
1161 	prefetch(b->aux_data);
1162 
1163 	for_each_bset(b, t) {
1164 		void *p = (u64 *) b->aux_data + t->aux_data_offset;
1165 
1166 		prefetch(p + L1_CACHE_BYTES * 0);
1167 		prefetch(p + L1_CACHE_BYTES * 1);
1168 		prefetch(p + L1_CACHE_BYTES * 2);
1169 	}
1170 
1171 	/* avoid atomic set bit if it's not needed: */
1172 	if (!btree_node_accessed(b))
1173 		set_btree_node_accessed(b);
1174 
1175 	if (unlikely(btree_node_read_error(b))) {
1176 		six_unlock_read(&b->c.lock);
1177 		b = ERR_PTR(-BCH_ERR_btree_node_read_error);
1178 		goto out;
1179 	}
1180 
1181 	EBUG_ON(b->c.btree_id != btree_id);
1182 	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
1183 	btree_check_header(c, b);
1184 out:
1185 	bch2_btree_cache_cannibalize_unlock(trans);
1186 	return b;
1187 }
1188 
1189 int bch2_btree_node_prefetch(struct btree_trans *trans,
1190 			     struct btree_path *path,
1191 			     const struct bkey_i *k,
1192 			     enum btree_id btree_id, unsigned level)
1193 {
1194 	struct bch_fs *c = trans->c;
1195 	struct btree_cache *bc = &c->btree_cache;
1196 
1197 	BUG_ON(path && !btree_node_locked(path, level + 1));
1198 	BUG_ON(level >= BTREE_MAX_DEPTH);
1199 
1200 	struct btree *b = btree_cache_find(bc, k);
1201 	if (b)
1202 		return 0;
1203 
1204 	b = bch2_btree_node_fill(trans, path, k, btree_id,
1205 				 level, SIX_LOCK_read, false);
1206 	if (!IS_ERR_OR_NULL(b))
1207 		six_unlock_read(&b->c.lock);
1208 	return bch2_trans_relock(trans) ?: PTR_ERR_OR_ZERO(b);
1209 }
1210 
1211 void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
1212 {
1213 	struct bch_fs *c = trans->c;
1214 	struct btree_cache *bc = &c->btree_cache;
1215 	struct btree *b;
1216 
1217 	b = btree_cache_find(bc, k);
1218 	if (!b)
1219 		return;
1220 
1221 	BUG_ON(b == btree_node_root(trans->c, b));
1222 wait_on_io:
1223 	/* not allowed to wait on io with btree locks held: */
1224 
1225 	/* XXX we're called from btree_gc which will be holding other btree
1226 	 * nodes locked
1227 	 */
1228 	__bch2_btree_node_wait_on_read(b);
1229 	__bch2_btree_node_wait_on_write(b);
1230 
1231 	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
1232 	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
1233 	if (unlikely(b->hash_val != btree_ptr_hash_val(k)))
1234 		goto out;
1235 
1236 	if (btree_node_dirty(b)) {
1237 		__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
1238 		six_unlock_write(&b->c.lock);
1239 		six_unlock_intent(&b->c.lock);
1240 		goto wait_on_io;
1241 	}
1242 
1243 	BUG_ON(btree_node_dirty(b));
1244 
1245 	mutex_lock(&bc->lock);
1246 	btree_node_data_free(c, b);
1247 	bch2_btree_node_hash_remove(bc, b);
1248 	mutex_unlock(&bc->lock);
1249 out:
1250 	six_unlock_write(&b->c.lock);
1251 	six_unlock_intent(&b->c.lock);
1252 }
1253 
1254 const char *bch2_btree_id_str(enum btree_id btree)
1255 {
1256 	return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)";
1257 }
1258 
1259 void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
1260 {
1261 	prt_printf(out, "%s level %u/%u\n  ",
1262 	       bch2_btree_id_str(b->c.btree_id),
1263 	       b->c.level,
1264 	       bch2_btree_id_root(c, b->c.btree_id)->level);
1265 	bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
1266 }
1267 
1268 void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
1269 {
1270 	struct bset_stats stats;
1271 
1272 	memset(&stats, 0, sizeof(stats));
1273 
1274 	bch2_btree_keys_stats(b, &stats);
1275 
1276 	prt_printf(out, "l %u ", b->c.level);
1277 	bch2_bpos_to_text(out, b->data->min_key);
1278 	prt_printf(out, " - ");
1279 	bch2_bpos_to_text(out, b->data->max_key);
1280 	prt_printf(out, ":\n"
1281 	       "    ptrs: ");
1282 	bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
1283 	prt_newline(out);
1284 
1285 	prt_printf(out,
1286 	       "    format: ");
1287 	bch2_bkey_format_to_text(out, &b->format);
1288 
1289 	prt_printf(out,
1290 	       "    unpack fn len: %u\n"
1291 	       "    bytes used %zu/%zu (%zu%% full)\n"
1292 	       "    sib u64s: %u, %u (merge threshold %u)\n"
1293 	       "    nr packed keys %u\n"
1294 	       "    nr unpacked keys %u\n"
1295 	       "    floats %zu\n"
1296 	       "    failed unpacked %zu\n",
1297 	       b->unpack_fn_len,
1298 	       b->nr.live_u64s * sizeof(u64),
1299 	       btree_buf_bytes(b) - sizeof(struct btree_node),
1300 	       b->nr.live_u64s * 100 / btree_max_u64s(c),
1301 	       b->sib_u64s[0],
1302 	       b->sib_u64s[1],
1303 	       c->btree_foreground_merge_threshold,
1304 	       b->nr.packed_keys,
1305 	       b->nr.unpacked_keys,
1306 	       stats.floats,
1307 	       stats.failed);
1308 }
1309 
1310 static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c,
1311 				 const char *label, unsigned nr)
1312 {
1313 	prt_printf(out, "%s\t", label);
1314 	prt_human_readable_u64(out, nr * c->opts.btree_node_size);
1315 	prt_printf(out, " (%u)\n", nr);
1316 }
1317 
1318 void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc)
1319 {
1320 	struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
1321 
1322 	if (!out->nr_tabstops)
1323 		printbuf_tabstop_push(out, 32);
1324 
1325 	prt_btree_cache_line(out, c, "total:",		bc->used);
1326 	prt_btree_cache_line(out, c, "nr dirty:",	atomic_read(&bc->dirty));
1327 	prt_printf(out, "cannibalize lock:\t%p\n",	bc->alloc_lock);
1328 	prt_newline(out);
1329 
1330 	for (unsigned i = 0; i < ARRAY_SIZE(bc->used_by_btree); i++)
1331 		prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->used_by_btree[i]);
1332 
1333 	prt_newline(out);
1334 	prt_printf(out, "freed:\t%u\n", bc->freed);
1335 	prt_printf(out, "not freed:\n");
1336 	prt_printf(out, "  dirty\t%u\n", bc->not_freed_dirty);
1337 	prt_printf(out, "  write in flight\t%u\n", bc->not_freed_write_in_flight);
1338 	prt_printf(out, "  read in flight\t%u\n", bc->not_freed_read_in_flight);
1339 	prt_printf(out, "  lock intent failed\t%u\n", bc->not_freed_lock_intent);
1340 	prt_printf(out, "  lock write failed\t%u\n", bc->not_freed_lock_write);
1341 	prt_printf(out, "  access bit\t%u\n", bc->not_freed_access_bit);
1342 	prt_printf(out, "  no evict failed\t%u\n", bc->not_freed_noevict);
1343 	prt_printf(out, "  write blocked\t%u\n", bc->not_freed_write_blocked);
1344 	prt_printf(out, "  will make reachable\t%u\n", bc->not_freed_will_make_reachable);
1345 }
1346