xref: /linux/fs/bcachefs/btree_cache.c (revision 024bfd2e9d80d7131f1178eb2235030b96f7ef0e)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "bcachefs.h"
4 #include "bbpos.h"
5 #include "bkey_buf.h"
6 #include "btree_cache.h"
7 #include "btree_io.h"
8 #include "btree_iter.h"
9 #include "btree_locking.h"
10 #include "debug.h"
11 #include "errcode.h"
12 #include "error.h"
13 #include "journal.h"
14 #include "trace.h"
15 
16 #include <linux/prefetch.h>
17 #include <linux/sched/mm.h>
18 #include <linux/swap.h>
19 
20 #define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
21 do {						 \
22 	if (shrinker_counter)			 \
23 		bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_##counter]++;	 \
24 } while (0)
25 
26 const char * const bch2_btree_node_flags[] = {
27 #define x(f)	#f,
28 	BTREE_FLAGS()
29 #undef x
30 	NULL
31 };
32 
33 void bch2_recalc_btree_reserve(struct bch_fs *c)
34 {
35 	unsigned reserve = 16;
36 
37 	if (!c->btree_roots_known[0].b)
38 		reserve += 8;
39 
40 	for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
41 		struct btree_root *r = bch2_btree_id_root(c, i);
42 
43 		if (r->b)
44 			reserve += min_t(unsigned, 1, r->b->c.level) * 8;
45 	}
46 
47 	c->btree_cache.nr_reserve = reserve;
48 }
49 
50 static inline size_t btree_cache_can_free(struct btree_cache_list *list)
51 {
52 	struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
53 
54 	size_t can_free = list->nr;
55 	if (!list->idx)
56 		can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve);
57 	return can_free;
58 }
59 
60 static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
61 {
62 	BUG_ON(!list_empty(&b->list));
63 
64 	if (b->c.lock.readers)
65 		list_add(&b->list, &bc->freed_pcpu);
66 	else
67 		list_add(&b->list, &bc->freed_nonpcpu);
68 }
69 
70 static void __bch2_btree_node_to_freelist(struct btree_cache *bc, struct btree *b)
71 {
72 	BUG_ON(!list_empty(&b->list));
73 	BUG_ON(!b->data);
74 
75 	bc->nr_freeable++;
76 	list_add(&b->list, &bc->freeable);
77 }
78 
79 void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
80 {
81 	struct btree_cache *bc = &c->btree_cache;
82 
83 	mutex_lock(&bc->lock);
84 	__bch2_btree_node_to_freelist(bc, b);
85 	mutex_unlock(&bc->lock);
86 
87 	six_unlock_write(&b->c.lock);
88 	six_unlock_intent(&b->c.lock);
89 }
90 
91 static void __btree_node_data_free(struct btree_cache *bc, struct btree *b)
92 {
93 	BUG_ON(!list_empty(&b->list));
94 	BUG_ON(btree_node_hashed(b));
95 
96 	/*
97 	 * This should really be done in slub/vmalloc, but we're using the
98 	 * kmalloc_large() path, so we're working around a slub bug by doing
99 	 * this here:
100 	 */
101 	if (b->data)
102 		mm_account_reclaimed_pages(btree_buf_bytes(b) / PAGE_SIZE);
103 	if (b->aux_data)
104 		mm_account_reclaimed_pages(btree_aux_data_bytes(b) / PAGE_SIZE);
105 
106 	EBUG_ON(btree_node_write_in_flight(b));
107 
108 	clear_btree_node_just_written(b);
109 
110 	kvfree(b->data);
111 	b->data = NULL;
112 #ifdef __KERNEL__
113 	kvfree(b->aux_data);
114 #else
115 	munmap(b->aux_data, btree_aux_data_bytes(b));
116 #endif
117 	b->aux_data = NULL;
118 
119 	btree_node_to_freedlist(bc, b);
120 }
121 
122 static void btree_node_data_free(struct btree_cache *bc, struct btree *b)
123 {
124 	BUG_ON(list_empty(&b->list));
125 	list_del_init(&b->list);
126 	--bc->nr_freeable;
127 	__btree_node_data_free(bc, b);
128 }
129 
130 static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
131 				   const void *obj)
132 {
133 	const struct btree *b = obj;
134 	const u64 *v = arg->key;
135 
136 	return b->hash_val == *v ? 0 : 1;
137 }
138 
139 static const struct rhashtable_params bch_btree_cache_params = {
140 	.head_offset		= offsetof(struct btree, hash),
141 	.key_offset		= offsetof(struct btree, hash_val),
142 	.key_len		= sizeof(u64),
143 	.obj_cmpfn		= bch2_btree_cache_cmp_fn,
144 	.automatic_shrinking	= true,
145 };
146 
147 static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
148 {
149 	BUG_ON(b->data || b->aux_data);
150 
151 	gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;
152 
153 	b->data = kvmalloc(btree_buf_bytes(b), gfp);
154 	if (!b->data)
155 		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
156 #ifdef __KERNEL__
157 	b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
158 #else
159 	b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
160 			   PROT_READ|PROT_WRITE|PROT_EXEC,
161 			   MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
162 	if (b->aux_data == MAP_FAILED)
163 		b->aux_data = NULL;
164 #endif
165 	if (!b->aux_data) {
166 		kvfree(b->data);
167 		b->data = NULL;
168 		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
169 	}
170 
171 	return 0;
172 }
173 
174 static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
175 {
176 	struct btree *b;
177 
178 	b = kzalloc(sizeof(struct btree), gfp);
179 	if (!b)
180 		return NULL;
181 
182 	bkey_btree_ptr_init(&b->key);
183 	INIT_LIST_HEAD(&b->list);
184 	INIT_LIST_HEAD(&b->write_blocked);
185 	b->byte_order = ilog2(c->opts.btree_node_size);
186 	return b;
187 }
188 
189 struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
190 {
191 	struct btree_cache *bc = &c->btree_cache;
192 	struct btree *b;
193 
194 	b = __btree_node_mem_alloc(c, GFP_KERNEL);
195 	if (!b)
196 		return NULL;
197 
198 	if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
199 		kfree(b);
200 		return NULL;
201 	}
202 
203 	bch2_btree_lock_init(&b->c, 0);
204 
205 	__bch2_btree_node_to_freelist(bc, b);
206 	return b;
207 }
208 
209 static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b)
210 {
211 	struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
212 
213 	u64 mask = bc->pinned_nodes_mask[!!b->c.level];
214 
215 	return ((mask & BIT_ULL(b->c.btree_id)) &&
216 		bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
217 		bbpos_cmp(bc->pinned_nodes_end, pos) >= 0);
218 }
219 
220 void bch2_node_pin(struct bch_fs *c, struct btree *b)
221 {
222 	struct btree_cache *bc = &c->btree_cache;
223 
224 	mutex_lock(&bc->lock);
225 	BUG_ON(!__btree_node_pinned(bc, b));
226 	if (b != btree_node_root(c, b) && !btree_node_pinned(b)) {
227 		set_btree_node_pinned(b);
228 		list_move(&b->list, &bc->live[1].list);
229 		bc->live[0].nr--;
230 		bc->live[1].nr++;
231 	}
232 	mutex_unlock(&bc->lock);
233 }
234 
235 void bch2_btree_cache_unpin(struct bch_fs *c)
236 {
237 	struct btree_cache *bc = &c->btree_cache;
238 	struct btree *b, *n;
239 
240 	mutex_lock(&bc->lock);
241 	c->btree_cache.pinned_nodes_mask[0] = 0;
242 	c->btree_cache.pinned_nodes_mask[1] = 0;
243 
244 	list_for_each_entry_safe(b, n, &bc->live[1].list, list) {
245 		clear_btree_node_pinned(b);
246 		list_move(&b->list, &bc->live[0].list);
247 		bc->live[0].nr++;
248 		bc->live[1].nr--;
249 	}
250 
251 	mutex_unlock(&bc->lock);
252 }
253 
254 /* Btree in memory cache - hash table */
255 
256 void __bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
257 {
258 	lockdep_assert_held(&bc->lock);
259 
260 	int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
261 	BUG_ON(ret);
262 
263 	/* Cause future lookups for this node to fail: */
264 	b->hash_val = 0;
265 
266 	if (b->c.btree_id < BTREE_ID_NR)
267 		--bc->nr_by_btree[b->c.btree_id];
268 	--bc->live[btree_node_pinned(b)].nr;
269 	list_del_init(&b->list);
270 }
271 
272 void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
273 {
274 	__bch2_btree_node_hash_remove(bc, b);
275 	__bch2_btree_node_to_freelist(bc, b);
276 }
277 
278 int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
279 {
280 	BUG_ON(!list_empty(&b->list));
281 	BUG_ON(b->hash_val);
282 
283 	b->hash_val = btree_ptr_hash_val(&b->key);
284 	int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash,
285 						bch_btree_cache_params);
286 	if (ret)
287 		return ret;
288 
289 	if (b->c.btree_id < BTREE_ID_NR)
290 		bc->nr_by_btree[b->c.btree_id]++;
291 
292 	bool p = __btree_node_pinned(bc, b);
293 	mod_bit(BTREE_NODE_pinned, &b->flags, p);
294 
295 	list_add_tail(&b->list, &bc->live[p].list);
296 	bc->live[p].nr++;
297 	return 0;
298 }
299 
300 int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
301 				unsigned level, enum btree_id id)
302 {
303 	b->c.level	= level;
304 	b->c.btree_id	= id;
305 
306 	mutex_lock(&bc->lock);
307 	int ret = __bch2_btree_node_hash_insert(bc, b);
308 	mutex_unlock(&bc->lock);
309 
310 	return ret;
311 }
312 
313 void bch2_btree_node_update_key_early(struct btree_trans *trans,
314 				      enum btree_id btree, unsigned level,
315 				      struct bkey_s_c old, struct bkey_i *new)
316 {
317 	struct bch_fs *c = trans->c;
318 	struct btree *b;
319 	struct bkey_buf tmp;
320 	int ret;
321 
322 	bch2_bkey_buf_init(&tmp);
323 	bch2_bkey_buf_reassemble(&tmp, c, old);
324 
325 	b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
326 	if (!IS_ERR_OR_NULL(b)) {
327 		mutex_lock(&c->btree_cache.lock);
328 
329 		bch2_btree_node_hash_remove(&c->btree_cache, b);
330 
331 		bkey_copy(&b->key, new);
332 		ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
333 		BUG_ON(ret);
334 
335 		mutex_unlock(&c->btree_cache.lock);
336 		six_unlock_read(&b->c.lock);
337 	}
338 
339 	bch2_bkey_buf_exit(&tmp, c);
340 }
341 
342 __flatten
343 static inline struct btree *btree_cache_find(struct btree_cache *bc,
344 				     const struct bkey_i *k)
345 {
346 	u64 v = btree_ptr_hash_val(k);
347 
348 	return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
349 }
350 
351 /*
352  * this version is for btree nodes that have already been freed (we're not
353  * reaping a real btree node)
354  */
355 static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
356 {
357 	struct btree_cache *bc = &c->btree_cache;
358 	int ret = 0;
359 
360 	lockdep_assert_held(&bc->lock);
361 wait_on_io:
362 	if (b->flags & ((1U << BTREE_NODE_dirty)|
363 			(1U << BTREE_NODE_read_in_flight)|
364 			(1U << BTREE_NODE_write_in_flight))) {
365 		if (!flush) {
366 			if (btree_node_dirty(b))
367 				BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
368 			else if (btree_node_read_in_flight(b))
369 				BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
370 			else if (btree_node_write_in_flight(b))
371 				BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
372 			return -BCH_ERR_ENOMEM_btree_node_reclaim;
373 		}
374 
375 		/* XXX: waiting on IO with btree cache lock held */
376 		bch2_btree_node_wait_on_read(b);
377 		bch2_btree_node_wait_on_write(b);
378 	}
379 
380 	if (!six_trylock_intent(&b->c.lock)) {
381 		BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
382 		return -BCH_ERR_ENOMEM_btree_node_reclaim;
383 	}
384 
385 	if (!six_trylock_write(&b->c.lock)) {
386 		BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
387 		goto out_unlock_intent;
388 	}
389 
390 	/* recheck under lock */
391 	if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
392 			(1U << BTREE_NODE_write_in_flight))) {
393 		if (!flush) {
394 			if (btree_node_read_in_flight(b))
395 				BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
396 			else if (btree_node_write_in_flight(b))
397 				BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
398 			goto out_unlock;
399 		}
400 		six_unlock_write(&b->c.lock);
401 		six_unlock_intent(&b->c.lock);
402 		goto wait_on_io;
403 	}
404 
405 	if (btree_node_noevict(b)) {
406 		BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
407 		goto out_unlock;
408 	}
409 	if (btree_node_write_blocked(b)) {
410 		BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
411 		goto out_unlock;
412 	}
413 	if (btree_node_will_make_reachable(b)) {
414 		BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
415 		goto out_unlock;
416 	}
417 
418 	if (btree_node_dirty(b)) {
419 		if (!flush) {
420 			BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
421 			goto out_unlock;
422 		}
423 		/*
424 		 * Using the underscore version because we don't want to compact
425 		 * bsets after the write, since this node is about to be evicted
426 		 * - unless btree verify mode is enabled, since it runs out of
427 		 * the post write cleanup:
428 		 */
429 		if (bch2_verify_btree_ondisk)
430 			bch2_btree_node_write(c, b, SIX_LOCK_intent,
431 					      BTREE_WRITE_cache_reclaim);
432 		else
433 			__bch2_btree_node_write(c, b,
434 						BTREE_WRITE_cache_reclaim);
435 
436 		six_unlock_write(&b->c.lock);
437 		six_unlock_intent(&b->c.lock);
438 		goto wait_on_io;
439 	}
440 out:
441 	if (b->hash_val && !ret)
442 		trace_and_count(c, btree_cache_reap, c, b);
443 	return ret;
444 out_unlock:
445 	six_unlock_write(&b->c.lock);
446 out_unlock_intent:
447 	six_unlock_intent(&b->c.lock);
448 	ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
449 	goto out;
450 }
451 
452 static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
453 {
454 	return __btree_node_reclaim(c, b, false, shrinker_counter);
455 }
456 
457 static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
458 {
459 	return __btree_node_reclaim(c, b, true, false);
460 }
461 
462 static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
463 					   struct shrink_control *sc)
464 {
465 	struct btree_cache_list *list = shrink->private_data;
466 	struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
467 	struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
468 	struct btree *b, *t;
469 	unsigned long nr = sc->nr_to_scan;
470 	unsigned long can_free = 0;
471 	unsigned long freed = 0;
472 	unsigned long touched = 0;
473 	unsigned i, flags;
474 	unsigned long ret = SHRINK_STOP;
475 	bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4;
476 
477 	if (bch2_btree_shrinker_disabled)
478 		return SHRINK_STOP;
479 
480 	mutex_lock(&bc->lock);
481 	flags = memalloc_nofs_save();
482 
483 	/*
484 	 * It's _really_ critical that we don't free too many btree nodes - we
485 	 * have to always leave ourselves a reserve. The reserve is how we
486 	 * guarantee that allocating memory for a new btree node can always
487 	 * succeed, so that inserting keys into the btree can always succeed and
488 	 * IO can always make forward progress:
489 	 */
490 	can_free = btree_cache_can_free(list);
491 	nr = min_t(unsigned long, nr, can_free);
492 
493 	i = 0;
494 	list_for_each_entry_safe(b, t, &bc->freeable, list) {
495 		/*
496 		 * Leave a few nodes on the freeable list, so that a btree split
497 		 * won't have to hit the system allocator:
498 		 */
499 		if (++i <= 3)
500 			continue;
501 
502 		touched++;
503 
504 		if (touched >= nr)
505 			goto out;
506 
507 		if (!btree_node_reclaim(c, b, true)) {
508 			btree_node_data_free(bc, b);
509 			six_unlock_write(&b->c.lock);
510 			six_unlock_intent(&b->c.lock);
511 			freed++;
512 			bc->nr_freed++;
513 		}
514 	}
515 restart:
516 	list_for_each_entry_safe(b, t, &list->list, list) {
517 		touched++;
518 
519 		if (btree_node_accessed(b)) {
520 			clear_btree_node_accessed(b);
521 			bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++;
522 			--touched;;
523 		} else if (!btree_node_reclaim(c, b, true)) {
524 			__bch2_btree_node_hash_remove(bc, b);
525 			__btree_node_data_free(bc, b);
526 
527 			freed++;
528 			bc->nr_freed++;
529 
530 			six_unlock_write(&b->c.lock);
531 			six_unlock_intent(&b->c.lock);
532 
533 			if (freed == nr)
534 				goto out_rotate;
535 		} else if (trigger_writes &&
536 			   btree_node_dirty(b) &&
537 			   !btree_node_will_make_reachable(b) &&
538 			   !btree_node_write_blocked(b) &&
539 			   six_trylock_read(&b->c.lock)) {
540 			list_move(&list->list, &b->list);
541 			mutex_unlock(&bc->lock);
542 			__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
543 			six_unlock_read(&b->c.lock);
544 			if (touched >= nr)
545 				goto out_nounlock;
546 			mutex_lock(&bc->lock);
547 			goto restart;
548 		}
549 
550 		if (touched >= nr)
551 			break;
552 	}
553 out_rotate:
554 	if (&t->list != &list->list)
555 		list_move_tail(&list->list, &t->list);
556 out:
557 	mutex_unlock(&bc->lock);
558 out_nounlock:
559 	ret = freed;
560 	memalloc_nofs_restore(flags);
561 	trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
562 	return ret;
563 }
564 
565 static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
566 					    struct shrink_control *sc)
567 {
568 	struct btree_cache_list *list = shrink->private_data;
569 
570 	if (bch2_btree_shrinker_disabled)
571 		return 0;
572 
573 	return btree_cache_can_free(list);
574 }
575 
576 void bch2_fs_btree_cache_exit(struct bch_fs *c)
577 {
578 	struct btree_cache *bc = &c->btree_cache;
579 	struct btree *b, *t;
580 	unsigned long flags;
581 
582 	shrinker_free(bc->live[1].shrink);
583 	shrinker_free(bc->live[0].shrink);
584 
585 	/* vfree() can allocate memory: */
586 	flags = memalloc_nofs_save();
587 	mutex_lock(&bc->lock);
588 
589 	if (c->verify_data)
590 		list_move(&c->verify_data->list, &bc->live[0].list);
591 
592 	kvfree(c->verify_ondisk);
593 
594 	for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
595 		struct btree_root *r = bch2_btree_id_root(c, i);
596 
597 		if (r->b)
598 			list_add(&r->b->list, &bc->live[0].list);
599 	}
600 
601 	list_for_each_entry_safe(b, t, &bc->live[1].list, list)
602 		bch2_btree_node_hash_remove(bc, b);
603 	list_for_each_entry_safe(b, t, &bc->live[0].list, list)
604 		bch2_btree_node_hash_remove(bc, b);
605 
606 	list_for_each_entry_safe(b, t, &bc->freeable, list) {
607 		BUG_ON(btree_node_read_in_flight(b) ||
608 		       btree_node_write_in_flight(b));
609 
610 		btree_node_data_free(bc, b);
611 	}
612 
613 	BUG_ON(!bch2_journal_error(&c->journal) &&
614 	       atomic_long_read(&c->btree_cache.nr_dirty));
615 
616 	list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
617 
618 	list_for_each_entry_safe(b, t, &bc->freed_nonpcpu, list) {
619 		list_del(&b->list);
620 		six_lock_exit(&b->c.lock);
621 		kfree(b);
622 	}
623 
624 	mutex_unlock(&bc->lock);
625 	memalloc_nofs_restore(flags);
626 
627 	for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
628 		BUG_ON(bc->nr_by_btree[i]);
629 	BUG_ON(bc->live[0].nr);
630 	BUG_ON(bc->live[1].nr);
631 	BUG_ON(bc->nr_freeable);
632 
633 	if (bc->table_init_done)
634 		rhashtable_destroy(&bc->table);
635 }
636 
637 int bch2_fs_btree_cache_init(struct bch_fs *c)
638 {
639 	struct btree_cache *bc = &c->btree_cache;
640 	struct shrinker *shrink;
641 	unsigned i;
642 	int ret = 0;
643 
644 	ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
645 	if (ret)
646 		goto err;
647 
648 	bc->table_init_done = true;
649 
650 	bch2_recalc_btree_reserve(c);
651 
652 	for (i = 0; i < bc->nr_reserve; i++)
653 		if (!__bch2_btree_node_mem_alloc(c))
654 			goto err;
655 
656 	list_splice_init(&bc->live[0].list, &bc->freeable);
657 
658 	mutex_init(&c->verify_lock);
659 
660 	shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
661 	if (!shrink)
662 		goto err;
663 	bc->live[0].shrink	= shrink;
664 	shrink->count_objects	= bch2_btree_cache_count;
665 	shrink->scan_objects	= bch2_btree_cache_scan;
666 	shrink->seeks		= 2;
667 	shrink->private_data	= &bc->live[0];
668 	shrinker_register(shrink);
669 
670 	shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name);
671 	if (!shrink)
672 		goto err;
673 	bc->live[1].shrink	= shrink;
674 	shrink->count_objects	= bch2_btree_cache_count;
675 	shrink->scan_objects	= bch2_btree_cache_scan;
676 	shrink->seeks		= 8;
677 	shrink->private_data	= &bc->live[1];
678 	shrinker_register(shrink);
679 
680 	return 0;
681 err:
682 	return -BCH_ERR_ENOMEM_fs_btree_cache_init;
683 }
684 
685 void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
686 {
687 	mutex_init(&bc->lock);
688 	for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) {
689 		bc->live[i].idx = i;
690 		INIT_LIST_HEAD(&bc->live[i].list);
691 	}
692 	INIT_LIST_HEAD(&bc->freeable);
693 	INIT_LIST_HEAD(&bc->freed_pcpu);
694 	INIT_LIST_HEAD(&bc->freed_nonpcpu);
695 }
696 
697 /*
698  * We can only have one thread cannibalizing other cached btree nodes at a time,
699  * or we'll deadlock. We use an open coded mutex to ensure that, which a
700  * cannibalize_bucket() will take. This means every time we unlock the root of
701  * the btree, we need to release this lock if we have it held.
702  */
703 void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans)
704 {
705 	struct bch_fs *c = trans->c;
706 	struct btree_cache *bc = &c->btree_cache;
707 
708 	if (bc->alloc_lock == current) {
709 		trace_and_count(c, btree_cache_cannibalize_unlock, trans);
710 		bc->alloc_lock = NULL;
711 		closure_wake_up(&bc->alloc_wait);
712 	}
713 }
714 
715 int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl)
716 {
717 	struct bch_fs *c = trans->c;
718 	struct btree_cache *bc = &c->btree_cache;
719 	struct task_struct *old;
720 
721 	old = NULL;
722 	if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current)
723 		goto success;
724 
725 	if (!cl) {
726 		trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
727 		return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
728 	}
729 
730 	closure_wait(&bc->alloc_wait, cl);
731 
732 	/* Try again, after adding ourselves to waitlist */
733 	old = NULL;
734 	if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) {
735 		/* We raced */
736 		closure_wake_up(&bc->alloc_wait);
737 		goto success;
738 	}
739 
740 	trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
741 	return -BCH_ERR_btree_cache_cannibalize_lock_blocked;
742 
743 success:
744 	trace_and_count(c, btree_cache_cannibalize_lock, trans);
745 	return 0;
746 }
747 
748 static struct btree *btree_node_cannibalize(struct bch_fs *c)
749 {
750 	struct btree_cache *bc = &c->btree_cache;
751 	struct btree *b;
752 
753 	for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
754 		list_for_each_entry_reverse(b, &bc->live[i].list, list)
755 			if (!btree_node_reclaim(c, b, false))
756 				return b;
757 
758 	while (1) {
759 		for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
760 			list_for_each_entry_reverse(b, &bc->live[i].list, list)
761 				if (!btree_node_write_and_reclaim(c, b))
762 					return b;
763 
764 		/*
765 		 * Rare case: all nodes were intent-locked.
766 		 * Just busy-wait.
767 		 */
768 		WARN_ONCE(1, "btree cache cannibalize failed\n");
769 		cond_resched();
770 	}
771 }
772 
773 struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
774 {
775 	struct bch_fs *c = trans->c;
776 	struct btree_cache *bc = &c->btree_cache;
777 	struct list_head *freed = pcpu_read_locks
778 		? &bc->freed_pcpu
779 		: &bc->freed_nonpcpu;
780 	struct btree *b, *b2;
781 	u64 start_time = local_clock();
782 
783 	mutex_lock(&bc->lock);
784 
785 	/*
786 	 * We never free struct btree itself, just the memory that holds the on
787 	 * disk node. Check the freed list before allocating a new one:
788 	 */
789 	list_for_each_entry(b, freed, list)
790 		if (!btree_node_reclaim(c, b, false)) {
791 			list_del_init(&b->list);
792 			goto got_node;
793 		}
794 
795 	b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
796 	if (!b) {
797 		mutex_unlock(&bc->lock);
798 		bch2_trans_unlock(trans);
799 		b = __btree_node_mem_alloc(c, GFP_KERNEL);
800 		if (!b)
801 			goto err;
802 		mutex_lock(&bc->lock);
803 	}
804 
805 	bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);
806 
807 	BUG_ON(!six_trylock_intent(&b->c.lock));
808 	BUG_ON(!six_trylock_write(&b->c.lock));
809 
810 got_node:
811 	/*
812 	 * btree_free() doesn't free memory; it sticks the node on the end of
813 	 * the list. Check if there's any freed nodes there:
814 	 */
815 	list_for_each_entry(b2, &bc->freeable, list)
816 		if (!btree_node_reclaim(c, b2, false)) {
817 			swap(b->data, b2->data);
818 			swap(b->aux_data, b2->aux_data);
819 
820 			list_del_init(&b2->list);
821 			--bc->nr_freeable;
822 			btree_node_to_freedlist(bc, b2);
823 			mutex_unlock(&bc->lock);
824 
825 			six_unlock_write(&b2->c.lock);
826 			six_unlock_intent(&b2->c.lock);
827 			goto got_mem;
828 		}
829 
830 	mutex_unlock(&bc->lock);
831 
832 	if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
833 		bch2_trans_unlock(trans);
834 		if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
835 			goto err;
836 	}
837 
838 got_mem:
839 	BUG_ON(!list_empty(&b->list));
840 	BUG_ON(btree_node_hashed(b));
841 	BUG_ON(btree_node_dirty(b));
842 	BUG_ON(btree_node_write_in_flight(b));
843 out:
844 	b->flags		= 0;
845 	b->written		= 0;
846 	b->nsets		= 0;
847 	b->sib_u64s[0]		= 0;
848 	b->sib_u64s[1]		= 0;
849 	b->whiteout_u64s	= 0;
850 	bch2_btree_keys_init(b);
851 	set_btree_node_accessed(b);
852 
853 	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
854 			       start_time);
855 
856 	int ret = bch2_trans_relock(trans);
857 	if (unlikely(ret)) {
858 		bch2_btree_node_to_freelist(c, b);
859 		return ERR_PTR(ret);
860 	}
861 
862 	return b;
863 err:
864 	mutex_lock(&bc->lock);
865 
866 	/* Try to cannibalize another cached btree node: */
867 	if (bc->alloc_lock == current) {
868 		b2 = btree_node_cannibalize(c);
869 		clear_btree_node_just_written(b2);
870 		__bch2_btree_node_hash_remove(bc, b2);
871 
872 		if (b) {
873 			swap(b->data, b2->data);
874 			swap(b->aux_data, b2->aux_data);
875 			btree_node_to_freedlist(bc, b2);
876 			six_unlock_write(&b2->c.lock);
877 			six_unlock_intent(&b2->c.lock);
878 		} else {
879 			b = b2;
880 		}
881 
882 		BUG_ON(!list_empty(&b->list));
883 		mutex_unlock(&bc->lock);
884 
885 		trace_and_count(c, btree_cache_cannibalize, trans);
886 		goto out;
887 	}
888 
889 	mutex_unlock(&bc->lock);
890 	return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
891 }
892 
893 /* Slowpath, don't want it inlined into btree_iter_traverse() */
894 static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
895 				struct btree_path *path,
896 				const struct bkey_i *k,
897 				enum btree_id btree_id,
898 				unsigned level,
899 				enum six_lock_type lock_type,
900 				bool sync)
901 {
902 	struct bch_fs *c = trans->c;
903 	struct btree_cache *bc = &c->btree_cache;
904 	struct btree *b;
905 
906 	if (unlikely(level >= BTREE_MAX_DEPTH)) {
907 		int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u",
908 						 level, BTREE_MAX_DEPTH);
909 		return ERR_PTR(ret);
910 	}
911 
912 	if (unlikely(!bkey_is_btree_ptr(&k->k))) {
913 		struct printbuf buf = PRINTBUF;
914 		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
915 
916 		int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf);
917 		printbuf_exit(&buf);
918 		return ERR_PTR(ret);
919 	}
920 
921 	if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) {
922 		struct printbuf buf = PRINTBUF;
923 		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
924 
925 		int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf);
926 		printbuf_exit(&buf);
927 		return ERR_PTR(ret);
928 	}
929 
930 	/*
931 	 * Parent node must be locked, else we could read in a btree node that's
932 	 * been freed:
933 	 */
934 	if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
935 		trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
936 		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
937 	}
938 
939 	b = bch2_btree_node_mem_alloc(trans, level != 0);
940 
941 	if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
942 		if (!path)
943 			return b;
944 
945 		trans->memory_allocation_failure = true;
946 		trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
947 		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
948 	}
949 
950 	if (IS_ERR(b))
951 		return b;
952 
953 	bkey_copy(&b->key, k);
954 	if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
955 		/* raced with another fill: */
956 
957 		/* mark as unhashed... */
958 		b->hash_val = 0;
959 
960 		mutex_lock(&bc->lock);
961 		__bch2_btree_node_to_freelist(bc, b);
962 		mutex_unlock(&bc->lock);
963 
964 		six_unlock_write(&b->c.lock);
965 		six_unlock_intent(&b->c.lock);
966 		return NULL;
967 	}
968 
969 	set_btree_node_read_in_flight(b);
970 	six_unlock_write(&b->c.lock);
971 
972 	if (path) {
973 		u32 seq = six_lock_seq(&b->c.lock);
974 
975 		/* Unlock before doing IO: */
976 		six_unlock_intent(&b->c.lock);
977 		bch2_trans_unlock_noassert(trans);
978 
979 		bch2_btree_node_read(trans, b, sync);
980 
981 		int ret = bch2_trans_relock(trans);
982 		if (ret)
983 			return ERR_PTR(ret);
984 
985 		if (!sync)
986 			return NULL;
987 
988 		if (!six_relock_type(&b->c.lock, lock_type, seq))
989 			b = NULL;
990 	} else {
991 		bch2_btree_node_read(trans, b, sync);
992 		if (lock_type == SIX_LOCK_read)
993 			six_lock_downgrade(&b->c.lock);
994 	}
995 
996 	return b;
997 }
998 
999 static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
1000 {
1001 	struct printbuf buf = PRINTBUF;
1002 
1003 	if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
1004 		return;
1005 
1006 	prt_printf(&buf,
1007 	       "btree node header doesn't match ptr\n"
1008 	       "btree %s level %u\n"
1009 	       "ptr: ",
1010 	       bch2_btree_id_str(b->c.btree_id), b->c.level);
1011 	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
1012 
1013 	prt_printf(&buf, "\nheader: btree %s level %llu\n"
1014 	       "min ",
1015 	       bch2_btree_id_str(BTREE_NODE_ID(b->data)),
1016 	       BTREE_NODE_LEVEL(b->data));
1017 	bch2_bpos_to_text(&buf, b->data->min_key);
1018 
1019 	prt_printf(&buf, "\nmax ");
1020 	bch2_bpos_to_text(&buf, b->data->max_key);
1021 
1022 	bch2_fs_topology_error(c, "%s", buf.buf);
1023 
1024 	printbuf_exit(&buf);
1025 }
1026 
1027 static inline void btree_check_header(struct bch_fs *c, struct btree *b)
1028 {
1029 	if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
1030 	    b->c.level != BTREE_NODE_LEVEL(b->data) ||
1031 	    !bpos_eq(b->data->max_key, b->key.k.p) ||
1032 	    (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
1033 	     !bpos_eq(b->data->min_key,
1034 		      bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
1035 		btree_bad_header(c, b);
1036 }
1037 
1038 static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
1039 					   const struct bkey_i *k, unsigned level,
1040 					   enum six_lock_type lock_type,
1041 					   unsigned long trace_ip)
1042 {
1043 	struct bch_fs *c = trans->c;
1044 	struct btree_cache *bc = &c->btree_cache;
1045 	struct btree *b;
1046 	bool need_relock = false;
1047 	int ret;
1048 
1049 	EBUG_ON(level >= BTREE_MAX_DEPTH);
1050 retry:
1051 	b = btree_cache_find(bc, k);
1052 	if (unlikely(!b)) {
1053 		/*
1054 		 * We must have the parent locked to call bch2_btree_node_fill(),
1055 		 * else we could read in a btree node from disk that's been
1056 		 * freed:
1057 		 */
1058 		b = bch2_btree_node_fill(trans, path, k, path->btree_id,
1059 					 level, lock_type, true);
1060 		need_relock = true;
1061 
1062 		/* We raced and found the btree node in the cache */
1063 		if (!b)
1064 			goto retry;
1065 
1066 		if (IS_ERR(b))
1067 			return b;
1068 	} else {
1069 		if (btree_node_read_locked(path, level + 1))
1070 			btree_node_unlock(trans, path, level + 1);
1071 
1072 		ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
1073 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1074 			return ERR_PTR(ret);
1075 
1076 		BUG_ON(ret);
1077 
1078 		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
1079 			     b->c.level != level ||
1080 			     race_fault())) {
1081 			six_unlock_type(&b->c.lock, lock_type);
1082 			if (bch2_btree_node_relock(trans, path, level + 1))
1083 				goto retry;
1084 
1085 			trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
1086 			return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
1087 		}
1088 
1089 		/* avoid atomic set bit if it's not needed: */
1090 		if (!btree_node_accessed(b))
1091 			set_btree_node_accessed(b);
1092 	}
1093 
1094 	if (unlikely(btree_node_read_in_flight(b))) {
1095 		u32 seq = six_lock_seq(&b->c.lock);
1096 
1097 		six_unlock_type(&b->c.lock, lock_type);
1098 		bch2_trans_unlock(trans);
1099 		need_relock = true;
1100 
1101 		bch2_btree_node_wait_on_read(b);
1102 
1103 		ret = bch2_trans_relock(trans);
1104 		if (ret)
1105 			return ERR_PTR(ret);
1106 
1107 		/*
1108 		 * should_be_locked is not set on this path yet, so we need to
1109 		 * relock it specifically:
1110 		 */
1111 		if (!six_relock_type(&b->c.lock, lock_type, seq))
1112 			goto retry;
1113 	}
1114 
1115 	if (unlikely(need_relock)) {
1116 		ret = bch2_trans_relock(trans) ?:
1117 			bch2_btree_path_relock_intent(trans, path);
1118 		if (ret) {
1119 			six_unlock_type(&b->c.lock, lock_type);
1120 			return ERR_PTR(ret);
1121 		}
1122 	}
1123 
1124 	prefetch(b->aux_data);
1125 
1126 	for_each_bset(b, t) {
1127 		void *p = (u64 *) b->aux_data + t->aux_data_offset;
1128 
1129 		prefetch(p + L1_CACHE_BYTES * 0);
1130 		prefetch(p + L1_CACHE_BYTES * 1);
1131 		prefetch(p + L1_CACHE_BYTES * 2);
1132 	}
1133 
1134 	if (unlikely(btree_node_read_error(b))) {
1135 		six_unlock_type(&b->c.lock, lock_type);
1136 		return ERR_PTR(-BCH_ERR_btree_node_read_error);
1137 	}
1138 
1139 	EBUG_ON(b->c.btree_id != path->btree_id);
1140 	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
1141 	btree_check_header(c, b);
1142 
1143 	return b;
1144 }
1145 
1146 /**
1147  * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
1148  * in from disk if necessary.
1149  *
1150  * @trans:	btree transaction object
1151  * @path:	btree_path being traversed
1152  * @k:		pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
1153  * @level:	level of btree node being looked up (0 == leaf node)
1154  * @lock_type:	SIX_LOCK_read or SIX_LOCK_intent
1155  * @trace_ip:	ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
1156  *
1157  * The btree node will have either a read or a write lock held, depending on
1158  * the @write parameter.
1159  *
1160  * Returns: btree node or ERR_PTR()
1161  */
1162 struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
1163 				  const struct bkey_i *k, unsigned level,
1164 				  enum six_lock_type lock_type,
1165 				  unsigned long trace_ip)
1166 {
1167 	struct bch_fs *c = trans->c;
1168 	struct btree *b;
1169 	int ret;
1170 
1171 	EBUG_ON(level >= BTREE_MAX_DEPTH);
1172 
1173 	b = btree_node_mem_ptr(k);
1174 
1175 	/*
1176 	 * Check b->hash_val _before_ calling btree_node_lock() - this might not
1177 	 * be the node we want anymore, and trying to lock the wrong node could
1178 	 * cause an unneccessary transaction restart:
1179 	 */
1180 	if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
1181 		     !b ||
1182 		     b->hash_val != btree_ptr_hash_val(k)))
1183 		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
1184 
1185 	if (btree_node_read_locked(path, level + 1))
1186 		btree_node_unlock(trans, path, level + 1);
1187 
1188 	ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
1189 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1190 		return ERR_PTR(ret);
1191 
1192 	BUG_ON(ret);
1193 
1194 	if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
1195 		     b->c.level != level ||
1196 		     race_fault())) {
1197 		six_unlock_type(&b->c.lock, lock_type);
1198 		if (bch2_btree_node_relock(trans, path, level + 1))
1199 			return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
1200 
1201 		trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
1202 		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
1203 	}
1204 
1205 	if (unlikely(btree_node_read_in_flight(b))) {
1206 		six_unlock_type(&b->c.lock, lock_type);
1207 		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
1208 	}
1209 
1210 	prefetch(b->aux_data);
1211 
1212 	for_each_bset(b, t) {
1213 		void *p = (u64 *) b->aux_data + t->aux_data_offset;
1214 
1215 		prefetch(p + L1_CACHE_BYTES * 0);
1216 		prefetch(p + L1_CACHE_BYTES * 1);
1217 		prefetch(p + L1_CACHE_BYTES * 2);
1218 	}
1219 
1220 	/* avoid atomic set bit if it's not needed: */
1221 	if (!btree_node_accessed(b))
1222 		set_btree_node_accessed(b);
1223 
1224 	if (unlikely(btree_node_read_error(b))) {
1225 		six_unlock_type(&b->c.lock, lock_type);
1226 		return ERR_PTR(-BCH_ERR_btree_node_read_error);
1227 	}
1228 
1229 	EBUG_ON(b->c.btree_id != path->btree_id);
1230 	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
1231 	btree_check_header(c, b);
1232 
1233 	return b;
1234 }
1235 
1236 struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
1237 					 const struct bkey_i *k,
1238 					 enum btree_id btree_id,
1239 					 unsigned level,
1240 					 bool nofill)
1241 {
1242 	struct bch_fs *c = trans->c;
1243 	struct btree_cache *bc = &c->btree_cache;
1244 	struct btree *b;
1245 	int ret;
1246 
1247 	EBUG_ON(level >= BTREE_MAX_DEPTH);
1248 
1249 	if (c->opts.btree_node_mem_ptr_optimization) {
1250 		b = btree_node_mem_ptr(k);
1251 		if (b)
1252 			goto lock_node;
1253 	}
1254 retry:
1255 	b = btree_cache_find(bc, k);
1256 	if (unlikely(!b)) {
1257 		if (nofill)
1258 			goto out;
1259 
1260 		b = bch2_btree_node_fill(trans, NULL, k, btree_id,
1261 					 level, SIX_LOCK_read, true);
1262 
1263 		/* We raced and found the btree node in the cache */
1264 		if (!b)
1265 			goto retry;
1266 
1267 		if (IS_ERR(b) &&
1268 		    !bch2_btree_cache_cannibalize_lock(trans, NULL))
1269 			goto retry;
1270 
1271 		if (IS_ERR(b))
1272 			goto out;
1273 	} else {
1274 lock_node:
1275 		ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
1276 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1277 			return ERR_PTR(ret);
1278 
1279 		BUG_ON(ret);
1280 
1281 		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
1282 			     b->c.btree_id != btree_id ||
1283 			     b->c.level != level)) {
1284 			six_unlock_read(&b->c.lock);
1285 			goto retry;
1286 		}
1287 	}
1288 
1289 	/* XXX: waiting on IO with btree locks held: */
1290 	__bch2_btree_node_wait_on_read(b);
1291 
1292 	prefetch(b->aux_data);
1293 
1294 	for_each_bset(b, t) {
1295 		void *p = (u64 *) b->aux_data + t->aux_data_offset;
1296 
1297 		prefetch(p + L1_CACHE_BYTES * 0);
1298 		prefetch(p + L1_CACHE_BYTES * 1);
1299 		prefetch(p + L1_CACHE_BYTES * 2);
1300 	}
1301 
1302 	/* avoid atomic set bit if it's not needed: */
1303 	if (!btree_node_accessed(b))
1304 		set_btree_node_accessed(b);
1305 
1306 	if (unlikely(btree_node_read_error(b))) {
1307 		six_unlock_read(&b->c.lock);
1308 		b = ERR_PTR(-BCH_ERR_btree_node_read_error);
1309 		goto out;
1310 	}
1311 
1312 	EBUG_ON(b->c.btree_id != btree_id);
1313 	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
1314 	btree_check_header(c, b);
1315 out:
1316 	bch2_btree_cache_cannibalize_unlock(trans);
1317 	return b;
1318 }
1319 
1320 int bch2_btree_node_prefetch(struct btree_trans *trans,
1321 			     struct btree_path *path,
1322 			     const struct bkey_i *k,
1323 			     enum btree_id btree_id, unsigned level)
1324 {
1325 	struct bch_fs *c = trans->c;
1326 	struct btree_cache *bc = &c->btree_cache;
1327 
1328 	BUG_ON(path && !btree_node_locked(path, level + 1));
1329 	BUG_ON(level >= BTREE_MAX_DEPTH);
1330 
1331 	struct btree *b = btree_cache_find(bc, k);
1332 	if (b)
1333 		return 0;
1334 
1335 	b = bch2_btree_node_fill(trans, path, k, btree_id,
1336 				 level, SIX_LOCK_read, false);
1337 	int ret = PTR_ERR_OR_ZERO(b);
1338 	if (ret)
1339 		return ret;
1340 	if (b)
1341 		six_unlock_read(&b->c.lock);
1342 	return 0;
1343 }
1344 
1345 void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
1346 {
1347 	struct bch_fs *c = trans->c;
1348 	struct btree_cache *bc = &c->btree_cache;
1349 	struct btree *b;
1350 
1351 	b = btree_cache_find(bc, k);
1352 	if (!b)
1353 		return;
1354 
1355 	BUG_ON(b == btree_node_root(trans->c, b));
1356 wait_on_io:
1357 	/* not allowed to wait on io with btree locks held: */
1358 
1359 	/* XXX we're called from btree_gc which will be holding other btree
1360 	 * nodes locked
1361 	 */
1362 	__bch2_btree_node_wait_on_read(b);
1363 	__bch2_btree_node_wait_on_write(b);
1364 
1365 	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
1366 	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
1367 	if (unlikely(b->hash_val != btree_ptr_hash_val(k)))
1368 		goto out;
1369 
1370 	if (btree_node_dirty(b)) {
1371 		__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
1372 		six_unlock_write(&b->c.lock);
1373 		six_unlock_intent(&b->c.lock);
1374 		goto wait_on_io;
1375 	}
1376 
1377 	BUG_ON(btree_node_dirty(b));
1378 
1379 	mutex_lock(&bc->lock);
1380 	bch2_btree_node_hash_remove(bc, b);
1381 	btree_node_data_free(bc, b);
1382 	mutex_unlock(&bc->lock);
1383 out:
1384 	six_unlock_write(&b->c.lock);
1385 	six_unlock_intent(&b->c.lock);
1386 }
1387 
1388 const char *bch2_btree_id_str(enum btree_id btree)
1389 {
1390 	return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)";
1391 }
1392 
1393 void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree)
1394 {
1395 	if (btree < BTREE_ID_NR)
1396 		prt_str(out, __bch2_btree_ids[btree]);
1397 	else
1398 		prt_printf(out, "(unknown btree %u)", btree);
1399 }
1400 
1401 void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
1402 {
1403 	prt_printf(out, "%s level %u/%u\n  ",
1404 	       bch2_btree_id_str(b->c.btree_id),
1405 	       b->c.level,
1406 	       bch2_btree_id_root(c, b->c.btree_id)->level);
1407 	bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
1408 }
1409 
1410 void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
1411 {
1412 	struct bset_stats stats;
1413 
1414 	memset(&stats, 0, sizeof(stats));
1415 
1416 	bch2_btree_keys_stats(b, &stats);
1417 
1418 	prt_printf(out, "l %u ", b->c.level);
1419 	bch2_bpos_to_text(out, b->data->min_key);
1420 	prt_printf(out, " - ");
1421 	bch2_bpos_to_text(out, b->data->max_key);
1422 	prt_printf(out, ":\n"
1423 	       "    ptrs: ");
1424 	bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
1425 	prt_newline(out);
1426 
1427 	prt_printf(out,
1428 	       "    format: ");
1429 	bch2_bkey_format_to_text(out, &b->format);
1430 
1431 	prt_printf(out,
1432 	       "    unpack fn len: %u\n"
1433 	       "    bytes used %zu/%zu (%zu%% full)\n"
1434 	       "    sib u64s: %u, %u (merge threshold %u)\n"
1435 	       "    nr packed keys %u\n"
1436 	       "    nr unpacked keys %u\n"
1437 	       "    floats %zu\n"
1438 	       "    failed unpacked %zu\n",
1439 	       b->unpack_fn_len,
1440 	       b->nr.live_u64s * sizeof(u64),
1441 	       btree_buf_bytes(b) - sizeof(struct btree_node),
1442 	       b->nr.live_u64s * 100 / btree_max_u64s(c),
1443 	       b->sib_u64s[0],
1444 	       b->sib_u64s[1],
1445 	       c->btree_foreground_merge_threshold,
1446 	       b->nr.packed_keys,
1447 	       b->nr.unpacked_keys,
1448 	       stats.floats,
1449 	       stats.failed);
1450 }
1451 
1452 static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c,
1453 				 const char *label, size_t nr)
1454 {
1455 	prt_printf(out, "%s\t", label);
1456 	prt_human_readable_u64(out, nr * c->opts.btree_node_size);
1457 	prt_printf(out, " (%zu)\n", nr);
1458 }
1459 
1460 static const char * const bch2_btree_cache_not_freed_reasons_strs[] = {
1461 #define x(n) #n,
1462 	BCH_BTREE_CACHE_NOT_FREED_REASONS()
1463 #undef x
1464 	NULL
1465 };
1466 
1467 void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc)
1468 {
1469 	struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
1470 
1471 	if (!out->nr_tabstops)
1472 		printbuf_tabstop_push(out, 32);
1473 
1474 	prt_btree_cache_line(out, c, "live:",		bc->live[0].nr);
1475 	prt_btree_cache_line(out, c, "pinned:",		bc->live[1].nr);
1476 	prt_btree_cache_line(out, c, "freeable:",	bc->nr_freeable);
1477 	prt_btree_cache_line(out, c, "dirty:",		atomic_long_read(&bc->nr_dirty));
1478 	prt_printf(out, "cannibalize lock:\t%p\n",	bc->alloc_lock);
1479 	prt_newline(out);
1480 
1481 	for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
1482 		prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->nr_by_btree[i]);
1483 
1484 	prt_newline(out);
1485 	prt_printf(out, "freed:\t%zu\n", bc->nr_freed);
1486 	prt_printf(out, "not freed:\n");
1487 
1488 	for (unsigned i = 0; i < ARRAY_SIZE(bc->not_freed); i++)
1489 		prt_printf(out, "  %s\t%llu\n",
1490 			   bch2_btree_cache_not_freed_reasons_strs[i], bc->not_freed[i]);
1491 }
1492