1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "bcachefs.h"
4 #include "bkey_methods.h"
5 #include "bkey_buf.h"
6 #include "btree_cache.h"
7 #include "btree_iter.h"
8 #include "btree_journal_iter.h"
9 #include "btree_key_cache.h"
10 #include "btree_locking.h"
11 #include "btree_update.h"
12 #include "debug.h"
13 #include "error.h"
14 #include "extents.h"
15 #include "journal.h"
16 #include "replicas.h"
17 #include "snapshot.h"
18 #include "trace.h"
19 
20 #include <linux/random.h>
21 #include <linux/prefetch.h>
22 
23 static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
24 static inline void btree_path_list_add(struct btree_trans *, struct btree_path *,
25 				       struct btree_path *);
26 
27 static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
28 {
29 #ifdef TRACK_PATH_ALLOCATED
30 	return iter->ip_allocated;
31 #else
32 	return 0;
33 #endif
34 }
35 
36 static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *);
37 
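/*
 * Compare a path against a (btree_id, cached, pos, level) tuple: this defines
 * the order paths are kept sorted in within a transaction (see
 * btree_trans_sort_paths()), which must match the lock ordering used by
 * __bch2_btree_node_lock(). Note that level compares in reverse, so at equal
 * positions nodes closer to the root sort first.
 */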
38 static inline int __btree_path_cmp(const struct btree_path *l,
39 				   enum btree_id	r_btree_id,
40 				   bool			r_cached,
41 				   struct bpos		r_pos,
42 				   unsigned		r_level)
43 {
44 	/*
45 	 * Must match lock ordering as defined by __bch2_btree_node_lock:
46 	 */
47 	return   cmp_int(l->btree_id,	r_btree_id) ?:
48 		 cmp_int((int) l->cached,	(int) r_cached) ?:
49 		 bpos_cmp(l->pos,	r_pos) ?:
50 		-cmp_int(l->level,	r_level);
51 }
52 
53 static inline int btree_path_cmp(const struct btree_path *l,
54 				 const struct btree_path *r)
55 {
56 	return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level);
57 }
58 
59 static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
60 {
61 	/* Are we iterating over keys in all snapshots? */
62 	if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
63 		p = bpos_successor(p);
64 	} else {
65 		p = bpos_nosnap_successor(p);
66 		p.snapshot = iter->snapshot;
67 	}
68 
69 	return p;
70 }
71 
72 static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
73 {
74 	/* Are we iterating over keys in all snapshots? */
75 	if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
76 		p = bpos_predecessor(p);
77 	} else {
78 		p = bpos_nosnap_predecessor(p);
79 		p.snapshot = iter->snapshot;
80 	}
81 
82 	return p;
83 }
84 
85 static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
86 {
87 	struct bpos pos = iter->pos;
88 
89 	if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
90 	    !bkey_eq(pos, POS_MAX))
91 		pos = bkey_successor(iter, pos);
92 	return pos;
93 }
94 
95 static inline bool btree_path_pos_before_node(struct btree_path *path,
96 					      struct btree *b)
97 {
98 	return bpos_lt(path->pos, b->data->min_key);
99 }
100 
101 static inline bool btree_path_pos_after_node(struct btree_path *path,
102 					     struct btree *b)
103 {
104 	return bpos_gt(path->pos, b->key.k.p);
105 }
106 
107 static inline bool btree_path_pos_in_node(struct btree_path *path,
108 					  struct btree *b)
109 {
110 	return path->btree_id == b->c.btree_id &&
111 		!btree_path_pos_before_node(path, b) &&
112 		!btree_path_pos_after_node(path, b);
113 }
114 
115 /* Btree iterator: */
116 
117 #ifdef CONFIG_BCACHEFS_DEBUG
118 
119 static void bch2_btree_path_verify_cached(struct btree_trans *trans,
120 					  struct btree_path *path)
121 {
122 	struct bkey_cached *ck;
123 	bool locked = btree_node_locked(path, 0);
124 
125 	if (!bch2_btree_node_relock(trans, path, 0))
126 		return;
127 
128 	ck = (void *) path->l[0].b;
129 	BUG_ON(ck->key.btree_id != path->btree_id ||
130 	       !bkey_eq(ck->key.pos, path->pos));
131 
132 	if (!locked)
133 		btree_node_unlock(trans, path, 0);
134 }
135 
136 static void bch2_btree_path_verify_level(struct btree_trans *trans,
137 				struct btree_path *path, unsigned level)
138 {
139 	struct btree_path_level *l;
140 	struct btree_node_iter tmp;
141 	bool locked;
142 	struct bkey_packed *p, *k;
143 	struct printbuf buf1 = PRINTBUF;
144 	struct printbuf buf2 = PRINTBUF;
145 	struct printbuf buf3 = PRINTBUF;
146 	const char *msg;
147 
148 	if (!bch2_debug_check_iterators)
149 		return;
150 
151 	l	= &path->l[level];
152 	tmp	= l->iter;
153 	locked	= btree_node_locked(path, level);
154 
155 	if (path->cached) {
156 		if (!level)
157 			bch2_btree_path_verify_cached(trans, path);
158 		return;
159 	}
160 
161 	if (!btree_path_node(path, level))
162 		return;
163 
164 	if (!bch2_btree_node_relock_notrace(trans, path, level))
165 		return;
166 
167 	BUG_ON(!btree_path_pos_in_node(path, l->b));
168 
169 	bch2_btree_node_iter_verify(&l->iter, l->b);
170 
171 	/*
172 	 * For interior nodes, the iterator will have skipped past deleted keys:
173 	 */
174 	p = level
175 		? bch2_btree_node_iter_prev(&tmp, l->b)
176 		: bch2_btree_node_iter_prev_all(&tmp, l->b);
177 	k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
178 
179 	if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) {
180 		msg = "before";
181 		goto err;
182 	}
183 
184 	if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
185 		msg = "after";
186 		goto err;
187 	}
188 
189 	if (!locked)
190 		btree_node_unlock(trans, path, level);
191 	return;
192 err:
193 	bch2_bpos_to_text(&buf1, path->pos);
194 
195 	if (p) {
196 		struct bkey uk = bkey_unpack_key(l->b, p);
197 
198 		bch2_bkey_to_text(&buf2, &uk);
199 	} else {
200 		prt_printf(&buf2, "(none)");
201 	}
202 
203 	if (k) {
204 		struct bkey uk = bkey_unpack_key(l->b, k);
205 
206 		bch2_bkey_to_text(&buf3, &uk);
207 	} else {
208 		prt_printf(&buf3, "(none)");
209 	}
210 
211 	panic("path should be %s key at level %u:\n"
212 	      "path pos %s\n"
213 	      "prev key %s\n"
214 	      "cur  key %s\n",
215 	      msg, level, buf1.buf, buf2.buf, buf3.buf);
216 }
217 
218 static void bch2_btree_path_verify(struct btree_trans *trans,
219 				   struct btree_path *path)
220 {
221 	struct bch_fs *c = trans->c;
222 	unsigned i;
223 
224 	EBUG_ON(path->btree_id >= BTREE_ID_NR);
225 
226 	for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
227 		if (!path->l[i].b) {
228 			BUG_ON(!path->cached &&
229 			       bch2_btree_id_root(c, path->btree_id)->b->c.level > i);
230 			break;
231 		}
232 
233 		bch2_btree_path_verify_level(trans, path, i);
234 	}
235 
236 	bch2_btree_path_verify_locks(path);
237 }
238 
239 void bch2_trans_verify_paths(struct btree_trans *trans)
240 {
241 	struct btree_path *path;
242 
243 	trans_for_each_path(trans, path)
244 		bch2_btree_path_verify(trans, path);
245 }
246 
247 static void bch2_btree_iter_verify(struct btree_iter *iter)
248 {
249 	struct btree_trans *trans = iter->trans;
250 
251 	BUG_ON(iter->btree_id >= BTREE_ID_NR);
252 
253 	BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached);
254 
255 	BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
256 	       (iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
257 
258 	BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
259 	       (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
260 	       !btree_type_has_snapshots(iter->btree_id));
261 
262 	if (iter->update_path)
263 		bch2_btree_path_verify(trans, iter->update_path);
264 	bch2_btree_path_verify(trans, iter->path);
265 }
266 
267 static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
268 {
269 	BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
270 	       !iter->pos.snapshot);
271 
272 	BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
273 	       iter->pos.snapshot != iter->snapshot);
274 
275 	BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
276 	       bkey_gt(iter->pos, iter->k.p));
277 }
278 
279 static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
280 {
281 	struct btree_trans *trans = iter->trans;
282 	struct btree_iter copy;
283 	struct bkey_s_c prev;
284 	int ret = 0;
285 
286 	if (!bch2_debug_check_iterators)
287 		return 0;
288 
289 	if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS))
290 		return 0;
291 
292 	if (bkey_err(k) || !k.k)
293 		return 0;
294 
295 	BUG_ON(!bch2_snapshot_is_ancestor(trans->c,
296 					  iter->snapshot,
297 					  k.k->p.snapshot));
298 
299 	bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
300 			     BTREE_ITER_NOPRESERVE|
301 			     BTREE_ITER_ALL_SNAPSHOTS);
302 	prev = bch2_btree_iter_prev(&copy);
303 	if (!prev.k)
304 		goto out;
305 
306 	ret = bkey_err(prev);
307 	if (ret)
308 		goto out;
309 
310 	if (bkey_eq(prev.k->p, k.k->p) &&
311 	    bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
312 				      prev.k->p.snapshot) > 0) {
313 		struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
314 
315 		bch2_bkey_to_text(&buf1, k.k);
316 		bch2_bkey_to_text(&buf2, prev.k);
317 
318 		panic("iter snap %u\n"
319 		      "k    %s\n"
320 		      "prev %s\n",
321 		      iter->snapshot,
322 		      buf1.buf, buf2.buf);
323 	}
324 out:
325 	bch2_trans_iter_exit(trans, &copy);
326 	return ret;
327 }
328 
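/*
 * Debug assertion that some path in @trans holds a lock covering @pos in
 * btree @id: either a key cache path at exactly @pos (if @key_cache), or a
 * btree node path whose locked leaf node contains @pos. Panics and dumps the
 * transaction's paths and updates if no such path is found.
 */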
329 void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
330 			    struct bpos pos, bool key_cache)
331 {
332 	struct btree_path *path;
333 	unsigned idx;
334 	struct printbuf buf = PRINTBUF;
335 
336 	btree_trans_sort_paths(trans);
337 
338 	trans_for_each_path_inorder(trans, path, idx) {
339 		int cmp = cmp_int(path->btree_id, id) ?:
340 			cmp_int(path->cached, key_cache);
341 
342 		if (cmp > 0)
343 			break;
344 		if (cmp < 0)
345 			continue;
346 
347 		if (!btree_node_locked(path, 0) ||
348 		    !path->should_be_locked)
349 			continue;
350 
351 		if (!key_cache) {
352 			if (bkey_ge(pos, path->l[0].b->data->min_key) &&
353 			    bkey_le(pos, path->l[0].b->key.k.p))
354 				return;
355 		} else {
356 			if (bkey_eq(pos, path->pos))
357 				return;
358 		}
359 	}
360 
361 	bch2_dump_trans_paths_updates(trans);
362 	bch2_bpos_to_text(&buf, pos);
363 
364 	panic("not locked: %s %s%s\n",
365 	      bch2_btree_ids[id], buf.buf,
366 	      key_cache ? " cached" : "");
367 }
368 
369 #else
370 
371 static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
372 						struct btree_path *path, unsigned l) {}
373 static inline void bch2_btree_path_verify(struct btree_trans *trans,
374 					  struct btree_path *path) {}
375 static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
376 static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
377 static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; }
378 
379 #endif
380 
381 /* Btree path: fixups after btree updates */
382 
383 static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
384 					struct btree *b,
385 					struct bset_tree *t,
386 					struct bkey_packed *k)
387 {
388 	struct btree_node_iter_set *set;
389 
390 	btree_node_iter_for_each(iter, set)
391 		if (set->end == t->end_offset) {
392 			set->k = __btree_node_key_to_offset(b, k);
393 			bch2_btree_node_iter_sort(iter, b);
394 			return;
395 		}
396 
397 	bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
398 }
399 
400 static void __bch2_btree_path_fix_key_modified(struct btree_path *path,
401 					       struct btree *b,
402 					       struct bkey_packed *where)
403 {
404 	struct btree_path_level *l = &path->l[b->c.level];
405 
406 	if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
407 		return;
408 
409 	if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0)
410 		bch2_btree_node_iter_advance(&l->iter, l->b);
411 }
412 
413 void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
414 				      struct btree *b,
415 				      struct bkey_packed *where)
416 {
417 	struct btree_path *path;
418 
419 	trans_for_each_path_with_node(trans, b, path) {
420 		__bch2_btree_path_fix_key_modified(path, b, where);
421 		bch2_btree_path_verify_level(trans, path, b->c.level);
422 	}
423 }
424 
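/*
 * A key in @b was inserted, overwritten or deleted: @clobber_u64s u64s at
 * @where were replaced by @new_u64s u64s. Fix up @node_iter so that it still
 * points at the same position in the node, re-adding or dropping its position
 * in the affected bset as needed and re-sorting afterwards.
 */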
425 static void __bch2_btree_node_iter_fix(struct btree_path *path,
426 				       struct btree *b,
427 				       struct btree_node_iter *node_iter,
428 				       struct bset_tree *t,
429 				       struct bkey_packed *where,
430 				       unsigned clobber_u64s,
431 				       unsigned new_u64s)
432 {
433 	const struct bkey_packed *end = btree_bkey_last(b, t);
434 	struct btree_node_iter_set *set;
435 	unsigned offset = __btree_node_key_to_offset(b, where);
436 	int shift = new_u64s - clobber_u64s;
437 	unsigned old_end = t->end_offset - shift;
438 	unsigned orig_iter_pos = node_iter->data[0].k;
439 	bool iter_current_key_modified =
440 		orig_iter_pos >= offset &&
441 		orig_iter_pos <= offset + clobber_u64s;
442 
443 	btree_node_iter_for_each(node_iter, set)
444 		if (set->end == old_end)
445 			goto found;
446 
447 	/* didn't find the bset in the iterator - might have to re-add it: */
448 	if (new_u64s &&
449 	    bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
450 		bch2_btree_node_iter_push(node_iter, b, where, end);
451 		goto fixup_done;
452 	} else {
453 		/* Iterator is after key that changed */
454 		return;
455 	}
456 found:
457 	set->end = t->end_offset;
458 
459 	/* Iterator hasn't gotten to the key that changed yet: */
460 	if (set->k < offset)
461 		return;
462 
463 	if (new_u64s &&
464 	    bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
465 		set->k = offset;
466 	} else if (set->k < offset + clobber_u64s) {
467 		set->k = offset + new_u64s;
468 		if (set->k == set->end)
469 			bch2_btree_node_iter_set_drop(node_iter, set);
470 	} else {
471 		/* Iterator is after key that changed */
472 		set->k = (int) set->k + shift;
473 		return;
474 	}
475 
476 	bch2_btree_node_iter_sort(node_iter, b);
477 fixup_done:
478 	if (node_iter->data[0].k != orig_iter_pos)
479 		iter_current_key_modified = true;
480 
481 	/*
482 	 * When a new key is added, and the node iterator now points to that
483 	 * key, the iterator might have skipped past deleted keys that should
484 	 * come after the key the iterator now points to. We have to rewind to
485 	 * before those deleted keys - otherwise
486 	 * bch2_btree_node_iter_prev_all() breaks:
487 	 */
488 	if (!bch2_btree_node_iter_end(node_iter) &&
489 	    iter_current_key_modified &&
490 	    b->c.level) {
491 		struct bkey_packed *k, *k2, *p;
492 
493 		k = bch2_btree_node_iter_peek_all(node_iter, b);
494 
495 		for_each_bset(b, t) {
496 			bool set_pos = false;
497 
498 			if (node_iter->data[0].end == t->end_offset)
499 				continue;
500 
501 			k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);
502 
503 			while ((p = bch2_bkey_prev_all(b, t, k2)) &&
504 			       bkey_iter_cmp(b, k, p) < 0) {
505 				k2 = p;
506 				set_pos = true;
507 			}
508 
509 			if (set_pos)
510 				btree_node_iter_set_set_pos(node_iter,
511 							    b, t, k2);
512 		}
513 	}
514 }
515 
516 void bch2_btree_node_iter_fix(struct btree_trans *trans,
517 			      struct btree_path *path,
518 			      struct btree *b,
519 			      struct btree_node_iter *node_iter,
520 			      struct bkey_packed *where,
521 			      unsigned clobber_u64s,
522 			      unsigned new_u64s)
523 {
524 	struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where);
525 	struct btree_path *linked;
526 
527 	if (node_iter != &path->l[b->c.level].iter) {
528 		__bch2_btree_node_iter_fix(path, b, node_iter, t,
529 					   where, clobber_u64s, new_u64s);
530 
531 		if (bch2_debug_check_iterators)
532 			bch2_btree_node_iter_verify(node_iter, b);
533 	}
534 
535 	trans_for_each_path_with_node(trans, b, linked) {
536 		__bch2_btree_node_iter_fix(linked, b,
537 					   &linked->l[b->c.level].iter, t,
538 					   where, clobber_u64s, new_u64s);
539 		bch2_btree_path_verify_level(trans, linked, b->c.level);
540 	}
541 }
542 
543 /* Btree path level: pointer to a particular btree node and node iter */
544 
545 static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c,
546 						  struct btree_path_level *l,
547 						  struct bkey *u,
548 						  struct bkey_packed *k)
549 {
550 	if (unlikely(!k)) {
551 		/*
552 		 * signal to bch2_btree_iter_peek_slot() that we're currently at
553 		 * a hole
554 		 */
555 		u->type = KEY_TYPE_deleted;
556 		return bkey_s_c_null;
557 	}
558 
559 	return bkey_disassemble(l->b, k, u);
560 }
561 
562 static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
563 							struct btree_path_level *l,
564 							struct bkey *u)
565 {
566 	return __btree_iter_unpack(c, l, u,
567 			bch2_btree_node_iter_peek_all(&l->iter, l->b));
568 }
569 
570 static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
571 						    struct btree_path *path,
572 						    struct btree_path_level *l,
573 						    struct bkey *u)
574 {
575 	struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
576 			bch2_btree_node_iter_peek(&l->iter, l->b));
577 
578 	path->pos = k.k ? k.k->p : l->b->key.k.p;
579 	trans->paths_sorted = false;
580 	bch2_btree_path_verify_level(trans, path, l - path->l);
581 	return k;
582 }
583 
584 static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
585 						    struct btree_path *path,
586 						    struct btree_path_level *l,
587 						    struct bkey *u)
588 {
589 	struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
590 			bch2_btree_node_iter_prev(&l->iter, l->b));
591 
592 	path->pos = k.k ? k.k->p : l->b->data->min_key;
593 	trans->paths_sorted = false;
594 	bch2_btree_path_verify_level(trans, path, l - path->l);
595 	return k;
596 }
597 
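/*
 * Advance the node iterator for @l until it points at a key >= path->pos,
 * giving up (and returning false) after @max_advance steps so that the caller
 * can fall back to reinitializing the node iterator from scratch.
 */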
598 static inline bool btree_path_advance_to_pos(struct btree_path *path,
599 					     struct btree_path_level *l,
600 					     int max_advance)
601 {
602 	struct bkey_packed *k;
603 	int nr_advanced = 0;
604 
605 	while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
606 	       bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
607 		if (max_advance > 0 && nr_advanced >= max_advance)
608 			return false;
609 
610 		bch2_btree_node_iter_advance(&l->iter, l->b);
611 		nr_advanced++;
612 	}
613 
614 	return true;
615 }
616 
617 static inline void __btree_path_level_init(struct btree_path *path,
618 					   unsigned level)
619 {
620 	struct btree_path_level *l = &path->l[level];
621 
622 	bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
623 
624 	/*
625 	 * Iterators to interior nodes should always be pointed at the first
626 	 * non-whiteout key:
627 	 */
628 	if (level)
629 		bch2_btree_node_iter_peek(&l->iter, l->b);
630 }
631 
632 void bch2_btree_path_level_init(struct btree_trans *trans,
633 				struct btree_path *path,
634 				struct btree *b)
635 {
636 	BUG_ON(path->cached);
637 
638 	EBUG_ON(!btree_path_pos_in_node(path, b));
639 
640 	path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
641 	path->l[b->c.level].b = b;
642 	__btree_path_level_init(path, b->c.level);
643 }
644 
645 /* Btree path: fixups after btree node updates: */
646 
647 static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, struct btree *b)
648 {
649 	struct bch_fs *c = trans->c;
650 	struct btree_insert_entry *i;
651 
652 	trans_for_each_update(trans, i)
653 		if (!i->cached &&
654 		    i->level	== b->c.level &&
655 		    i->btree_id	== b->c.btree_id &&
656 		    bpos_cmp(i->k->k.p, b->data->min_key) >= 0 &&
657 		    bpos_cmp(i->k->k.p, b->data->max_key) <= 0) {
658 			i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v;
659 
660 			if (unlikely(trans->journal_replay_not_finished)) {
661 				struct bkey_i *j_k =
662 					bch2_journal_keys_peek_slot(c, i->btree_id, i->level,
663 								    i->k->k.p);
664 
665 				if (j_k) {
666 					i->old_k = j_k->k;
667 					i->old_v = &j_k->v;
668 				}
669 			}
670 		}
671 }
672 
673 /*
674  * A btree node is being replaced - update any paths that point into it to
675  * point to the new node:
676  */
677 void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
678 {
679 	struct btree_path *path;
680 
681 	trans_for_each_path(trans, path)
682 		if (path->uptodate == BTREE_ITER_UPTODATE &&
683 		    !path->cached &&
684 		    btree_path_pos_in_node(path, b)) {
685 			enum btree_node_locked_type t =
686 				btree_lock_want(path, b->c.level);
687 
688 			if (t != BTREE_NODE_UNLOCKED) {
689 				btree_node_unlock(trans, path, b->c.level);
690 				six_lock_increment(&b->c.lock, (enum six_lock_type) t);
691 				mark_btree_node_locked(trans, path, b->c.level, t);
692 			}
693 
694 			bch2_btree_path_level_init(trans, path, b);
695 		}
696 
697 	bch2_trans_revalidate_updates_in_node(trans, b);
698 }
699 
700 /*
701  * A btree node has been modified in such a way as to invalidate iterators - fix
702  * them:
703  */
704 void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
705 {
706 	struct btree_path *path;
707 
708 	trans_for_each_path_with_node(trans, b, path)
709 		__btree_path_level_init(path, b->c.level);
710 
711 	bch2_trans_revalidate_updates_in_node(trans, b);
712 }
713 
714 /* Btree path: traverse, set_pos: */
715 
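/*
 * Lock the root of @path's btree and point the top of the path at it.
 * Returns 0 on success, 1 if the root is at a lower level than @depth_want
 * (i.e. there is nothing to traverse down to), or a transaction restart error
 * if taking the lock failed; loops internally if the root changed under us.
 */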
716 static inline int btree_path_lock_root(struct btree_trans *trans,
717 				       struct btree_path *path,
718 				       unsigned depth_want,
719 				       unsigned long trace_ip)
720 {
721 	struct bch_fs *c = trans->c;
722 	struct btree *b, **rootp = &bch2_btree_id_root(c, path->btree_id)->b;
723 	enum six_lock_type lock_type;
724 	unsigned i;
725 	int ret;
726 
727 	EBUG_ON(path->nodes_locked);
728 
729 	while (1) {
730 		b = READ_ONCE(*rootp);
731 		path->level = READ_ONCE(b->c.level);
732 
733 		if (unlikely(path->level < depth_want)) {
734 			/*
735 			 * the root is at a lower depth than the depth we want:
736 			 * we've reached the end of the btree, or we're iterating
737 			 * over nodes at some minimum depth and there are no
738 			 * nodes at or above that depth
739 			 */
740 			path->level = depth_want;
741 			for (i = path->level; i < BTREE_MAX_DEPTH; i++)
742 				path->l[i].b = NULL;
743 			return 1;
744 		}
745 
746 		lock_type = __btree_lock_want(path, path->level);
747 		ret = btree_node_lock(trans, path, &b->c,
748 				      path->level, lock_type, trace_ip);
749 		if (unlikely(ret)) {
750 			if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed))
751 				continue;
752 			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
753 				return ret;
754 			BUG();
755 		}
756 
757 		if (likely(b == READ_ONCE(*rootp) &&
758 			   b->c.level == path->level &&
759 			   !race_fault())) {
760 			for (i = 0; i < path->level; i++)
761 				path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root);
762 			path->l[path->level].b = b;
763 			for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
764 				path->l[i].b = NULL;
765 
766 			mark_btree_node_locked(trans, path, path->level,
767 					       (enum btree_node_locked_type) lock_type);
768 			bch2_btree_path_level_init(trans, path, b);
769 			return 0;
770 		}
771 
772 		six_unlock_type(&b->c.lock, lock_type);
773 	}
774 }
775 
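/*
 * Issue reads for the next few child nodes at this level so that walking the
 * btree doesn't stall on synchronous node reads; only called when
 * BTREE_ITER_PREFETCH is set. More nodes are prefetched before BCH_FS_STARTED
 * is set (i.e. during recovery) than during normal operation.
 */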
776 noinline
777 static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path)
778 {
779 	struct bch_fs *c = trans->c;
780 	struct btree_path_level *l = path_l(path);
781 	struct btree_node_iter node_iter = l->iter;
782 	struct bkey_packed *k;
783 	struct bkey_buf tmp;
784 	unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
785 		? (path->level > 1 ? 0 :  2)
786 		: (path->level > 1 ? 1 : 16);
787 	bool was_locked = btree_node_locked(path, path->level);
788 	int ret = 0;
789 
790 	bch2_bkey_buf_init(&tmp);
791 
792 	while (nr-- && !ret) {
793 		if (!bch2_btree_node_relock(trans, path, path->level))
794 			break;
795 
796 		bch2_btree_node_iter_advance(&node_iter, l->b);
797 		k = bch2_btree_node_iter_peek(&node_iter, l->b);
798 		if (!k)
799 			break;
800 
801 		bch2_bkey_buf_unpack(&tmp, c, l->b, k);
802 		ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
803 					       path->level - 1);
804 	}
805 
806 	if (!was_locked)
807 		btree_node_unlock(trans, path, path->level);
808 
809 	bch2_bkey_buf_exit(&tmp, c);
810 	return ret;
811 }
812 
813 static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path,
814 				 struct btree_and_journal_iter *jiter)
815 {
816 	struct bch_fs *c = trans->c;
817 	struct bkey_s_c k;
818 	struct bkey_buf tmp;
819 	unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
820 		? (path->level > 1 ? 0 :  2)
821 		: (path->level > 1 ? 1 : 16);
822 	bool was_locked = btree_node_locked(path, path->level);
823 	int ret = 0;
824 
825 	bch2_bkey_buf_init(&tmp);
826 
827 	while (nr-- && !ret) {
828 		if (!bch2_btree_node_relock(trans, path, path->level))
829 			break;
830 
831 		bch2_btree_and_journal_iter_advance(jiter);
832 		k = bch2_btree_and_journal_iter_peek(jiter);
833 		if (!k.k)
834 			break;
835 
836 		bch2_bkey_buf_reassemble(&tmp, c, k);
837 		ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
838 					       path->level - 1);
839 	}
840 
841 	if (!was_locked)
842 		btree_node_unlock(trans, path, path->level);
843 
844 	bch2_bkey_buf_exit(&tmp, c);
845 	return ret;
846 }
847 
848 static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
849 					    struct btree_path *path,
850 					    unsigned plevel, struct btree *b)
851 {
852 	struct btree_path_level *l = &path->l[plevel];
853 	bool locked = btree_node_locked(path, plevel);
854 	struct bkey_packed *k;
855 	struct bch_btree_ptr_v2 *bp;
856 
857 	if (!bch2_btree_node_relock(trans, path, plevel))
858 		return;
859 
860 	k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
861 	BUG_ON(k->type != KEY_TYPE_btree_ptr_v2);
862 
863 	bp = (void *) bkeyp_val(&l->b->format, k);
864 	bp->mem_ptr = (unsigned long)b;
865 
866 	if (!locked)
867 		btree_node_unlock(trans, path, plevel);
868 }
869 
870 static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
871 						     struct btree_path *path,
872 						     unsigned flags,
873 						     struct bkey_buf *out)
874 {
875 	struct bch_fs *c = trans->c;
876 	struct btree_path_level *l = path_l(path);
877 	struct btree_and_journal_iter jiter;
878 	struct bkey_s_c k;
879 	int ret = 0;
880 
881 	__bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos);
882 
883 	k = bch2_btree_and_journal_iter_peek(&jiter);
884 
885 	bch2_bkey_buf_reassemble(out, c, k);
886 
887 	if (flags & BTREE_ITER_PREFETCH)
888 		ret = btree_path_prefetch_j(trans, path, &jiter);
889 
890 	bch2_btree_and_journal_iter_exit(&jiter);
891 	return ret;
892 }
893 
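/*
 * Walk @path down one level: look up the key for the child node covering
 * path->pos (from the node iterator, or from the btree and journal keys if
 * journal replay hasn't finished), get and lock that node, and point the next
 * level of the path at it.
 */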
894 static __always_inline int btree_path_down(struct btree_trans *trans,
895 					   struct btree_path *path,
896 					   unsigned flags,
897 					   unsigned long trace_ip)
898 {
899 	struct bch_fs *c = trans->c;
900 	struct btree_path_level *l = path_l(path);
901 	struct btree *b;
902 	unsigned level = path->level - 1;
903 	enum six_lock_type lock_type = __btree_lock_want(path, level);
904 	struct bkey_buf tmp;
905 	int ret;
906 
907 	EBUG_ON(!btree_node_locked(path, path->level));
908 
909 	bch2_bkey_buf_init(&tmp);
910 
911 	if (unlikely(trans->journal_replay_not_finished)) {
912 		ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
913 		if (ret)
914 			goto err;
915 	} else {
916 		bch2_bkey_buf_unpack(&tmp, c, l->b,
917 				 bch2_btree_node_iter_peek(&l->iter, l->b));
918 
919 		if (flags & BTREE_ITER_PREFETCH) {
920 			ret = btree_path_prefetch(trans, path);
921 			if (ret)
922 				goto err;
923 		}
924 	}
925 
926 	b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
927 	ret = PTR_ERR_OR_ZERO(b);
928 	if (unlikely(ret))
929 		goto err;
930 
931 	if (likely(!trans->journal_replay_not_finished &&
932 		   tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
933 	    unlikely(b != btree_node_mem_ptr(tmp.k)))
934 		btree_node_mem_ptr_set(trans, path, level + 1, b);
935 
936 	if (btree_node_read_locked(path, level + 1))
937 		btree_node_unlock(trans, path, level + 1);
938 
939 	mark_btree_node_locked(trans, path, level,
940 			       (enum btree_node_locked_type) lock_type);
941 	path->level = level;
942 	bch2_btree_path_level_init(trans, path, b);
943 
944 	bch2_btree_path_verify_locks(path);
945 err:
946 	bch2_bkey_buf_exit(&tmp, c);
947 	return ret;
948 }
949 
950 
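/*
 * Re-traverse every path in the transaction, in sorted (lock) order: used to
 * get all paths back into a consistent traversed state after a transaction
 * restart. All locks are dropped first, and we retry from the top if a
 * traversal restarts or fails with -ENOMEM.
 */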
951 static int bch2_btree_path_traverse_all(struct btree_trans *trans)
952 {
953 	struct bch_fs *c = trans->c;
954 	struct btree_path *path;
955 	unsigned long trace_ip = _RET_IP_;
956 	int i, ret = 0;
957 
958 	if (trans->in_traverse_all)
959 		return -BCH_ERR_transaction_restart_in_traverse_all;
960 
961 	trans->in_traverse_all = true;
962 retry_all:
963 	trans->restarted = 0;
964 	trans->last_restarted_ip = 0;
965 
966 	trans_for_each_path(trans, path)
967 		path->should_be_locked = false;
968 
969 	btree_trans_sort_paths(trans);
970 
971 	bch2_trans_unlock(trans);
972 	cond_resched();
973 
974 	if (unlikely(trans->memory_allocation_failure)) {
975 		struct closure cl;
976 
977 		closure_init_stack(&cl);
978 
979 		do {
980 			ret = bch2_btree_cache_cannibalize_lock(c, &cl);
981 			closure_sync(&cl);
982 		} while (ret);
983 	}
984 
985 	/* Now, redo traversals in correct order: */
986 	i = 0;
987 	while (i < trans->nr_sorted) {
988 		path = trans->paths + trans->sorted[i];
989 
990 		/*
991 		 * Traversing a path can cause another path to be added at about
992 		 * the same position:
993 		 */
994 		if (path->uptodate) {
995 			__btree_path_get(path, false);
996 			ret = bch2_btree_path_traverse_one(trans, path, 0, _THIS_IP_);
997 			__btree_path_put(path, false);
998 
999 			if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
1000 			    bch2_err_matches(ret, ENOMEM))
1001 				goto retry_all;
1002 			if (ret)
1003 				goto err;
1004 		} else {
1005 			i++;
1006 		}
1007 	}
1008 
1009 	/*
1010 	 * We used to assert that all paths had been traversed here
1011 	 * (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since
1012 	 * path->should_be_locked is not set yet, we might have unlocked and
1013 	 * then failed to relock a path - that's fine.
1014 	 */
1015 err:
1016 	bch2_btree_cache_cannibalize_unlock(c);
1017 
1018 	trans->in_traverse_all = false;
1019 
1020 	trace_and_count(c, trans_traverse_all, trans, trace_ip);
1021 	return ret;
1022 }
1023 
1024 static inline bool btree_path_check_pos_in_node(struct btree_path *path,
1025 						unsigned l, int check_pos)
1026 {
1027 	if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b))
1028 		return false;
1029 	if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b))
1030 		return false;
1031 	return true;
1032 }
1033 
1034 static inline bool btree_path_good_node(struct btree_trans *trans,
1035 					struct btree_path *path,
1036 					unsigned l, int check_pos)
1037 {
1038 	return is_btree_node(path, l) &&
1039 		bch2_btree_node_relock(trans, path, l) &&
1040 		btree_path_check_pos_in_node(path, l, check_pos);
1041 }
1042 
1043 static void btree_path_set_level_down(struct btree_trans *trans,
1044 				      struct btree_path *path,
1045 				      unsigned new_level)
1046 {
1047 	unsigned l;
1048 
1049 	path->level = new_level;
1050 
1051 	for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++)
1052 		if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
1053 			btree_node_unlock(trans, path, l);
1054 
1055 	btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1056 	bch2_btree_path_verify(trans, path);
1057 }
1058 
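/*
 * Walk up @path, dropping levels, until we're at a level with a valid btree
 * node that can be relocked and whose node covers a position compatible with
 * @check_pos; additional levels are relocked as needed to satisfy
 * path->locks_want. Returns the level we ended up at.
 */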
1059 static noinline unsigned __btree_path_up_until_good_node(struct btree_trans *trans,
1060 							 struct btree_path *path,
1061 							 int check_pos)
1062 {
1063 	unsigned i, l = path->level;
1064 again:
1065 	while (btree_path_node(path, l) &&
1066 	       !btree_path_good_node(trans, path, l, check_pos))
1067 		__btree_path_set_level_up(trans, path, l++);
1068 
1069 	/* If we need intent locks, take them too: */
1070 	for (i = l + 1;
1071 	     i < path->locks_want && btree_path_node(path, i);
1072 	     i++)
1073 		if (!bch2_btree_node_relock(trans, path, i)) {
1074 			while (l <= i)
1075 				__btree_path_set_level_up(trans, path, l++);
1076 			goto again;
1077 		}
1078 
1079 	return l;
1080 }
1081 
1082 static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
1083 						     struct btree_path *path,
1084 						     int check_pos)
1085 {
1086 	return likely(btree_node_locked(path, path->level) &&
1087 		      btree_path_check_pos_in_node(path, path->level, check_pos))
1088 		? path->level
1089 		: __btree_path_up_until_good_node(trans, path, check_pos);
1090 }
1091 
1092 /*
1093  * This is the main state machine for walking down the btree - walks down to a
1094  * specified depth
1095  *
1096  * Returns 0 on success, or a negative bcachefs error code on failure - e.g.
1097  * an error reading in a btree node, or a transaction restart.
1098  *
1099  * On failure the error is returned to the caller and stashed in path->l[].b.
1100  */
1101 int bch2_btree_path_traverse_one(struct btree_trans *trans,
1102 				 struct btree_path *path,
1103 				 unsigned flags,
1104 				 unsigned long trace_ip)
1105 {
1106 	unsigned depth_want = path->level;
1107 	int ret = -((int) trans->restarted);
1108 
1109 	if (unlikely(ret))
1110 		goto out;
1111 
1112 	/*
1113 	 * Ensure we obey path->should_be_locked: if it's set, we can't unlock
1114 	 * and re-traverse the path without a transaction restart:
1115 	 */
1116 	if (path->should_be_locked) {
1117 		ret = bch2_btree_path_relock(trans, path, trace_ip);
1118 		goto out;
1119 	}
1120 
1121 	if (path->cached) {
1122 		ret = bch2_btree_path_traverse_cached(trans, path, flags);
1123 		goto out;
1124 	}
1125 
1126 	if (unlikely(path->level >= BTREE_MAX_DEPTH))
1127 		goto out;
1128 
1129 	path->level = btree_path_up_until_good_node(trans, path, 0);
1130 
1131 	EBUG_ON(btree_path_node(path, path->level) &&
1132 		!btree_node_locked(path, path->level));
1133 
1134 	/*
1135 	 * Note: path->l[path->level].b may be temporarily NULL here - that
1136 	 * would indicate to other code that we got to the end of the btree,
1137 	 * here it indicates that relocking the root failed - it's critical that
1138 	 * btree_path_lock_root() comes next and that it can't fail
1139 	 */
1140 	while (path->level > depth_want) {
1141 		ret = btree_path_node(path, path->level)
1142 			? btree_path_down(trans, path, flags, trace_ip)
1143 			: btree_path_lock_root(trans, path, depth_want, trace_ip);
1144 		if (unlikely(ret)) {
1145 			if (ret == 1) {
1146 				/*
1147 				 * No nodes at this level - got to the end of
1148 				 * the btree:
1149 				 */
1150 				ret = 0;
1151 				goto out;
1152 			}
1153 
1154 			__bch2_btree_path_unlock(trans, path);
1155 			path->level = depth_want;
1156 			path->l[path->level].b = ERR_PTR(ret);
1157 			goto out;
1158 		}
1159 	}
1160 
1161 	path->uptodate = BTREE_ITER_UPTODATE;
1162 out:
1163 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted)
1164 		panic("ret %s (%i) trans->restarted %s (%i)\n",
1165 		      bch2_err_str(ret), ret,
1166 		      bch2_err_str(trans->restarted), trans->restarted);
1167 	bch2_btree_path_verify(trans, path);
1168 	return ret;
1169 }
1170 
1171 static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
1172 			    struct btree_path *src)
1173 {
1174 	unsigned i, offset = offsetof(struct btree_path, pos);
1175 
1176 	memcpy((void *) dst + offset,
1177 	       (void *) src + offset,
1178 	       sizeof(struct btree_path) - offset);
1179 
1180 	for (i = 0; i < BTREE_MAX_DEPTH; i++) {
1181 		unsigned t = btree_node_locked_type(dst, i);
1182 
1183 		if (t != BTREE_NODE_UNLOCKED)
1184 			six_lock_increment(&dst->l[i].b->c.lock, t);
1185 	}
1186 }
1187 
1188 static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src,
1189 					   bool intent)
1190 {
1191 	struct btree_path *new = btree_path_alloc(trans, src);
1192 
1193 	btree_path_copy(trans, new, src);
1194 	__btree_path_get(new, intent);
1195 	return new;
1196 }
1197 
1198 __flatten
1199 struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans,
1200 			 struct btree_path *path, bool intent,
1201 			 unsigned long ip)
1202 {
1203 	__btree_path_put(path, intent);
1204 	path = btree_path_clone(trans, path, intent);
1205 	path->preserve = false;
1206 	return path;
1207 }
1208 
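/*
 * Move @path to @new_pos: walk up until we're at a node that covers the new
 * position (@cmp, the direction we moved, lets us skip checks), then
 * reposition the node iterator. If we had to walk up above the level the path
 * is meant to be at, the path is marked as needing traversal again.
 */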
1209 struct btree_path * __must_check
1210 __bch2_btree_path_set_pos(struct btree_trans *trans,
1211 		   struct btree_path *path, struct bpos new_pos,
1212 		   bool intent, unsigned long ip, int cmp)
1213 {
1214 	unsigned level = path->level;
1215 
1216 	bch2_trans_verify_not_in_restart(trans);
1217 	EBUG_ON(!path->ref);
1218 
1219 	path = bch2_btree_path_make_mut(trans, path, intent, ip);
1220 
1221 	path->pos		= new_pos;
1222 	trans->paths_sorted	= false;
1223 
1224 	if (unlikely(path->cached)) {
1225 		btree_node_unlock(trans, path, 0);
1226 		path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
1227 		btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1228 		goto out;
1229 	}
1230 
1231 	level = btree_path_up_until_good_node(trans, path, cmp);
1232 
1233 	if (btree_path_node(path, level)) {
1234 		struct btree_path_level *l = &path->l[level];
1235 
1236 		BUG_ON(!btree_node_locked(path, level));
1237 		/*
1238 		 * We might have to skip over many keys, or just a few: try
1239 		 * advancing the node iterator, and if we have to skip over too
1240 		 * many keys just reinit it (or if we're rewinding, since that
1241 		 * is expensive).
1242 		 */
1243 		if (cmp < 0 ||
1244 		    !btree_path_advance_to_pos(path, l, 8))
1245 			bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
1246 
1247 		/*
1248 		 * Iterators to interior nodes should always be pointed at the first
1249 		 * non-whiteout key:
1250 		 */
1251 		if (unlikely(level))
1252 			bch2_btree_node_iter_peek(&l->iter, l->b);
1253 	}
1254 
1255 	if (unlikely(level != path->level)) {
1256 		btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1257 		__bch2_btree_path_unlock(trans, path);
1258 	}
1259 out:
1260 	bch2_btree_path_verify(trans, path);
1261 	return path;
1262 }
1263 
1264 /* Btree path: main interface: */
1265 
1266 static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path)
1267 {
1268 	struct btree_path *sib;
1269 
1270 	sib = prev_btree_path(trans, path);
1271 	if (sib && !btree_path_cmp(sib, path))
1272 		return sib;
1273 
1274 	sib = next_btree_path(trans, path);
1275 	if (sib && !btree_path_cmp(sib, path))
1276 		return sib;
1277 
1278 	return NULL;
1279 }
1280 
1281 static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path)
1282 {
1283 	struct btree_path *sib;
1284 
1285 	sib = prev_btree_path(trans, path);
1286 	if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
1287 		return sib;
1288 
1289 	sib = next_btree_path(trans, path);
1290 	if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
1291 		return sib;
1292 
1293 	return NULL;
1294 }
1295 
1296 static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path)
1297 {
1298 	__bch2_btree_path_unlock(trans, path);
1299 	btree_path_list_remove(trans, path);
1300 	trans->paths_allocated &= ~(1ULL << path->idx);
1301 }
1302 
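/*
 * Drop a reference to @path. When the last reference goes away the path is
 * freed, but only if it's no longer needed: either another path exists at the
 * same position (or on the same node) that can take over its preserve and
 * should_be_locked flags, or the path wasn't preserving anything useful.
 */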
1303 void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent)
1304 {
1305 	struct btree_path *dup;
1306 
1307 	EBUG_ON(trans->paths + path->idx != path);
1308 	EBUG_ON(!path->ref);
1309 
1310 	if (!__btree_path_put(path, intent))
1311 		return;
1312 
1313 	dup = path->preserve
1314 		? have_path_at_pos(trans, path)
1315 		: have_node_at_pos(trans, path);
1316 
1317 	if (!dup && !(!path->preserve && !is_btree_node(path, path->level)))
1318 		return;
1319 
1320 	if (path->should_be_locked &&
1321 	    !trans->restarted &&
1322 	    (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_)))
1323 		return;
1324 
1325 	if (dup) {
1326 		dup->preserve		|= path->preserve;
1327 		dup->should_be_locked	|= path->should_be_locked;
1328 	}
1329 
1330 	__bch2_path_free(trans, path);
1331 }
1332 
1333 static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *path,
1334 				 bool intent)
1335 {
1336 	EBUG_ON(trans->paths + path->idx != path);
1337 	EBUG_ON(!path->ref);
1338 
1339 	if (!__btree_path_put(path, intent))
1340 		return;
1341 
1342 	__bch2_path_free(trans, path);
1343 }
1344 
1345 void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
1346 {
1347 	panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
1348 	      trans->restart_count, restart_count,
1349 	      (void *) trans->last_begin_ip);
1350 }
1351 
1352 void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
1353 {
1354 	panic("in transaction restart: %s, last restarted by %pS\n",
1355 	      bch2_err_str(trans->restarted),
1356 	      (void *) trans->last_restarted_ip);
1357 }
1358 
1359 noinline __cold
1360 void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
1361 {
1362 	struct btree_insert_entry *i;
1363 	struct btree_write_buffered_key *wb;
1364 
1365 	prt_printf(buf, "transaction updates for %s journal seq %llu",
1366 	       trans->fn, trans->journal_res.seq);
1367 	prt_newline(buf);
1368 	printbuf_indent_add(buf, 2);
1369 
1370 	trans_for_each_update(trans, i) {
1371 		struct bkey_s_c old = { &i->old_k, i->old_v };
1372 
1373 		prt_printf(buf, "update: btree=%s cached=%u %pS",
1374 		       bch2_btree_ids[i->btree_id],
1375 		       i->cached,
1376 		       (void *) i->ip_allocated);
1377 		prt_newline(buf);
1378 
1379 		prt_printf(buf, "  old ");
1380 		bch2_bkey_val_to_text(buf, trans->c, old);
1381 		prt_newline(buf);
1382 
1383 		prt_printf(buf, "  new ");
1384 		bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k));
1385 		prt_newline(buf);
1386 	}
1387 
1388 	trans_for_each_wb_update(trans, wb) {
1389 		prt_printf(buf, "update: btree=%s wb=1",
1390 		       bch2_btree_ids[wb->btree]);
1391 		/* write buffered keys don't record an ip_allocated */
1392 		prt_newline(buf);
1393 
1394 		prt_printf(buf, "  new ");
1395 		bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(&wb->k));
1396 		prt_newline(buf);
1397 	}
1398 
1399 	printbuf_indent_sub(buf, 2);
1400 }
1401 
1402 noinline __cold
1403 void bch2_dump_trans_updates(struct btree_trans *trans)
1404 {
1405 	struct printbuf buf = PRINTBUF;
1406 
1407 	bch2_trans_updates_to_text(&buf, trans);
1408 	bch2_print_string_as_lines(KERN_ERR, buf.buf);
1409 	printbuf_exit(&buf);
1410 }
1411 
1412 noinline __cold
1413 void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path)
1414 {
1415 	prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ",
1416 		   path->idx, path->ref, path->intent_ref,
1417 		   path->preserve ? 'P' : ' ',
1418 		   path->should_be_locked ? 'S' : ' ',
1419 		   bch2_btree_ids[path->btree_id],
1420 		   path->level);
1421 	bch2_bpos_to_text(out, path->pos);
1422 
1423 	prt_printf(out, " locks %u", path->nodes_locked);
1424 #ifdef TRACK_PATH_ALLOCATED
1425 	prt_printf(out, " %pS", (void *) path->ip_allocated);
1426 #endif
1427 	prt_newline(out);
1428 }
1429 
1430 static noinline __cold
1431 void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans,
1432 				bool nosort)
1433 {
1434 	struct btree_path *path;
1435 	unsigned idx;
1436 
1437 	if (!nosort)
1438 		btree_trans_sort_paths(trans);
1439 
1440 	trans_for_each_path_inorder(trans, path, idx)
1441 		bch2_btree_path_to_text(out, path);
1442 }
1443 
1444 noinline __cold
1445 void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans)
1446 {
1447 	__bch2_trans_paths_to_text(out, trans, false);
1448 }
1449 
1450 static noinline __cold
1451 void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort)
1452 {
1453 	struct printbuf buf = PRINTBUF;
1454 
1455 	__bch2_trans_paths_to_text(&buf, trans, nosort);
1456 	bch2_trans_updates_to_text(&buf, trans);
1457 
1458 	bch2_print_string_as_lines(KERN_ERR, buf.buf);
1459 	printbuf_exit(&buf);
1460 }
1461 
1462 noinline __cold
1463 void bch2_dump_trans_paths_updates(struct btree_trans *trans)
1464 {
1465 	__bch2_dump_trans_paths_updates(trans, false);
1466 }
1467 
1468 noinline __cold
1469 static void bch2_trans_update_max_paths(struct btree_trans *trans)
1470 {
1471 	struct btree_transaction_stats *s = btree_trans_stats(trans);
1472 	struct printbuf buf = PRINTBUF;
1473 
1474 	if (!s)
1475 		return;
1476 
1477 	bch2_trans_paths_to_text(&buf, trans);
1478 
1479 	if (!buf.allocation_failure) {
1480 		mutex_lock(&s->lock);
1481 		if (s->nr_max_paths < hweight64(trans->paths_allocated)) {
1482 			s->nr_max_paths = trans->nr_max_paths =
1483 				hweight64(trans->paths_allocated);
1484 			swap(s->max_paths_text, buf.buf);
1485 		}
1486 		mutex_unlock(&s->lock);
1487 	}
1488 
1489 	printbuf_exit(&buf);
1490 
1491 	trans->nr_max_paths = hweight64(trans->paths_allocated);
1492 }
1493 
1494 static noinline void btree_path_overflow(struct btree_trans *trans)
1495 {
1496 	bch2_dump_trans_paths_updates(trans);
1497 	panic("trans path overflow\n");
1498 }
1499 
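/*
 * Allocate a new path slot in @trans, inserting it into the sorted path list
 * after @pos; panics (after dumping the transaction's paths) if all
 * BTREE_ITER_MAX slots are already in use.
 */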
1500 static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
1501 						  struct btree_path *pos)
1502 {
1503 	struct btree_path *path;
1504 	unsigned idx;
1505 
1506 	if (unlikely(trans->paths_allocated ==
1507 		     ~((~0ULL << 1) << (BTREE_ITER_MAX - 1))))
1508 		btree_path_overflow(trans);
1509 
1510 	idx = __ffs64(~trans->paths_allocated);
1511 
1512 	/*
1513 	 * Do this before marking the new path as allocated, since it won't be
1514 	 * initialized yet:
1515 	 */
1516 	if (unlikely(idx > trans->nr_max_paths))
1517 		bch2_trans_update_max_paths(trans);
1518 
1519 	trans->paths_allocated |= 1ULL << idx;
1520 
1521 	path = &trans->paths[idx];
1522 	path->idx		= idx;
1523 	path->ref		= 0;
1524 	path->intent_ref	= 0;
1525 	path->nodes_locked	= 0;
1526 
1527 	btree_path_list_add(trans, pos, path);
1528 	trans->paths_sorted = false;
1529 	return path;
1530 }
1531 
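/*
 * Get a path pointing at @pos in @btree_id at @level: an existing path with
 * the same btree, cached-ness and level is reused and repositioned when
 * possible, otherwise a new path is allocated. Callers still need to traverse
 * the returned path (bch2_btree_path_traverse()) before using it.
 */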
1532 struct btree_path *bch2_path_get(struct btree_trans *trans,
1533 				 enum btree_id btree_id, struct bpos pos,
1534 				 unsigned locks_want, unsigned level,
1535 				 unsigned flags, unsigned long ip)
1536 {
1537 	struct btree_path *path, *path_pos = NULL;
1538 	bool cached = flags & BTREE_ITER_CACHED;
1539 	bool intent = flags & BTREE_ITER_INTENT;
1540 	int i;
1541 
1542 	bch2_trans_verify_not_in_restart(trans);
1543 	bch2_trans_verify_locks(trans);
1544 
1545 	btree_trans_sort_paths(trans);
1546 
1547 	trans_for_each_path_inorder(trans, path, i) {
1548 		if (__btree_path_cmp(path,
1549 				     btree_id,
1550 				     cached,
1551 				     pos,
1552 				     level) > 0)
1553 			break;
1554 
1555 		path_pos = path;
1556 	}
1557 
1558 	if (path_pos &&
1559 	    path_pos->cached	== cached &&
1560 	    path_pos->btree_id	== btree_id &&
1561 	    path_pos->level	== level) {
1562 		__btree_path_get(path_pos, intent);
1563 		path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
1564 	} else {
1565 		path = btree_path_alloc(trans, path_pos);
1566 		path_pos = NULL;
1567 
1568 		__btree_path_get(path, intent);
1569 		path->pos			= pos;
1570 		path->btree_id			= btree_id;
1571 		path->cached			= cached;
1572 		path->uptodate			= BTREE_ITER_NEED_TRAVERSE;
1573 		path->should_be_locked		= false;
1574 		path->level			= level;
1575 		path->locks_want		= locks_want;
1576 		path->nodes_locked		= 0;
1577 		for (i = 0; i < ARRAY_SIZE(path->l); i++)
1578 			path->l[i].b		= ERR_PTR(-BCH_ERR_no_btree_node_init);
1579 #ifdef TRACK_PATH_ALLOCATED
1580 		path->ip_allocated		= ip;
1581 #endif
1582 		trans->paths_sorted		= false;
1583 	}
1584 
1585 	if (!(flags & BTREE_ITER_NOPRESERVE))
1586 		path->preserve = true;
1587 
1588 	if (path->intent_ref)
1589 		locks_want = max(locks_want, level + 1);
1590 
1591 	/*
1592 	 * If the path has locks_want greater than requested, we don't downgrade
1593 	 * it here - if the transaction restarted because a btree node split
1594 	 * needed to upgrade locks, we might be putting/getting the iterator again.
1595 	 * Downgrading iterators only happens via bch2_trans_downgrade(), after
1596 	 * a successful transaction commit.
1597 	 */
1598 
1599 	locks_want = min(locks_want, BTREE_MAX_DEPTH);
1600 	if (locks_want > path->locks_want)
1601 		bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want);
1602 
1603 	return path;
1604 }
1605 
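/*
 * Peek the key at @path's current position, without advancing:
 *
 * For btree node paths, returns the key at path->pos if one exists, otherwise
 * a synthesized deleted key in @u at path->pos (i.e. a hole). For key cache
 * paths, returns the cached key, or bkey_s_c_null if there is no valid key
 * cache entry.
 */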
1606 struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
1607 {
1609 	struct btree_path_level *l = path_l(path);
1610 	struct bkey_packed *_k;
1611 	struct bkey_s_c k;
1612 
1613 	if (unlikely(!l->b))
1614 		return bkey_s_c_null;
1615 
1616 	EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
1617 	EBUG_ON(!btree_node_locked(path, path->level));
1618 
1619 	if (!path->cached) {
1620 		_k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
1621 		k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null;
1622 
1623 		EBUG_ON(k.k && bkey_deleted(k.k) && bpos_eq(k.k->p, path->pos));
1624 
1625 		if (!k.k || !bpos_eq(path->pos, k.k->p))
1626 			goto hole;
1627 	} else {
1628 		struct bkey_cached *ck = (void *) path->l[0].b;
1629 
1630 		EBUG_ON(ck &&
1631 			(path->btree_id != ck->key.btree_id ||
1632 			 !bkey_eq(path->pos, ck->key.pos)));
1633 		if (!ck || !ck->valid)
1634 			return bkey_s_c_null;
1635 
1636 		*u = ck->k->k;
1637 		k = bkey_i_to_s_c(ck->k);
1638 	}
1639 
1640 	return k;
1641 hole:
1642 	bkey_init(u);
1643 	u->p = path->pos;
1644 	return (struct bkey_s_c) { u, NULL };
1645 }
1646 
1647 /* Btree iterators: */
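
/*
 * Illustrative sketch (not taken from this file) of how the iterator
 * interface below is typically driven, with an explicit transaction restart
 * loop; BTREE_ID_extents, pos and the surrounding variables are stand-ins:
 *
 *	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, pos, 0);
 * retry:
 *	bch2_trans_begin(trans);
 *	k = bch2_btree_iter_peek(&iter);
 *	if (bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))
 *		goto retry;
 *	if (!bkey_err(k) && k.k) {
 *		...
 *	}
 *	bch2_trans_iter_exit(trans, &iter);
 */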
1648 
1649 int __must_check
1650 __bch2_btree_iter_traverse(struct btree_iter *iter)
1651 {
1652 	return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
1653 }
1654 
1655 int __must_check
1656 bch2_btree_iter_traverse(struct btree_iter *iter)
1657 {
1658 	int ret;
1659 
1660 	iter->path = bch2_btree_path_set_pos(iter->trans, iter->path,
1661 					btree_iter_search_key(iter),
1662 					iter->flags & BTREE_ITER_INTENT,
1663 					btree_iter_ip_allocated(iter));
1664 
1665 	ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
1666 	if (ret)
1667 		return ret;
1668 
1669 	btree_path_set_should_be_locked(iter->path);
1670 	return 0;
1671 }
1672 
1673 /* Iterate across nodes (leaf and interior nodes) */
1674 
1675 struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
1676 {
1677 	struct btree_trans *trans = iter->trans;
1678 	struct btree *b = NULL;
1679 	int ret;
1680 
1681 	EBUG_ON(iter->path->cached);
1682 	bch2_btree_iter_verify(iter);
1683 
1684 	ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
1685 	if (ret)
1686 		goto err;
1687 
1688 	b = btree_path_node(iter->path, iter->path->level);
1689 	if (!b)
1690 		goto out;
1691 
1692 	BUG_ON(bpos_lt(b->key.k.p, iter->pos));
1693 
1694 	bkey_init(&iter->k);
1695 	iter->k.p = iter->pos = b->key.k.p;
1696 
1697 	iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
1698 					iter->flags & BTREE_ITER_INTENT,
1699 					btree_iter_ip_allocated(iter));
1700 	btree_path_set_should_be_locked(iter->path);
1701 out:
1702 	bch2_btree_iter_verify_entry_exit(iter);
1703 	bch2_btree_iter_verify(iter);
1704 
1705 	return b;
1706 err:
1707 	b = ERR_PTR(ret);
1708 	goto out;
1709 }
1710 
1711 struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter)
1712 {
1713 	struct btree *b;
1714 
1715 	while (b = bch2_btree_iter_peek_node(iter),
1716 	       bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart))
1717 		bch2_trans_begin(iter->trans);
1718 
1719 	return b;
1720 }
1721 
1722 struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
1723 {
1724 	struct btree_trans *trans = iter->trans;
1725 	struct btree_path *path = iter->path;
1726 	struct btree *b = NULL;
1727 	int ret;
1728 
1729 	bch2_trans_verify_not_in_restart(trans);
1730 	EBUG_ON(iter->path->cached);
1731 	bch2_btree_iter_verify(iter);
1732 
1733 	/* already at end? */
1734 	if (!btree_path_node(path, path->level))
1735 		return NULL;
1736 
1737 	/* at the root? then there's no next node: */
1738 	if (!btree_path_node(path, path->level + 1)) {
1739 		btree_path_set_level_up(trans, path);
1740 		return NULL;
1741 	}
1742 
1743 	if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
1744 		__bch2_btree_path_unlock(trans, path);
1745 		path->l[path->level].b		= ERR_PTR(-BCH_ERR_no_btree_node_relock);
1746 		path->l[path->level + 1].b	= ERR_PTR(-BCH_ERR_no_btree_node_relock);
1747 		btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1748 		trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
1749 		ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
1750 		goto err;
1751 	}
1752 
1753 	b = btree_path_node(path, path->level + 1);
1754 
1755 	if (bpos_eq(iter->pos, b->key.k.p)) {
1756 		__btree_path_set_level_up(trans, path, path->level++);
1757 	} else {
1758 		/*
1759 		 * Haven't gotten to the end of the parent node: go back down to
1760 		 * the next child node
1761 		 */
1762 		path = iter->path =
1763 			bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos),
1764 					   iter->flags & BTREE_ITER_INTENT,
1765 					   btree_iter_ip_allocated(iter));
1766 
1767 		btree_path_set_level_down(trans, path, iter->min_depth);
1768 
1769 		ret = bch2_btree_path_traverse(trans, path, iter->flags);
1770 		if (ret)
1771 			goto err;
1772 
1773 		b = path->l[path->level].b;
1774 	}
1775 
1776 	bkey_init(&iter->k);
1777 	iter->k.p = iter->pos = b->key.k.p;
1778 
1779 	iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
1780 					iter->flags & BTREE_ITER_INTENT,
1781 					btree_iter_ip_allocated(iter));
1782 	btree_path_set_should_be_locked(iter->path);
1783 	BUG_ON(iter->path->uptodate);
1784 out:
1785 	bch2_btree_iter_verify_entry_exit(iter);
1786 	bch2_btree_iter_verify(iter);
1787 
1788 	return b;
1789 err:
1790 	b = ERR_PTR(ret);
1791 	goto out;
1792 }
1793 
1794 /* Iterate across keys (in leaf nodes only) */
1795 
1796 inline bool bch2_btree_iter_advance(struct btree_iter *iter)
1797 {
1798 	if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) {
1799 		struct bpos pos = iter->k.p;
1800 		bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
1801 			     ? bpos_eq(pos, SPOS_MAX)
1802 			     : bkey_eq(pos, SPOS_MAX));
1803 
1804 		if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
1805 			pos = bkey_successor(iter, pos);
1806 		bch2_btree_iter_set_pos(iter, pos);
1807 		return ret;
1808 	} else {
1809 		if (!btree_path_node(iter->path, iter->path->level))
1810 			return true;
1811 
1812 		iter->advanced = true;
1813 		return false;
1814 	}
1815 }
1816 
1817 inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
1818 {
1819 	struct bpos pos = bkey_start_pos(&iter->k);
1820 	bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
1821 		     ? bpos_eq(pos, POS_MIN)
1822 		     : bkey_eq(pos, POS_MIN));
1823 
1824 	if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
1825 		pos = bkey_predecessor(iter, pos);
1826 	bch2_btree_iter_set_pos(iter, pos);
1827 	return ret;
1828 }
1829 
1830 static noinline
1831 struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter)
1832 {
1833 	struct btree_insert_entry *i;
1834 	struct bkey_i *ret = NULL;
1835 
1836 	trans_for_each_update(iter->trans, i) {
1837 		if (i->btree_id < iter->btree_id)
1838 			continue;
1839 		if (i->btree_id > iter->btree_id)
1840 			break;
1841 		if (bpos_lt(i->k->k.p, iter->path->pos))
1842 			continue;
1843 		if (i->key_cache_already_flushed)
1844 			continue;
1845 		if (!ret || bpos_lt(i->k->k.p, ret->k.p))
1846 			ret = i->k;
1847 	}
1848 
1849 	return ret;
1850 }
1851 
1852 static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter)
1853 {
1854 	return iter->flags & BTREE_ITER_WITH_UPDATES
1855 		? __bch2_btree_trans_peek_updates(iter)
1856 		: NULL;
1857 }
1858 
1859 static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
1860 					      struct btree_iter *iter,
1861 					      struct bpos end_pos)
1862 {
1863 	struct bkey_i *k;
1864 
1865 	if (bpos_lt(iter->path->pos, iter->journal_pos))
1866 		iter->journal_idx = 0;
1867 
1868 	k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
1869 					iter->path->level,
1870 					iter->path->pos,
1871 					end_pos,
1872 					&iter->journal_idx);
1873 
1874 	iter->journal_pos = k ? k->k.p : end_pos;
1875 	return k;
1876 }
1877 
1878 static noinline
1879 struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
1880 					      struct btree_iter *iter)
1881 {
1882 	struct bkey_i *k = bch2_btree_journal_peek(trans, iter, iter->path->pos);
1883 
1884 	if (k) {
1885 		iter->k = k->k;
1886 		return bkey_i_to_s_c(k);
1887 	} else {
1888 		return bkey_s_c_null;
1889 	}
1890 }
1891 
1892 static noinline
1893 struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
1894 					 struct btree_iter *iter,
1895 					 struct bkey_s_c k)
1896 {
1897 	struct bkey_i *next_journal =
1898 		bch2_btree_journal_peek(trans, iter,
1899 				k.k ? k.k->p : path_l(iter->path)->b->key.k.p);
1900 
1901 	if (next_journal) {
1902 		iter->k = next_journal->k;
1903 		k = bkey_i_to_s_c(next_journal);
1904 	}
1905 
1906 	return k;
1907 }
1908 
1909 /*
1910  * Checks the btree key cache for a key at the given position and returns it
1911  * if present, or bkey_s_c_null:
1912  */
1913 static noinline
1914 struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
1915 {
1916 	struct btree_trans *trans = iter->trans;
1917 	struct bch_fs *c = trans->c;
1918 	struct bkey u;
1919 	struct bkey_s_c k;
1920 	int ret;
1921 
1922 	if ((iter->flags & BTREE_ITER_KEY_CACHE_FILL) &&
1923 	    bpos_eq(iter->pos, pos))
1924 		return bkey_s_c_null;
1925 
1926 	if (!bch2_btree_key_cache_find(c, iter->btree_id, pos))
1927 		return bkey_s_c_null;
1928 
1929 	if (!iter->key_cache_path)
1930 		iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos,
1931 						     iter->flags & BTREE_ITER_INTENT, 0,
1932 						     iter->flags|BTREE_ITER_CACHED|
1933 						     BTREE_ITER_CACHED_NOFILL,
1934 						     _THIS_IP_);
1935 
1936 	iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos,
1937 					iter->flags & BTREE_ITER_INTENT,
1938 					btree_iter_ip_allocated(iter));
1939 
1940 	ret =   bch2_btree_path_traverse(trans, iter->key_cache_path,
1941 					 iter->flags|BTREE_ITER_CACHED) ?:
1942 		bch2_btree_path_relock(trans, iter->path, _THIS_IP_);
1943 	if (unlikely(ret))
1944 		return bkey_s_c_err(ret);
1945 
1946 	btree_path_set_should_be_locked(iter->key_cache_path);
1947 
1948 	k = bch2_btree_path_peek_slot(iter->key_cache_path, &u);
1949 	if (k.k && !bkey_err(k)) {
1950 		iter->k = u;
1951 		k.k = &iter->k;
1952 	}
1953 	return k;
1954 }
1955 
1956 static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
1957 {
1958 	struct btree_trans *trans = iter->trans;
1959 	struct bkey_i *next_update;
1960 	struct bkey_s_c k, k2;
1961 	int ret;
1962 
1963 	EBUG_ON(iter->path->cached);
1964 	bch2_btree_iter_verify(iter);
1965 
1966 	while (1) {
1967 		struct btree_path_level *l;
1968 
1969 		iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
1970 					iter->flags & BTREE_ITER_INTENT,
1971 					btree_iter_ip_allocated(iter));
1972 
1973 		ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
1974 		if (unlikely(ret)) {
1975 			/* ensure that iter->k is consistent with iter->pos: */
1976 			bch2_btree_iter_set_pos(iter, iter->pos);
1977 			k = bkey_s_c_err(ret);
1978 			goto out;
1979 		}
1980 
1981 		l = path_l(iter->path);
1982 
1983 		if (unlikely(!l->b)) {
1984 			/* No btree nodes at requested level: */
1985 			bch2_btree_iter_set_pos(iter, SPOS_MAX);
1986 			k = bkey_s_c_null;
1987 			goto out;
1988 		}
1989 
1990 		btree_path_set_should_be_locked(iter->path);
1991 
1992 		k = btree_path_level_peek_all(trans->c, l, &iter->k);
1993 
1994 		if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
1995 		    k.k &&
1996 		    (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
1997 			k = k2;
1998 			ret = bkey_err(k);
1999 			if (ret) {
2000 				bch2_btree_iter_set_pos(iter, iter->pos);
2001 				goto out;
2002 			}
2003 		}
2004 
2005 		if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
2006 			k = btree_trans_peek_journal(trans, iter, k);
2007 
2008 		next_update = btree_trans_peek_updates(iter);
2009 
2010 		if (next_update &&
2011 		    bpos_le(next_update->k.p,
2012 			    k.k ? k.k->p : l->b->key.k.p)) {
2013 			iter->k = next_update->k;
2014 			k = bkey_i_to_s_c(next_update);
2015 		}
2016 
2017 		if (k.k && bkey_deleted(k.k)) {
2018 			/*
2019 			 * If we've got a whiteout, and it's after the search
2020 			 * key, advance the search key to the whiteout instead
2021 			 * of just after the whiteout - it might be a btree
2022 			 * whiteout, with a real key at the same position, since
2023 			 * in the btree, deleted keys sort before non-deleted keys.
2024 			 */
2025 			search_key = !bpos_eq(search_key, k.k->p)
2026 				? k.k->p
2027 				: bpos_successor(k.k->p);
2028 			continue;
2029 		}
2030 
2031 		if (likely(k.k)) {
2032 			break;
2033 		} else if (likely(!bpos_eq(l->b->key.k.p, SPOS_MAX))) {
2034 			/* Advance to next leaf node: */
2035 			search_key = bpos_successor(l->b->key.k.p);
2036 		} else {
2037 			/* End of btree: */
2038 			bch2_btree_iter_set_pos(iter, SPOS_MAX);
2039 			k = bkey_s_c_null;
2040 			goto out;
2041 		}
2042 	}
2043 out:
2044 	bch2_btree_iter_verify(iter);
2045 
2046 	return k;
2047 }
2048 
2049 /**
2050  * bch2_btree_iter_peek_upto() - returns first key greater than or equal to
2051  * iterator's current position
2052  * @iter:	iterator to peek from
2053  * @end:	search limit: returns keys less than or equal to @end
2054  *
2055  * Returns:	key if found, or an error extractable with bkey_err().
2056  */
2057 struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end)
2058 {
2059 	struct btree_trans *trans = iter->trans;
2060 	struct bpos search_key = btree_iter_search_key(iter);
2061 	struct bkey_s_c k;
2062 	struct bpos iter_pos;
2063 	int ret;
2064 
2065 	EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
2066 	EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX));
2067 
2068 	if (iter->update_path) {
2069 		bch2_path_put_nokeep(trans, iter->update_path,
2070 				     iter->flags & BTREE_ITER_INTENT);
2071 		iter->update_path = NULL;
2072 	}
2073 
2074 	bch2_btree_iter_verify_entry_exit(iter);
2075 
2076 	while (1) {
2077 		k = __bch2_btree_iter_peek(iter, search_key);
2078 		if (unlikely(!k.k))
2079 			goto end;
2080 		if (unlikely(bkey_err(k)))
2081 			goto out_no_locked;
2082 
2083 		/*
2084 		 * iter->pos should be monotonically increasing, and always be
2085 		 * equal to the key we just returned - except extents can
2086 		 * straddle iter->pos:
2087 		 */
2088 		if (!(iter->flags & BTREE_ITER_IS_EXTENTS))
2089 			iter_pos = k.k->p;
2090 		else
2091 			iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k));
2092 
2093 		if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS)
2094 			     ? bkey_gt(iter_pos, end)
2095 			     : bkey_ge(iter_pos, end)))
2096 			goto end;
2097 
2098 		if (iter->update_path &&
2099 		    !bkey_eq(iter->update_path->pos, k.k->p)) {
2100 			bch2_path_put_nokeep(trans, iter->update_path,
2101 					     iter->flags & BTREE_ITER_INTENT);
2102 			iter->update_path = NULL;
2103 		}
2104 
2105 		if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
2106 		    (iter->flags & BTREE_ITER_INTENT) &&
2107 		    !(iter->flags & BTREE_ITER_IS_EXTENTS) &&
2108 		    !iter->update_path) {
2109 			struct bpos pos = k.k->p;
2110 
2111 			if (pos.snapshot < iter->snapshot) {
2112 				search_key = bpos_successor(k.k->p);
2113 				continue;
2114 			}
2115 
2116 			pos.snapshot = iter->snapshot;
2117 
2118 			/*
2119 			 * advance, same as on exit for iter->path, but only up
2120 			 * to snapshot
2121 			 */
2122 			__btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT);
2123 			iter->update_path = iter->path;
2124 
2125 			iter->update_path = bch2_btree_path_set_pos(trans,
2126 						iter->update_path, pos,
2127 						iter->flags & BTREE_ITER_INTENT,
2128 						_THIS_IP_);
2129 			ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags);
2130 			if (unlikely(ret)) {
2131 				k = bkey_s_c_err(ret);
2132 				goto out_no_locked;
2133 			}
2134 		}
2135 
2136 		/*
2137 		 * We can never have a key in a leaf node at POS_MAX, so
2138 		 * we don't have to check these successor() calls:
2139 		 */
2140 		if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
2141 		    !bch2_snapshot_is_ancestor(trans->c,
2142 					       iter->snapshot,
2143 					       k.k->p.snapshot)) {
2144 			search_key = bpos_successor(k.k->p);
2145 			continue;
2146 		}
2147 
2148 		if (bkey_whiteout(k.k) &&
2149 		    !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
2150 			search_key = bkey_successor(iter, k.k->p);
2151 			continue;
2152 		}
2153 
2154 		break;
2155 	}
2156 
2157 	iter->pos = iter_pos;
2158 
2159 	iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
2160 				iter->flags & BTREE_ITER_INTENT,
2161 				btree_iter_ip_allocated(iter));
2162 
2163 	btree_path_set_should_be_locked(iter->path);
2164 out_no_locked:
2165 	if (iter->update_path) {
2166 		ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_);
2167 		if (unlikely(ret))
2168 			k = bkey_s_c_err(ret);
2169 		else
2170 			btree_path_set_should_be_locked(iter->update_path);
2171 	}
2172 
2173 	if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
2174 		iter->pos.snapshot = iter->snapshot;
2175 
2176 	ret = bch2_btree_iter_verify_ret(iter, k);
2177 	if (unlikely(ret)) {
2178 		bch2_btree_iter_set_pos(iter, iter->pos);
2179 		k = bkey_s_c_err(ret);
2180 	}
2181 
2182 	bch2_btree_iter_verify_entry_exit(iter);
2183 
2184 	return k;
2185 end:
2186 	bch2_btree_iter_set_pos(iter, end);
2187 	k = bkey_s_c_null;
2188 	goto out_no_locked;
2189 }
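
/*
 * Example (illustrative sketch, not part of this file): walking the keys in
 * one inode's extents with bch2_btree_iter_peek_upto(). Assumes the caller
 * already holds a btree_trans; trans, inum and snapshot are hypothetical
 * caller-provided values:
 *
 *	struct btree_iter iter;
 *	struct bkey_s_c k;
 *	int ret;
 *
 *	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
 *			     SPOS(inum, 0, snapshot), 0);
 *	while (1) {
 *		k = bch2_btree_iter_peek_upto(&iter, POS(inum, U64_MAX));
 *		ret = bkey_err(k);
 *		if (ret || !k.k)
 *			break;
 *
 *		(process k here)
 *
 *		bch2_btree_iter_advance(&iter);
 *	}
 *	bch2_trans_iter_exit(trans, &iter);
 *
 * Real callers typically use the for_each_btree_key_upto() wrappers instead,
 * which also handle transaction restarts via bch2_trans_begin().
 */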
2190 
2191 /**
2192  * bch2_btree_iter_peek_all_levels() - returns the first key greater than or
2193  * equal to iterator's current position, returning keys from every level of the
2194  * btree. For keys at different levels of the btree that compare equal, the key
2195  * from the lower level (leaf) is returned first.
2196  * @iter:	iterator to peek from
2197  *
2198  * Returns:	key if found, or an error extractable with bkey_err().
2199  */
2200 struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter)
2201 {
2202 	struct btree_trans *trans = iter->trans;
2203 	struct bkey_s_c k;
2204 	int ret;
2205 
2206 	EBUG_ON(iter->path->cached);
2207 	bch2_btree_iter_verify(iter);
2208 	BUG_ON(iter->path->level < iter->min_depth);
2209 	BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
2210 	EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS));
2211 
2212 	while (1) {
2213 		iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos,
2214 					iter->flags & BTREE_ITER_INTENT,
2215 					btree_iter_ip_allocated(iter));
2216 
2217 		ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
2218 		if (unlikely(ret)) {
2219 			/* ensure that iter->k is consistent with iter->pos: */
2220 			bch2_btree_iter_set_pos(iter, iter->pos);
2221 			k = bkey_s_c_err(ret);
2222 			goto out_no_locked;
2223 		}
2224 
2225 		/* Already at end? */
2226 		if (!btree_path_node(iter->path, iter->path->level)) {
2227 			k = bkey_s_c_null;
2228 			goto out_no_locked;
2229 		}
2230 
2231 		k = btree_path_level_peek_all(trans->c,
2232 				&iter->path->l[iter->path->level], &iter->k);
2233 
2234 		/* Check if we should go up to the parent node: */
2235 		if (!k.k ||
2236 		    (iter->advanced &&
2237 		     bpos_eq(path_l(iter->path)->b->key.k.p, iter->pos))) {
2238 			iter->pos = path_l(iter->path)->b->key.k.p;
2239 			btree_path_set_level_up(trans, iter->path);
2240 			iter->advanced = false;
2241 			continue;
2242 		}
2243 
2244 		/*
2245 		 * Check if we should go back down to a leaf:
2246 		 * If we're not in a leaf node, we only return the current key
2247 		 * if it exactly matches iter->pos - otherwise we first have to
2248 		 * go back to the leaf:
2249 		 */
2250 		if (iter->path->level != iter->min_depth &&
2251 		    (iter->advanced ||
2252 		     !k.k ||
2253 		     !bpos_eq(iter->pos, k.k->p))) {
2254 			btree_path_set_level_down(trans, iter->path, iter->min_depth);
2255 			iter->pos = bpos_successor(iter->pos);
2256 			iter->advanced = false;
2257 			continue;
2258 		}
2259 
2260 		/* Check if we should go to the next key: */
2261 		if (iter->path->level == iter->min_depth &&
2262 		    iter->advanced &&
2263 		    k.k &&
2264 		    bpos_eq(iter->pos, k.k->p)) {
2265 			iter->pos = bpos_successor(iter->pos);
2266 			iter->advanced = false;
2267 			continue;
2268 		}
2269 
2270 		if (iter->advanced &&
2271 		    iter->path->level == iter->min_depth &&
2272 		    !bpos_eq(k.k->p, iter->pos))
2273 			iter->advanced = false;
2274 
2275 		BUG_ON(iter->advanced);
2276 		BUG_ON(!k.k);
2277 		break;
2278 	}
2279 
2280 	iter->pos = k.k->p;
2281 	btree_path_set_should_be_locked(iter->path);
2282 out_no_locked:
2283 	bch2_btree_iter_verify(iter);
2284 
2285 	return k;
2286 }
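
/*
 * Illustrative sketch (not from this file): given an iterator whose flags
 * include BTREE_ITER_ALL_LEVELS|BTREE_ITER_ALL_SNAPSHOTS (both are required
 * by the assertions above), keys from every level can be walked with:
 *
 *	for (k = bch2_btree_iter_peek_all_levels(&iter);
 *	     k.k && !bkey_err(k);
 *	     k = bch2_btree_iter_peek_all_levels(&iter)) {
 *		(k.k->p is the key's position, iter.path->level its level)
 *		bch2_btree_iter_advance(&iter);
 *	}
 *
 * With BTREE_ITER_ALL_LEVELS set, bch2_btree_iter_advance() only marks the
 * iterator as advanced; the next peek call does the actual repositioning.
 */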
2287 
2288 /**
2289  * bch2_btree_iter_next() - returns first key greater than iterator's current
2290  * position
2291  * @iter:	iterator to peek from
2292  *
2293  * Returns:	key if found, or an error extractable with bkey_err().
2294  */
2295 struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
2296 {
2297 	if (!bch2_btree_iter_advance(iter))
2298 		return bkey_s_c_null;
2299 
2300 	return bch2_btree_iter_peek(iter);
2301 }
2302 
2303 /**
2304  * bch2_btree_iter_peek_prev() - returns first key less than or equal to
2305  * iterator's current position
2306  * @iter:	iterator to peek from
2307  *
2308  * Returns:	key if found, or an error extractable with bkey_err().
2309  */
2310 struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
2311 {
2312 	struct btree_trans *trans = iter->trans;
2313 	struct bpos search_key = iter->pos;
2314 	struct btree_path *saved_path = NULL;
2315 	struct bkey_s_c k;
2316 	struct bkey saved_k;
2317 	const struct bch_val *saved_v;
2318 	int ret;
2319 
2320 	EBUG_ON(iter->path->cached || iter->path->level);
2321 	EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES);
2322 
2323 	if (iter->flags & BTREE_ITER_WITH_JOURNAL)
2324 		return bkey_s_c_err(-EIO);
2325 
2326 	bch2_btree_iter_verify(iter);
2327 	bch2_btree_iter_verify_entry_exit(iter);
2328 
2329 	if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
2330 		search_key.snapshot = U32_MAX;
2331 
2332 	while (1) {
2333 		iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
2334 						iter->flags & BTREE_ITER_INTENT,
2335 						btree_iter_ip_allocated(iter));
2336 
2337 		ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
2338 		if (unlikely(ret)) {
2339 			/* ensure that iter->k is consistent with iter->pos: */
2340 			bch2_btree_iter_set_pos(iter, iter->pos);
2341 			k = bkey_s_c_err(ret);
2342 			goto out_no_locked;
2343 		}
2344 
2345 		k = btree_path_level_peek(trans, iter->path,
2346 					  &iter->path->l[0], &iter->k);
2347 		if (!k.k ||
2348 		    ((iter->flags & BTREE_ITER_IS_EXTENTS)
2349 		     ? bpos_ge(bkey_start_pos(k.k), search_key)
2350 		     : bpos_gt(k.k->p, search_key)))
2351 			k = btree_path_level_prev(trans, iter->path,
2352 						  &iter->path->l[0], &iter->k);
2353 
2354 		if (likely(k.k)) {
2355 			if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) {
2356 				if (k.k->p.snapshot == iter->snapshot)
2357 					goto got_key;
2358 
2359 				/*
2360 				 * If we have a saved candidate, and we're no
2361 				 * longer at the same _key_ (not pos), return
2362 				 * that candidate
2363 				 */
2364 				if (saved_path && !bkey_eq(k.k->p, saved_k.p)) {
2365 					bch2_path_put_nokeep(trans, iter->path,
2366 						      iter->flags & BTREE_ITER_INTENT);
2367 					iter->path = saved_path;
2368 					saved_path = NULL;
2369 					iter->k	= saved_k;
2370 					k.v	= saved_v;
2371 					goto got_key;
2372 				}
2373 
2374 				if (bch2_snapshot_is_ancestor(iter->trans->c,
2375 							      iter->snapshot,
2376 							      k.k->p.snapshot)) {
2377 					if (saved_path)
2378 						bch2_path_put_nokeep(trans, saved_path,
2379 						      iter->flags & BTREE_ITER_INTENT);
2380 					saved_path = btree_path_clone(trans, iter->path,
2381 								iter->flags & BTREE_ITER_INTENT);
2382 					saved_k = *k.k;
2383 					saved_v = k.v;
2384 				}
2385 
2386 				search_key = bpos_predecessor(k.k->p);
2387 				continue;
2388 			}
2389 got_key:
2390 			if (bkey_whiteout(k.k) &&
2391 			    !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
2392 				search_key = bkey_predecessor(iter, k.k->p);
2393 				if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
2394 					search_key.snapshot = U32_MAX;
2395 				continue;
2396 			}
2397 
2398 			break;
2399 		} else if (likely(!bpos_eq(iter->path->l[0].b->data->min_key, POS_MIN))) {
2400 			/* Advance to previous leaf node: */
2401 			search_key = bpos_predecessor(iter->path->l[0].b->data->min_key);
2402 		} else {
2403 			/* Start of btree: */
2404 			bch2_btree_iter_set_pos(iter, POS_MIN);
2405 			k = bkey_s_c_null;
2406 			goto out_no_locked;
2407 		}
2408 	}
2409 
2410 	EBUG_ON(bkey_gt(bkey_start_pos(k.k), iter->pos));
2411 
2412 	/* Extents can straddle iter->pos: */
2413 	if (bkey_lt(k.k->p, iter->pos))
2414 		iter->pos = k.k->p;
2415 
2416 	if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
2417 		iter->pos.snapshot = iter->snapshot;
2418 
2419 	btree_path_set_should_be_locked(iter->path);
2420 out_no_locked:
2421 	if (saved_path)
2422 		bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT);
2423 
2424 	bch2_btree_iter_verify_entry_exit(iter);
2425 	bch2_btree_iter_verify(iter);
2426 
2427 	return k;
2428 }
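
/*
 * Illustrative sketch (not from this file): iterating backwards from the
 * iterator's current position with bch2_btree_iter_peek_prev() and
 * bch2_btree_iter_prev(). Per the assertions above, this path can't be used
 * with BTREE_ITER_WITH_UPDATES and returns an error while journal replay is
 * still in progress (BTREE_ITER_WITH_JOURNAL):
 *
 *	for (k = bch2_btree_iter_peek_prev(&iter);
 *	     k.k && !bkey_err(k);
 *	     k = bch2_btree_iter_prev(&iter)) {
 *		(keys are visited in descending key order)
 *	}
 */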
2429 
2430 /**
2431  * bch2_btree_iter_prev() - returns first key less than iterator's current
2432  * position
2433  * @iter:	iterator to peek from
2434  *
2435  * Returns:	key if found, or an error extractable with bkey_err().
2436  */
2437 struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
2438 {
2439 	if (!bch2_btree_iter_rewind(iter))
2440 		return bkey_s_c_null;
2441 
2442 	return bch2_btree_iter_peek_prev(iter);
2443 }
2444 
2445 struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
2446 {
2447 	struct btree_trans *trans = iter->trans;
2448 	struct bpos search_key;
2449 	struct bkey_s_c k;
2450 	int ret;
2451 
2452 	bch2_btree_iter_verify(iter);
2453 	bch2_btree_iter_verify_entry_exit(iter);
2454 	EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
2455 	EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE));
2456 
2457 	/* extents can't span inode numbers: */
2458 	if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
2459 	    unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
2460 		if (iter->pos.inode == KEY_INODE_MAX)
2461 			return bkey_s_c_null;
2462 
2463 		bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
2464 	}
2465 
2466 	search_key = btree_iter_search_key(iter);
2467 	iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
2468 					iter->flags & BTREE_ITER_INTENT,
2469 					btree_iter_ip_allocated(iter));
2470 
2471 	ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
2472 	if (unlikely(ret)) {
2473 		k = bkey_s_c_err(ret);
2474 		goto out_no_locked;
2475 	}
2476 
2477 	if ((iter->flags & BTREE_ITER_CACHED) ||
2478 	    !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
2479 		struct bkey_i *next_update;
2480 
2481 		if ((next_update = btree_trans_peek_updates(iter)) &&
2482 		    bpos_eq(next_update->k.p, iter->pos)) {
2483 			iter->k = next_update->k;
2484 			k = bkey_i_to_s_c(next_update);
2485 			goto out;
2486 		}
2487 
2488 		if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) &&
2489 		    (k = btree_trans_peek_slot_journal(trans, iter)).k)
2490 			goto out;
2491 
2492 		if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
2493 		    (k = btree_trans_peek_key_cache(iter, iter->pos)).k) {
2494 			if (!bkey_err(k))
2495 				iter->k = *k.k;
2496 			/* We're not returning a key from iter->path: */
2497 			goto out_no_locked;
2498 		}
2499 
2500 		k = bch2_btree_path_peek_slot(iter->path, &iter->k);
2501 		if (unlikely(!k.k))
2502 			goto out_no_locked;
2503 	} else {
2504 		struct bpos next;
2505 		struct bpos end = iter->pos;
2506 
2507 		if (iter->flags & BTREE_ITER_IS_EXTENTS)
2508 			end.offset = U64_MAX;
2509 
2510 		EBUG_ON(iter->path->level);
2511 
2512 		if (iter->flags & BTREE_ITER_INTENT) {
2513 			struct btree_iter iter2;
2514 
2515 			bch2_trans_copy_iter(&iter2, iter);
2516 			k = bch2_btree_iter_peek_upto(&iter2, end);
2517 
2518 			if (k.k && !bkey_err(k)) {
2519 				iter->k = iter2.k;
2520 				k.k = &iter->k;
2521 			}
2522 			bch2_trans_iter_exit(trans, &iter2);
2523 		} else {
2524 			struct bpos pos = iter->pos;
2525 
2526 			k = bch2_btree_iter_peek_upto(iter, end);
2527 			if (unlikely(bkey_err(k)))
2528 				bch2_btree_iter_set_pos(iter, pos);
2529 			else
2530 				iter->pos = pos;
2531 		}
2532 
2533 		if (unlikely(bkey_err(k)))
2534 			goto out_no_locked;
2535 
2536 		next = k.k ? bkey_start_pos(k.k) : POS_MAX;
2537 
2538 		if (bkey_lt(iter->pos, next)) {
2539 			bkey_init(&iter->k);
2540 			iter->k.p = iter->pos;
2541 
2542 			if (iter->flags & BTREE_ITER_IS_EXTENTS) {
2543 				bch2_key_resize(&iter->k,
2544 						min_t(u64, KEY_SIZE_MAX,
2545 						      (next.inode == iter->pos.inode
2546 						       ? next.offset
2547 						       : KEY_OFFSET_MAX) -
2548 						      iter->pos.offset));
2549 				EBUG_ON(!iter->k.size);
2550 			}
2551 
2552 			k = (struct bkey_s_c) { &iter->k, NULL };
2553 		}
2554 	}
2555 out:
2556 	btree_path_set_should_be_locked(iter->path);
2557 out_no_locked:
2558 	bch2_btree_iter_verify_entry_exit(iter);
2559 	bch2_btree_iter_verify(iter);
2560 	ret = bch2_btree_iter_verify_ret(iter, k);
2561 	if (unlikely(ret))
2562 		return bkey_s_c_err(ret);
2563 
2564 	return k;
2565 }
2566 
2567 struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
2568 {
2569 	if (!bch2_btree_iter_advance(iter))
2570 		return bkey_s_c_null;
2571 
2572 	return bch2_btree_iter_peek_slot(iter);
2573 }
2574 
2575 struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
2576 {
2577 	if (!bch2_btree_iter_rewind(iter))
2578 		return bkey_s_c_null;
2579 
2580 	return bch2_btree_iter_peek_slot(iter);
2581 }
2582 
2583 struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter)
2584 {
2585 	struct bkey_s_c k;
2586 
2587 	while (btree_trans_too_many_iters(iter->trans) ||
2588 	       (k = bch2_btree_iter_peek_type(iter, iter->flags),
2589 		bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
2590 		bch2_trans_begin(iter->trans);
2591 
2592 	return k;
2593 }
2594 
2595 /* new transactional stuff: */
2596 
2597 #ifdef CONFIG_BCACHEFS_DEBUG
2598 static void btree_trans_verify_sorted_refs(struct btree_trans *trans)
2599 {
2600 	struct btree_path *path;
2601 	unsigned i;
2602 
2603 	BUG_ON(trans->nr_sorted != hweight64(trans->paths_allocated));
2604 
2605 	trans_for_each_path(trans, path) {
2606 		BUG_ON(path->sorted_idx >= trans->nr_sorted);
2607 		BUG_ON(trans->sorted[path->sorted_idx] != path->idx);
2608 	}
2609 
2610 	for (i = 0; i < trans->nr_sorted; i++) {
2611 		unsigned idx = trans->sorted[i];
2612 
2613 		EBUG_ON(!(trans->paths_allocated & (1ULL << idx)));
2614 		BUG_ON(trans->paths[idx].sorted_idx != i);
2615 	}
2616 }
2617 
2618 static void btree_trans_verify_sorted(struct btree_trans *trans)
2619 {
2620 	struct btree_path *path, *prev = NULL;
2621 	unsigned i;
2622 
2623 	if (!bch2_debug_check_iterators)
2624 		return;
2625 
2626 	trans_for_each_path_inorder(trans, path, i) {
2627 		if (prev && btree_path_cmp(prev, path) > 0) {
2628 			__bch2_dump_trans_paths_updates(trans, true);
2629 			panic("trans paths out of order!\n");
2630 		}
2631 		prev = path;
2632 	}
2633 }
2634 #else
2635 static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {}
2636 static inline void btree_trans_verify_sorted(struct btree_trans *trans) {}
2637 #endif
2638 
2639 void __bch2_btree_trans_sort_paths(struct btree_trans *trans)
2640 {
2641 	int i, l = 0, r = trans->nr_sorted, inc = 1;
2642 	bool swapped;
2643 
2644 	btree_trans_verify_sorted_refs(trans);
2645 
2646 	if (trans->paths_sorted)
2647 		goto out;
2648 
2649 	/*
2650 	 * Cocktail shaker sort: this is efficient because iterators will be
2651 	 * mostly sorted.
2652 	 */
2653 	do {
2654 		swapped = false;
2655 
2656 		for (i = inc > 0 ? l : r - 2;
2657 		     i + 1 < r && i >= l;
2658 		     i += inc) {
2659 			if (btree_path_cmp(trans->paths + trans->sorted[i],
2660 					   trans->paths + trans->sorted[i + 1]) > 0) {
2661 				swap(trans->sorted[i], trans->sorted[i + 1]);
2662 				trans->paths[trans->sorted[i]].sorted_idx = i;
2663 				trans->paths[trans->sorted[i + 1]].sorted_idx = i + 1;
2664 				swapped = true;
2665 			}
2666 		}
2667 
2668 		if (inc > 0)
2669 			--r;
2670 		else
2671 			l++;
2672 		inc = -inc;
2673 	} while (swapped);
2674 
2675 	trans->paths_sorted = true;
2676 out:
2677 	btree_trans_verify_sorted(trans);
2678 }
2679 
2680 static inline void btree_path_list_remove(struct btree_trans *trans,
2681 					  struct btree_path *path)
2682 {
2683 	unsigned i;
2684 
2685 	EBUG_ON(path->sorted_idx >= trans->nr_sorted);
2686 #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
2687 	trans->nr_sorted--;
2688 	memmove_u64s_down_small(trans->sorted + path->sorted_idx,
2689 				trans->sorted + path->sorted_idx + 1,
2690 				DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8));
2691 #else
2692 	array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx);
2693 #endif
2694 	for (i = path->sorted_idx; i < trans->nr_sorted; i++)
2695 		trans->paths[trans->sorted[i]].sorted_idx = i;
2696 
2697 	path->sorted_idx = U8_MAX;
2698 }
2699 
2700 static inline void btree_path_list_add(struct btree_trans *trans,
2701 				       struct btree_path *pos,
2702 				       struct btree_path *path)
2703 {
2704 	unsigned i;
2705 
2706 	path->sorted_idx = pos ? pos->sorted_idx + 1 : trans->nr_sorted;
2707 
2708 #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
2709 	memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1,
2710 			      trans->sorted + path->sorted_idx,
2711 			      DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8));
2712 	trans->nr_sorted++;
2713 	trans->sorted[path->sorted_idx] = path->idx;
2714 #else
2715 	array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx);
2716 #endif
2717 
2718 	for (i = path->sorted_idx; i < trans->nr_sorted; i++)
2719 		trans->paths[trans->sorted[i]].sorted_idx = i;
2720 
2721 	btree_trans_verify_sorted_refs(trans);
2722 }
2723 
2724 void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
2725 {
2726 	if (iter->update_path)
2727 		bch2_path_put_nokeep(trans, iter->update_path,
2728 			      iter->flags & BTREE_ITER_INTENT);
2729 	if (iter->path)
2730 		bch2_path_put(trans, iter->path,
2731 			      iter->flags & BTREE_ITER_INTENT);
2732 	if (iter->key_cache_path)
2733 		bch2_path_put(trans, iter->key_cache_path,
2734 			      iter->flags & BTREE_ITER_INTENT);
2735 	iter->path = NULL;
2736 	iter->update_path = NULL;
2737 	iter->key_cache_path = NULL;
2738 }
2739 
2740 void bch2_trans_iter_init_outlined(struct btree_trans *trans,
2741 			  struct btree_iter *iter,
2742 			  enum btree_id btree_id, struct bpos pos,
2743 			  unsigned flags)
2744 {
2745 	bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
2746 			       bch2_btree_iter_flags(trans, btree_id, flags),
2747 			       _RET_IP_);
2748 }
2749 
2750 void bch2_trans_node_iter_init(struct btree_trans *trans,
2751 			       struct btree_iter *iter,
2752 			       enum btree_id btree_id,
2753 			       struct bpos pos,
2754 			       unsigned locks_want,
2755 			       unsigned depth,
2756 			       unsigned flags)
2757 {
2758 	flags |= BTREE_ITER_NOT_EXTENTS;
2759 	flags |= __BTREE_ITER_ALL_SNAPSHOTS;
2760 	flags |= BTREE_ITER_ALL_SNAPSHOTS;
2761 
2762 	bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth,
2763 			       __bch2_btree_iter_flags(trans, btree_id, flags),
2764 			       _RET_IP_);
2765 
2766 	iter->min_depth	= depth;
2767 
2768 	BUG_ON(iter->path->locks_want	 < min(locks_want, BTREE_MAX_DEPTH));
2769 	BUG_ON(iter->path->level	!= depth);
2770 	BUG_ON(iter->min_depth		!= depth);
2771 }
2772 
2773 void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
2774 {
2775 	*dst = *src;
2776 	if (src->path)
2777 		__btree_path_get(src->path, src->flags & BTREE_ITER_INTENT);
2778 	if (src->update_path)
2779 		__btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT);
2780 	dst->key_cache_path = NULL;
2781 }
2782 
2783 void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
2784 {
2785 	unsigned new_top = trans->mem_top + size;
2786 	size_t old_bytes = trans->mem_bytes;
2787 	size_t new_bytes = roundup_pow_of_two(new_top);
2788 	int ret;
2789 	void *new_mem;
2790 	void *p;
2791 
2792 	trans->mem_max = max(trans->mem_max, new_top);
2793 
2794 	WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
2795 
2796 	new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
2797 	if (unlikely(!new_mem)) {
2798 		bch2_trans_unlock(trans);
2799 
2800 		new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL);
2801 		if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
2802 			new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
2803 			new_bytes = BTREE_TRANS_MEM_MAX;
2804 			kfree(trans->mem);
2805 		}
2806 
2807 		if (!new_mem)
2808 			return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
2809 
2810 		trans->mem = new_mem;
2811 		trans->mem_bytes = new_bytes;
2812 
2813 		ret = bch2_trans_relock(trans);
2814 		if (ret)
2815 			return ERR_PTR(ret);
2816 	}
2817 
2818 	trans->mem = new_mem;
2819 	trans->mem_bytes = new_bytes;
2820 
2821 	if (old_bytes) {
2822 		trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
2823 		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
2824 	}
2825 
2826 	p = trans->mem + trans->mem_top;
2827 	trans->mem_top += size;
2828 	memset(p, 0, size);
2829 	return p;
2830 }
2831 
2832 static noinline void bch2_trans_reset_srcu_lock(struct btree_trans *trans)
2833 {
2834 	struct bch_fs *c = trans->c;
2835 	struct btree_path *path;
2836 
2837 	trans_for_each_path(trans, path)
2838 		if (path->cached && !btree_node_locked(path, 0))
2839 			path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset);
2840 
2841 	srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
2842 	trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
2843 	trans->srcu_lock_time	= jiffies;
2844 }
2845 
2846 /**
2847  * bch2_trans_begin() - reset a transaction after an interrupted attempt
2848  * @trans: transaction to reset
2849  *
2850  * Returns:	current restart counter, to be used with trans_was_restarted()
2851  *
2852  * While iterating over nodes or updating nodes, an attempt to lock a btree
2853  * node may return BCH_ERR_transaction_restart when the trylock fails. When
2854  * this occurs, bch2_trans_begin() should be called and the transaction retried.
2855  */
2856 u32 bch2_trans_begin(struct btree_trans *trans)
2857 {
2858 	struct btree_path *path;
2859 	u64 now;
2860 
2861 	bch2_trans_reset_updates(trans);
2862 
2863 	trans->restart_count++;
2864 	trans->mem_top			= 0;
2865 
2866 	trans_for_each_path(trans, path) {
2867 		path->should_be_locked = false;
2868 
2869 		/*
2870 		 * If the transaction wasn't restarted, we're presuming to be
2871 		 * doing something new: don't keep iterators except the ones
2872 		 * that are in use or on the subvolumes btree:
2873 		 */
2874 		if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes)
2875 			path->preserve = false;
2876 
2877 		/*
2878 		 * XXX: we probably shouldn't be doing this if the transaction
2879 		 * was restarted, but currently we still overflow transaction
2880 		 * iterators if we do that
2881 		 */
2882 		if (!path->ref && !path->preserve)
2883 			__bch2_path_free(trans, path);
2884 		else
2885 			path->preserve = false;
2886 	}
2887 
2888 	now = local_clock();
2889 	if (!trans->restarted &&
2890 	    (need_resched() ||
2891 	     now - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) {
2892 		drop_locks_do(trans, (cond_resched(), 0));
2893 		now = local_clock();
2894 	}
2895 	trans->last_begin_time = now;
2896 
2897 	if (unlikely(time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10))))
2898 		bch2_trans_reset_srcu_lock(trans);
2899 
2900 	trans->last_begin_ip = _RET_IP_;
2901 	if (trans->restarted) {
2902 		bch2_btree_path_traverse_all(trans);
2903 		trans->notrace_relock_fail = false;
2904 	}
2905 
2906 	return trans->restart_count;
2907 }
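
/*
 * Illustrative sketch (not part of this file) of the retry pattern described
 * in the comment above; do_btree_work() is a hypothetical helper, and real
 * callers normally use wrappers such as lockrestart_do() or
 * for_each_btree_key() rather than open-coding the loop:
 *
 *	int ret;
 *
 *	do {
 *		bch2_trans_begin(trans);
 *		ret = do_btree_work(trans);
 *	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
 *
 * The value returned by bch2_trans_begin() can be saved and later passed to
 * trans_was_restarted() to detect whether a restart happened in between.
 */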
2908 
2909 static struct btree_trans *bch2_trans_alloc(struct bch_fs *c)
2910 {
2911 	struct btree_trans *trans;
2912 
2913 	if (IS_ENABLED(__KERNEL__)) {
2914 		trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
2915 		if (trans)
2916 			return trans;
2917 	}
2918 
2919 	trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
2920 	/*
2921 	 * paths need to be zeroed, since bch2_check_for_deadlock looks at
2922 	 * paths in other threads
2923 	 */
2924 	memset(&trans->paths, 0, sizeof(trans->paths));
2925 	return trans;
2926 }
2927 
2928 const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
2929 
2930 unsigned bch2_trans_get_fn_idx(const char *fn)
2931 {
2932 	unsigned i;
2933 
2934 	for (i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++)
2935 		if (!bch2_btree_transaction_fns[i] ||
2936 		    bch2_btree_transaction_fns[i] == fn) {
2937 			bch2_btree_transaction_fns[i] = fn;
2938 			return i;
2939 		}
2940 
2941 	pr_warn_once("BCH_TRANSACTIONS_NR not big enough!");
2942 	return i;
2943 }
2944 
2945 struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
2946 	__acquires(&c->btree_trans_barrier)
2947 {
2948 	struct btree_trans *trans;
2949 	struct btree_transaction_stats *s;
2950 
2951 	trans = bch2_trans_alloc(c);
2952 
2953 	memset(trans, 0, sizeof(*trans));
2954 	trans->c		= c;
2955 	trans->fn		= fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns)
2956 		? bch2_btree_transaction_fns[fn_idx] : NULL;
2957 	trans->last_begin_time	= local_clock();
2958 	trans->fn_idx		= fn_idx;
2959 	trans->locking_wait.task = current;
2960 	trans->journal_replay_not_finished =
2961 		!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
2962 	closure_init_stack(&trans->ref);
2963 
2964 	s = btree_trans_stats(trans);
2965 	if (s && s->max_mem) {
2966 		unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
2967 
2968 		trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
2969 
2970 		if (unlikely(!trans->mem)) {
2971 			trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
2972 			trans->mem_bytes = BTREE_TRANS_MEM_MAX;
2973 		} else {
2974 			trans->mem_bytes = expected_mem_bytes;
2975 		}
2976 	}
2977 
2978 	if (s) {
2979 		trans->nr_max_paths = s->nr_max_paths;
2980 		trans->wb_updates_size = s->wb_updates_size;
2981 	}
2982 
2983 	trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
2984 	trans->srcu_lock_time	= jiffies;
2985 
2986 	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
2987 		struct btree_trans *pos;
2988 
2989 		seqmutex_lock(&c->btree_trans_lock);
2990 		list_for_each_entry(pos, &c->btree_trans_list, list) {
2991 			/*
2992 			 * We'd much prefer to be stricter here and completely
2993 			 * disallow multiple btree_trans in the same thread -
2994 			 * but the data move path calls bch2_write when we
2995 			 * already have a btree_trans initialized.
2996 			 */
2997 			BUG_ON(trans->locking_wait.task->pid == pos->locking_wait.task->pid &&
2998 			       bch2_trans_locked(pos));
2999 
3000 			if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) {
3001 				list_add_tail(&trans->list, &pos->list);
3002 				goto list_add_done;
3003 			}
3004 		}
3005 		list_add_tail(&trans->list, &c->btree_trans_list);
3006 list_add_done:
3007 		seqmutex_unlock(&c->btree_trans_lock);
3008 	}
3009 
3010 	return trans;
3011 }
3012 
3013 static void check_btree_paths_leaked(struct btree_trans *trans)
3014 {
3015 #ifdef CONFIG_BCACHEFS_DEBUG
3016 	struct bch_fs *c = trans->c;
3017 	struct btree_path *path;
3018 
3019 	trans_for_each_path(trans, path)
3020 		if (path->ref)
3021 			goto leaked;
3022 	return;
3023 leaked:
3024 	bch_err(c, "btree paths leaked from %s!", trans->fn);
3025 	trans_for_each_path(trans, path)
3026 		if (path->ref)
3027 			printk(KERN_ERR "  btree %s %pS\n",
3028 			       bch2_btree_ids[path->btree_id],
3029 			       (void *) path->ip_allocated);
3030 	/* Be noisy about this: */
3031 	bch2_fatal_error(c);
3032 #endif
3033 }
3034 
3035 void bch2_trans_put(struct btree_trans *trans)
3036 	__releases(&c->btree_trans_barrier)
3037 {
3038 	struct btree_insert_entry *i;
3039 	struct bch_fs *c = trans->c;
3040 	struct btree_transaction_stats *s = btree_trans_stats(trans);
3041 
3042 	bch2_trans_unlock(trans);
3043 
3044 	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
3045 		seqmutex_lock(&c->btree_trans_lock);
3046 		list_del(&trans->list);
3047 		seqmutex_unlock(&c->btree_trans_lock);
3048 	}
3049 
3050 	closure_sync(&trans->ref);
3051 
3052 	if (s)
3053 		s->max_mem = max(s->max_mem, trans->mem_max);
3054 
3055 	trans_for_each_update(trans, i)
3056 		__btree_path_put(i->path, true);
3057 	trans->nr_updates		= 0;
3058 
3059 	check_btree_paths_leaked(trans);
3060 
3061 	srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
3062 
3063 	bch2_journal_preres_put(&c->journal, &trans->journal_preres);
3064 
3065 	kfree(trans->extra_journal_entries.data);
3066 
3067 	if (trans->fs_usage_deltas) {
3068 		if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) ==
3069 		    REPLICAS_DELTA_LIST_MAX)
3070 			mempool_free(trans->fs_usage_deltas,
3071 				     &c->replicas_delta_pool);
3072 		else
3073 			kfree(trans->fs_usage_deltas);
3074 	}
3075 
3076 	if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
3077 		mempool_free(trans->mem, &c->btree_trans_mem_pool);
3078 	else
3079 		kfree(trans->mem);
3080 
3081 	/* Userspace doesn't have a real percpu implementation: */
3082 	if (IS_ENABLED(__KERNEL__))
3083 		trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans);
3084 	if (trans)
3085 		mempool_free(trans, &c->btree_trans_pool);
3086 }
3087 
3088 static void __maybe_unused
3089 bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
3090 				      struct btree_bkey_cached_common *b)
3091 {
3092 	struct six_lock_count c = six_lock_counts(&b->lock);
3093 	struct task_struct *owner;
3094 	pid_t pid;
3095 
3096 	rcu_read_lock();
3097 	owner = READ_ONCE(b->lock.owner);
3098 	pid = owner ? owner->pid : 0;
3099 	rcu_read_unlock();
3100 
3101 	prt_tab(out);
3102 	prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b',
3103 		   b->level, bch2_btree_ids[b->btree_id]);
3104 	bch2_bpos_to_text(out, btree_node_pos(b));
3105 
3106 	prt_tab(out);
3107 	prt_printf(out, " locks %u:%u:%u held by pid %u",
3108 		   c.n[0], c.n[1], c.n[2], pid);
3109 }
3110 
3111 void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
3112 {
3113 	struct btree_path *path;
3114 	struct btree_bkey_cached_common *b;
3115 	static char lock_types[] = { 'r', 'i', 'w' };
3116 	unsigned l, idx;
3117 
3118 	if (!out->nr_tabstops) {
3119 		printbuf_tabstop_push(out, 16);
3120 		printbuf_tabstop_push(out, 32);
3121 	}
3122 
3123 	prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn);
3124 
3125 	trans_for_each_path_safe(trans, path, idx) {
3126 		if (!path->nodes_locked)
3127 			continue;
3128 
3129 		prt_printf(out, "  path %u %c l=%u %s:",
3130 		       path->idx,
3131 		       path->cached ? 'c' : 'b',
3132 		       path->level,
3133 		       bch2_btree_ids[path->btree_id]);
3134 		bch2_bpos_to_text(out, path->pos);
3135 		prt_newline(out);
3136 
3137 		for (l = 0; l < BTREE_MAX_DEPTH; l++) {
3138 			if (btree_node_locked(path, l) &&
3139 			    !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) {
3140 				prt_printf(out, "    %c l=%u ",
3141 					   lock_types[btree_node_locked_type(path, l)], l);
3142 				bch2_btree_bkey_cached_common_to_text(out, b);
3143 				prt_newline(out);
3144 			}
3145 		}
3146 	}
3147 
3148 	b = READ_ONCE(trans->locking);
3149 	if (b) {
3150 		prt_printf(out, "  blocked for %lluus on",
3151 			   div_u64(local_clock() - trans->locking_wait.start_time,
3152 				   1000));
3153 		prt_newline(out);
3154 		prt_printf(out, "    %c", lock_types[trans->locking_wait.lock_want]);
3155 		bch2_btree_bkey_cached_common_to_text(out, b);
3156 		prt_newline(out);
3157 	}
3158 }
3159 
3160 void bch2_fs_btree_iter_exit(struct bch_fs *c)
3161 {
3162 	struct btree_transaction_stats *s;
3163 	struct btree_trans *trans;
3164 	int cpu;
3165 
3166 	trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list);
3167 	if (trans)
3168 		panic("%s leaked btree_trans\n", trans->fn);
3169 
3170 	if (c->btree_trans_bufs)
3171 		for_each_possible_cpu(cpu)
3172 			kfree(per_cpu_ptr(c->btree_trans_bufs, cpu)->trans);
3173 	free_percpu(c->btree_trans_bufs);
3174 
3175 	for (s = c->btree_transaction_stats;
3176 	     s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
3177 	     s++) {
3178 		kfree(s->max_paths_text);
3179 		bch2_time_stats_exit(&s->lock_hold_times);
3180 	}
3181 
3182 	if (c->btree_trans_barrier_initialized)
3183 		cleanup_srcu_struct(&c->btree_trans_barrier);
3184 	mempool_exit(&c->btree_trans_mem_pool);
3185 	mempool_exit(&c->btree_trans_pool);
3186 }
3187 
3188 int bch2_fs_btree_iter_init(struct bch_fs *c)
3189 {
3190 	struct btree_transaction_stats *s;
3191 	int ret;
3192 
3193 	for (s = c->btree_transaction_stats;
3194 	     s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
3195 	     s++) {
3196 		bch2_time_stats_init(&s->lock_hold_times);
3197 		mutex_init(&s->lock);
3198 	}
3199 
3200 	INIT_LIST_HEAD(&c->btree_trans_list);
3201 	seqmutex_init(&c->btree_trans_lock);
3202 
3203 	c->btree_trans_bufs = alloc_percpu(struct btree_trans_buf);
3204 	if (!c->btree_trans_bufs)
3205 		return -ENOMEM;
3206 
3207 	ret   = mempool_init_kmalloc_pool(&c->btree_trans_pool, 1,
3208 					  sizeof(struct btree_trans)) ?:
3209 		mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
3210 					  BTREE_TRANS_MEM_MAX) ?:
3211 		init_srcu_struct(&c->btree_trans_barrier);
3212 	if (!ret)
3213 		c->btree_trans_barrier_initialized = true;
3214 	return ret;
3215 }
3216