xref: /linux/fs/bcachefs/fsck.c (revision dbcedec3a31119d7594baacc743300d127c99c56)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_cache.h"
6 #include "btree_update.h"
7 #include "buckets.h"
8 #include "darray.h"
9 #include "dirent.h"
10 #include "error.h"
11 #include "fs-common.h"
12 #include "fsck.h"
13 #include "inode.h"
14 #include "keylist.h"
15 #include "recovery.h"
16 #include "snapshot.h"
17 #include "super.h"
18 #include "xattr.h"
19 
20 #include <linux/bsearch.h>
21 #include <linux/dcache.h> /* struct qstr */
22 
23 /*
24  * XXX: this is handling transaction restarts without returning
25  * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
26  */
27 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
28 				    u32 snapshot)
29 {
30 	u64 sectors = 0;
31 
32 	int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
33 				SPOS(inum, 0, snapshot),
34 				POS(inum, U64_MAX),
35 				0, k, ({
36 		if (bkey_extent_is_allocation(k.k))
37 			sectors += k.k->size;
38 		0;
39 	}));
40 
41 	return ret ?: sectors;
42 }
43 
44 static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
45 				    u32 snapshot)
46 {
47 	u64 subdirs = 0;
48 
49 	int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_dirents,
50 				    SPOS(inum, 0, snapshot),
51 				    POS(inum, U64_MAX),
52 				    0, k, ({
53 		if (k.k->type == KEY_TYPE_dirent &&
54 		    bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
55 			subdirs++;
56 		0;
57 	}));
58 
59 	return ret ?: subdirs;
60 }
61 
62 static int subvol_lookup(struct btree_trans *trans, u32 subvol,
63 			 u32 *snapshot, u64 *inum)
64 {
65 	struct bch_subvolume s;
66 	int ret;
67 
68 	ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
69 
70 	*snapshot = le32_to_cpu(s.snapshot);
71 	*inum = le64_to_cpu(s.inode);
72 	return ret;
73 }
74 
75 static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
76 			      struct bch_inode_unpacked *inode)
77 {
78 	struct btree_iter iter;
79 	struct bkey_s_c k;
80 	int ret;
81 
82 	bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
83 			     POS(0, inode_nr),
84 			     BTREE_ITER_ALL_SNAPSHOTS);
85 	k = bch2_btree_iter_peek(&iter);
86 	ret = bkey_err(k);
87 	if (ret)
88 		goto err;
89 
90 	if (!k.k || !bkey_eq(k.k->p, POS(0, inode_nr))) {
91 		ret = -BCH_ERR_ENOENT_inode;
92 		goto err;
93 	}
94 
95 	ret = bch2_inode_unpack(k, inode);
96 err:
97 	bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
98 	bch2_trans_iter_exit(trans, &iter);
99 	return ret;
100 }
101 
102 static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
103 			struct bch_inode_unpacked *inode,
104 			u32 *snapshot)
105 {
106 	struct btree_iter iter;
107 	struct bkey_s_c k;
108 	int ret;
109 
110 	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
111 			       SPOS(0, inode_nr, *snapshot), 0);
112 	ret = bkey_err(k);
113 	if (ret)
114 		goto err;
115 
116 	ret = bkey_is_inode(k.k)
117 		? bch2_inode_unpack(k, inode)
118 		: -BCH_ERR_ENOENT_inode;
119 	if (!ret)
120 		*snapshot = iter.pos.snapshot;
121 err:
122 	bch2_trans_iter_exit(trans, &iter);
123 	return ret;
124 }
125 
126 static int lookup_dirent_in_snapshot(struct btree_trans *trans,
127 			   struct bch_hash_info hash_info,
128 			   subvol_inum dir, struct qstr *name,
129 			   u64 *target, unsigned *type, u32 snapshot)
130 {
131 	struct btree_iter iter;
132 	struct bkey_s_c_dirent d;
133 	int ret = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
134 			       &hash_info, dir, name, 0, snapshot);
135 	if (ret)
136 		return ret;
137 
138 	d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
139 	*target = le64_to_cpu(d.v->d_inum);
140 	*type = d.v->d_type;
141 	bch2_trans_iter_exit(trans, &iter);
142 	return 0;
143 }
144 
145 static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
146 {
147 	struct bch_fs *c = trans->c;
148 	struct btree_iter iter;
149 	struct bch_inode_unpacked dir_inode;
150 	struct bch_hash_info dir_hash_info;
151 	int ret;
152 
153 	ret = lookup_first_inode(trans, pos.inode, &dir_inode);
154 	if (ret)
155 		goto err;
156 
157 	dir_hash_info = bch2_hash_info_init(c, &dir_inode);
158 
159 	bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
160 
161 	ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
162 				  &dir_hash_info, &iter,
163 				  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
164 	bch2_trans_iter_exit(trans, &iter);
165 err:
166 	bch_err_fn(c, ret);
167 	return ret;
168 }
169 
170 /* Get lost+found, create if it doesn't exist: */
171 static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
172 			    struct bch_inode_unpacked *lostfound)
173 {
174 	struct bch_fs *c = trans->c;
175 	struct qstr lostfound_str = QSTR("lost+found");
176 	u64 inum = 0;
177 	unsigned d_type = 0;
178 	int ret;
179 
180 	struct bch_snapshot_tree st;
181 	ret = bch2_snapshot_tree_lookup(trans,
182 			bch2_snapshot_tree(c, snapshot), &st);
183 	if (ret)
184 		return ret;
185 
186 	subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) };
187 	u32 subvol_snapshot;
188 
189 	ret = subvol_lookup(trans, le32_to_cpu(st.master_subvol),
190 			    &subvol_snapshot, &root_inum.inum);
191 	bch_err_msg(c, ret, "looking up root subvol");
192 	if (ret)
193 		return ret;
194 
195 	struct bch_inode_unpacked root_inode;
196 	struct bch_hash_info root_hash_info;
197 	u32 root_inode_snapshot = snapshot;
198 	ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot);
199 	bch_err_msg(c, ret, "looking up root inode");
200 	if (ret)
201 		return ret;
202 
203 	root_hash_info = bch2_hash_info_init(c, &root_inode);
204 
205 	ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
206 			      &lostfound_str, &inum, &d_type, snapshot);
207 	if (bch2_err_matches(ret, ENOENT))
208 		goto create_lostfound;
209 
210 	bch_err_fn(c, ret);
211 	if (ret)
212 		return ret;
213 
214 	if (d_type != DT_DIR) {
215 		bch_err(c, "error looking up lost+found: not a directory");
216 		return -BCH_ERR_ENOENT_not_directory;
217 	}
218 
219 	/*
220 	 * The bch2_check_dirents pass has already run, dangling dirents
221 	 * shouldn't exist here:
222 	 */
223 	ret = lookup_inode(trans, inum, lostfound, &snapshot);
224 	bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
225 		    inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
226 	return ret;
227 
228 create_lostfound:
229 	/*
230 	 * XXX: we could have a nicer log message here  if we had a nice way to
231 	 * walk backpointers to print a path
232 	 */
233 	bch_notice(c, "creating lost+found in snapshot %u", le32_to_cpu(st.root_snapshot));
234 
235 	u64 now = bch2_current_time(c);
236 	struct btree_iter lostfound_iter = { NULL };
237 	u64 cpu = raw_smp_processor_id();
238 
239 	bch2_inode_init_early(c, lostfound);
240 	bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
241 	lostfound->bi_dir = root_inode.bi_inum;
242 
243 	root_inode.bi_nlink++;
244 
245 	ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
246 	if (ret)
247 		goto err;
248 
249 	bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot);
250 	ret = bch2_btree_iter_traverse(&lostfound_iter);
251 	if (ret)
252 		goto err;
253 
254 	ret =   bch2_dirent_create_snapshot(trans,
255 				0, root_inode.bi_inum, snapshot, &root_hash_info,
256 				mode_to_type(lostfound->bi_mode),
257 				&lostfound_str,
258 				lostfound->bi_inum,
259 				&lostfound->bi_dir_offset,
260 				BCH_HASH_SET_MUST_CREATE) ?:
261 		bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
262 				       BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
263 err:
264 	bch_err_msg(c, ret, "creating lost+found");
265 	bch2_trans_iter_exit(trans, &lostfound_iter);
266 	return ret;
267 }
268 
269 static int reattach_inode(struct btree_trans *trans,
270 			  struct bch_inode_unpacked *inode,
271 			  u32 inode_snapshot)
272 {
273 	struct bch_hash_info dir_hash;
274 	struct bch_inode_unpacked lostfound;
275 	char name_buf[20];
276 	struct qstr name;
277 	u64 dir_offset = 0;
278 	u32 dirent_snapshot = inode_snapshot;
279 	int ret;
280 
281 	if (inode->bi_subvol) {
282 		inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;
283 
284 		u64 root_inum;
285 		ret = subvol_lookup(trans, inode->bi_parent_subvol,
286 				    &dirent_snapshot, &root_inum);
287 		if (ret)
288 			return ret;
289 
290 		snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
291 	} else {
292 		snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
293 	}
294 
295 	ret = lookup_lostfound(trans, dirent_snapshot, &lostfound);
296 	if (ret)
297 		return ret;
298 
299 	if (S_ISDIR(inode->bi_mode)) {
300 		lostfound.bi_nlink++;
301 
302 		ret = __bch2_fsck_write_inode(trans, &lostfound, U32_MAX);
303 		if (ret)
304 			return ret;
305 	}
306 
307 	dir_hash = bch2_hash_info_init(trans->c, &lostfound);
308 
309 	name = (struct qstr) QSTR(name_buf);
310 
311 	ret = bch2_dirent_create_snapshot(trans,
312 				inode->bi_parent_subvol, lostfound.bi_inum,
313 				dirent_snapshot,
314 				&dir_hash,
315 				inode_d_type(inode),
316 				&name,
317 				inode->bi_subvol ?: inode->bi_inum,
318 				&dir_offset,
319 				BCH_HASH_SET_MUST_CREATE);
320 	if (ret)
321 		return ret;
322 
323 	inode->bi_dir		= lostfound.bi_inum;
324 	inode->bi_dir_offset	= dir_offset;
325 
326 	return __bch2_fsck_write_inode(trans, inode, inode_snapshot);
327 }
328 
329 static int remove_backpointer(struct btree_trans *trans,
330 			      struct bch_inode_unpacked *inode)
331 {
332 	struct btree_iter iter;
333 	struct bkey_s_c_dirent d;
334 	int ret;
335 
336 	d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents,
337 				     POS(inode->bi_dir, inode->bi_dir_offset), 0,
338 				     dirent);
339 	ret =   bkey_err(d) ?:
340 		__remove_dirent(trans, d.k->p);
341 	bch2_trans_iter_exit(trans, &iter);
342 	return ret;
343 }
344 
345 static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s)
346 {
347 	struct bch_fs *c = trans->c;
348 
349 	struct bch_inode_unpacked inode;
350 	int ret = bch2_inode_find_by_inum_trans(trans,
351 				(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
352 				&inode);
353 	if (ret)
354 		return ret;
355 
356 	ret = remove_backpointer(trans, &inode);
357 	bch_err_msg(c, ret, "removing dirent");
358 	if (ret)
359 		return ret;
360 
361 	ret = reattach_inode(trans, &inode, le32_to_cpu(s.v->snapshot));
362 	bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
363 	return ret;
364 }
365 
366 struct snapshots_seen_entry {
367 	u32				id;
368 	u32				equiv;
369 };
370 
371 struct snapshots_seen {
372 	struct bpos			pos;
373 	DARRAY(struct snapshots_seen_entry) ids;
374 };
375 
376 static inline void snapshots_seen_exit(struct snapshots_seen *s)
377 {
378 	darray_exit(&s->ids);
379 }
380 
381 static inline void snapshots_seen_init(struct snapshots_seen *s)
382 {
383 	memset(s, 0, sizeof(*s));
384 }
385 
386 static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
387 {
388 	struct snapshots_seen_entry *i, n = {
389 		.id	= id,
390 		.equiv	= bch2_snapshot_equiv(c, id),
391 	};
392 	int ret = 0;
393 
394 	__darray_for_each(s->ids, i) {
395 		if (i->id == id)
396 			return 0;
397 		if (i->id > id)
398 			break;
399 	}
400 
401 	ret = darray_insert_item(&s->ids, i - s->ids.data, n);
402 	if (ret)
403 		bch_err(c, "error reallocating snapshots_seen table (size %zu)",
404 			s->ids.size);
405 	return ret;
406 }
407 
408 static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
409 				 enum btree_id btree_id, struct bpos pos)
410 {
411 	struct snapshots_seen_entry n = {
412 		.id	= pos.snapshot,
413 		.equiv	= bch2_snapshot_equiv(c, pos.snapshot),
414 	};
415 	int ret = 0;
416 
417 	if (!bkey_eq(s->pos, pos))
418 		s->ids.nr = 0;
419 
420 	s->pos = pos;
421 	s->pos.snapshot = n.equiv;
422 
423 	darray_for_each(s->ids, i) {
424 		if (i->id == n.id)
425 			return 0;
426 
427 		/*
428 		 * We currently don't rigorously track for snapshot cleanup
429 		 * needing to be run, so it shouldn't be a fsck error yet:
430 		 */
431 		if (i->equiv == n.equiv) {
432 			bch_err(c, "snapshot deletion did not finish:\n"
433 				"  duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n",
434 				bch2_btree_id_str(btree_id),
435 				pos.inode, pos.offset,
436 				i->id, n.id, n.equiv);
437 			set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
438 			return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots);
439 		}
440 	}
441 
442 	ret = darray_push(&s->ids, n);
443 	if (ret)
444 		bch_err(c, "error reallocating snapshots_seen table (size %zu)",
445 			s->ids.size);
446 	return ret;
447 }
448 
449 /**
450  * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
451  * and @ancestor hasn't been overwritten in @seen
452  *
453  * @c:		filesystem handle
454  * @seen:	list of snapshot ids already seen at current position
455  * @id:		descendent snapshot id
456  * @ancestor:	ancestor snapshot id
457  *
458  * Returns:	whether key in @ancestor snapshot is visible in @id snapshot
459  */
460 static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
461 				    u32 id, u32 ancestor)
462 {
463 	ssize_t i;
464 
465 	EBUG_ON(id > ancestor);
466 	EBUG_ON(!bch2_snapshot_is_equiv(c, id));
467 	EBUG_ON(!bch2_snapshot_is_equiv(c, ancestor));
468 
469 	/* @ancestor should be the snapshot most recently added to @seen */
470 	EBUG_ON(ancestor != seen->pos.snapshot);
471 	EBUG_ON(ancestor != seen->ids.data[seen->ids.nr - 1].equiv);
472 
473 	if (id == ancestor)
474 		return true;
475 
476 	if (!bch2_snapshot_is_ancestor(c, id, ancestor))
477 		return false;
478 
479 	/*
480 	 * We know that @id is a descendant of @ancestor, we're checking if
481 	 * we've seen a key that overwrote @ancestor - i.e. also a descendent of
482 	 * @ascestor and with @id as a descendent.
483 	 *
484 	 * But we already know that we're scanning IDs between @id and @ancestor
485 	 * numerically, since snapshot ID lists are kept sorted, so if we find
486 	 * an id that's an ancestor of @id we're done:
487 	 */
488 
489 	for (i = seen->ids.nr - 2;
490 	     i >= 0 && seen->ids.data[i].equiv >= id;
491 	     --i)
492 		if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv))
493 			return false;
494 
495 	return true;
496 }
497 
498 /**
499  * ref_visible - given a key with snapshot id @src that points to a key with
500  * snapshot id @dst, test whether there is some snapshot in which @dst is
501  * visible.
502  *
503  * @c:		filesystem handle
504  * @s:		list of snapshot IDs already seen at @src
505  * @src:	snapshot ID of src key
506  * @dst:	snapshot ID of dst key
507  * Returns:	true if there is some snapshot in which @dst is visible
508  *
509  * Assumes we're visiting @src keys in natural key order
510  */
511 static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
512 			u32 src, u32 dst)
513 {
514 	return dst <= src
515 		? key_visible_in_snapshot(c, s, dst, src)
516 		: bch2_snapshot_is_ancestor(c, src, dst);
517 }
518 
519 static int ref_visible2(struct bch_fs *c,
520 			u32 src, struct snapshots_seen *src_seen,
521 			u32 dst, struct snapshots_seen *dst_seen)
522 {
523 	src = bch2_snapshot_equiv(c, src);
524 	dst = bch2_snapshot_equiv(c, dst);
525 
526 	if (dst > src) {
527 		swap(dst, src);
528 		swap(dst_seen, src_seen);
529 	}
530 	return key_visible_in_snapshot(c, src_seen, dst, src);
531 }
532 
533 #define for_each_visible_inode(_c, _s, _w, _snapshot, _i)				\
534 	for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr &&	\
535 	     (_i)->snapshot <= (_snapshot); _i++)					\
536 		if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
537 
538 struct inode_walker_entry {
539 	struct bch_inode_unpacked inode;
540 	u32			snapshot;
541 	bool			seen_this_pos;
542 	u64			count;
543 };
544 
545 struct inode_walker {
546 	bool				first_this_inode;
547 	bool				recalculate_sums;
548 	struct bpos			last_pos;
549 
550 	DARRAY(struct inode_walker_entry) inodes;
551 };
552 
553 static void inode_walker_exit(struct inode_walker *w)
554 {
555 	darray_exit(&w->inodes);
556 }
557 
558 static struct inode_walker inode_walker_init(void)
559 {
560 	return (struct inode_walker) { 0, };
561 }
562 
563 static int add_inode(struct bch_fs *c, struct inode_walker *w,
564 		     struct bkey_s_c inode)
565 {
566 	struct bch_inode_unpacked u;
567 
568 	BUG_ON(bch2_inode_unpack(inode, &u));
569 
570 	return darray_push(&w->inodes, ((struct inode_walker_entry) {
571 		.inode		= u,
572 		.snapshot	= bch2_snapshot_equiv(c, inode.k->p.snapshot),
573 	}));
574 }
575 
576 static int get_inodes_all_snapshots(struct btree_trans *trans,
577 				    struct inode_walker *w, u64 inum)
578 {
579 	struct bch_fs *c = trans->c;
580 	struct btree_iter iter;
581 	struct bkey_s_c k;
582 	int ret;
583 
584 	w->recalculate_sums = false;
585 	w->inodes.nr = 0;
586 
587 	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
588 				     BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
589 		if (k.k->p.offset != inum)
590 			break;
591 
592 		if (bkey_is_inode(k.k))
593 			add_inode(c, w, k);
594 	}
595 	bch2_trans_iter_exit(trans, &iter);
596 
597 	if (ret)
598 		return ret;
599 
600 	w->first_this_inode = true;
601 	return 0;
602 }
603 
604 static struct inode_walker_entry *
605 lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k)
606 {
607 	bool is_whiteout = k.k->type == KEY_TYPE_whiteout;
608 	u32 snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
609 
610 	struct inode_walker_entry *i;
611 	__darray_for_each(w->inodes, i)
612 		if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot))
613 			goto found;
614 
615 	return NULL;
616 found:
617 	BUG_ON(snapshot > i->snapshot);
618 
619 	if (snapshot != i->snapshot && !is_whiteout) {
620 		struct inode_walker_entry new = *i;
621 
622 		new.snapshot = snapshot;
623 		new.count = 0;
624 
625 		struct printbuf buf = PRINTBUF;
626 		bch2_bkey_val_to_text(&buf, c, k);
627 
628 		bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
629 			 "unexpected because we should always update the inode when we update a key in that inode\n"
630 			 "%s",
631 			 w->last_pos.inode, snapshot, i->snapshot, buf.buf);
632 		printbuf_exit(&buf);
633 
634 		while (i > w->inodes.data && i[-1].snapshot > snapshot)
635 			--i;
636 
637 		size_t pos = i - w->inodes.data;
638 		int ret = darray_insert_item(&w->inodes, pos, new);
639 		if (ret)
640 			return ERR_PTR(ret);
641 
642 		i = w->inodes.data + pos;
643 	}
644 
645 	return i;
646 }
647 
648 static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
649 					     struct inode_walker *w,
650 					     struct bkey_s_c k)
651 {
652 	if (w->last_pos.inode != k.k->p.inode) {
653 		int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
654 		if (ret)
655 			return ERR_PTR(ret);
656 	} else if (bkey_cmp(w->last_pos, k.k->p)) {
657 		darray_for_each(w->inodes, i)
658 			i->seen_this_pos = false;
659 	}
660 
661 	w->last_pos = k.k->p;
662 
663 	return lookup_inode_for_snapshot(trans->c, w, k);
664 }
665 
666 static int __get_visible_inodes(struct btree_trans *trans,
667 				struct inode_walker *w,
668 				struct snapshots_seen *s,
669 				u64 inum)
670 {
671 	struct bch_fs *c = trans->c;
672 	struct btree_iter iter;
673 	struct bkey_s_c k;
674 	int ret;
675 
676 	w->inodes.nr = 0;
677 
678 	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
679 			   BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
680 		u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
681 
682 		if (k.k->p.offset != inum)
683 			break;
684 
685 		if (!ref_visible(c, s, s->pos.snapshot, equiv))
686 			continue;
687 
688 		if (bkey_is_inode(k.k))
689 			add_inode(c, w, k);
690 
691 		if (equiv >= s->pos.snapshot)
692 			break;
693 	}
694 	bch2_trans_iter_exit(trans, &iter);
695 
696 	return ret;
697 }
698 
699 static int check_key_has_snapshot(struct btree_trans *trans,
700 				  struct btree_iter *iter,
701 				  struct bkey_s_c k)
702 {
703 	struct bch_fs *c = trans->c;
704 	struct printbuf buf = PRINTBUF;
705 	int ret = 0;
706 
707 	if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c,
708 				bkey_in_missing_snapshot,
709 				"key in missing snapshot: %s",
710 				(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
711 		ret = bch2_btree_delete_at(trans, iter,
712 					    BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1;
713 fsck_err:
714 	printbuf_exit(&buf);
715 	return ret;
716 }
717 
718 static int hash_redo_key(struct btree_trans *trans,
719 			 const struct bch_hash_desc desc,
720 			 struct bch_hash_info *hash_info,
721 			 struct btree_iter *k_iter, struct bkey_s_c k)
722 {
723 	struct bkey_i *delete;
724 	struct bkey_i *tmp;
725 
726 	delete = bch2_trans_kmalloc(trans, sizeof(*delete));
727 	if (IS_ERR(delete))
728 		return PTR_ERR(delete);
729 
730 	tmp = bch2_bkey_make_mut_noupdate(trans, k);
731 	if (IS_ERR(tmp))
732 		return PTR_ERR(tmp);
733 
734 	bkey_init(&delete->k);
735 	delete->k.p = k_iter->pos;
736 	return  bch2_btree_iter_traverse(k_iter) ?:
737 		bch2_trans_update(trans, k_iter, delete, 0) ?:
738 		bch2_hash_set_in_snapshot(trans, desc, hash_info,
739 				       (subvol_inum) { 0, k.k->p.inode },
740 				       k.k->p.snapshot, tmp,
741 				       BCH_HASH_SET_MUST_CREATE,
742 				       BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
743 		bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
744 }
745 
746 static int hash_check_key(struct btree_trans *trans,
747 			  const struct bch_hash_desc desc,
748 			  struct bch_hash_info *hash_info,
749 			  struct btree_iter *k_iter, struct bkey_s_c hash_k)
750 {
751 	struct bch_fs *c = trans->c;
752 	struct btree_iter iter = { NULL };
753 	struct printbuf buf = PRINTBUF;
754 	struct bkey_s_c k;
755 	u64 hash;
756 	int ret = 0;
757 
758 	if (hash_k.k->type != desc.key_type)
759 		return 0;
760 
761 	hash = desc.hash_bkey(hash_info, hash_k);
762 
763 	if (likely(hash == hash_k.k->p.offset))
764 		return 0;
765 
766 	if (hash_k.k->p.offset < hash)
767 		goto bad_hash;
768 
769 	for_each_btree_key_norestart(trans, iter, desc.btree_id,
770 				     SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
771 				     BTREE_ITER_SLOTS, k, ret) {
772 		if (bkey_eq(k.k->p, hash_k.k->p))
773 			break;
774 
775 		if (fsck_err_on(k.k->type == desc.key_type &&
776 				!desc.cmp_bkey(k, hash_k), c,
777 				hash_table_key_duplicate,
778 				"duplicate hash table keys:\n%s",
779 				(printbuf_reset(&buf),
780 				 bch2_bkey_val_to_text(&buf, c, hash_k),
781 				 buf.buf))) {
782 			ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
783 			break;
784 		}
785 
786 		if (bkey_deleted(k.k)) {
787 			bch2_trans_iter_exit(trans, &iter);
788 			goto bad_hash;
789 		}
790 	}
791 out:
792 	bch2_trans_iter_exit(trans, &iter);
793 	printbuf_exit(&buf);
794 	return ret;
795 bad_hash:
796 	if (fsck_err(c, hash_table_key_wrong_offset,
797 		     "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
798 		     bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
799 		     (printbuf_reset(&buf),
800 		      bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
801 		ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
802 		bch_err_fn(c, ret);
803 		if (ret)
804 			return ret;
805 		ret = -BCH_ERR_transaction_restart_nested;
806 	}
807 fsck_err:
808 	goto out;
809 }
810 
811 static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
812 						struct btree_iter *iter,
813 						struct bpos pos)
814 {
815 	return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
816 }
817 
818 static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
819 					       struct btree_iter *iter,
820 					       struct bch_inode_unpacked *inode,
821 					       u32 *snapshot)
822 {
823 	if (inode->bi_subvol) {
824 		u64 inum;
825 		int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
826 		if (ret)
827 			return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) });
828 	}
829 
830 	return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
831 }
832 
833 static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
834 				   struct bkey_s_c_dirent d)
835 {
836 	return  inode->bi_dir		== d.k->p.inode &&
837 		inode->bi_dir_offset	== d.k->p.offset;
838 }
839 
840 static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
841 				   struct bch_inode_unpacked *inode)
842 {
843 	return d.v->d_type == DT_SUBVOL
844 		? le32_to_cpu(d.v->d_child_subvol)	== inode->bi_subvol
845 		: le64_to_cpu(d.v->d_inum)		== inode->bi_inum;
846 }
847 
848 static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
849 {
850 	struct btree_iter iter;
851 	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
852 	int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
853 	bch2_trans_iter_exit(trans, &iter);
854 	return ret;
855 }
856 
857 static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c inode_k,
858 				    struct bch_inode_unpacked *inode,
859 				    u32 inode_snapshot, bool *write_inode)
860 {
861 	struct bch_fs *c = trans->c;
862 	struct printbuf buf = PRINTBUF;
863 
864 	struct btree_iter dirent_iter = {};
865 	struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
866 	int ret = bkey_err(d);
867 	if (ret && !bch2_err_matches(ret, ENOENT))
868 		return ret;
869 
870 	if (fsck_err_on(ret,
871 			c, inode_points_to_missing_dirent,
872 			"inode points to missing dirent\n%s",
873 			(bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) ||
874 	    fsck_err_on(!ret && !dirent_points_to_inode(d, inode),
875 			c, inode_points_to_wrong_dirent,
876 			"inode points to dirent that does not point back:\n%s",
877 			(bch2_bkey_val_to_text(&buf, c, inode_k),
878 			 prt_newline(&buf),
879 			 bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
880 		/*
881 		 * We just clear the backpointer fields for now. If we find a
882 		 * dirent that points to this inode in check_dirents(), we'll
883 		 * update it then; then when we get to check_path() if the
884 		 * backpointer is still 0 we'll reattach it.
885 		 */
886 		inode->bi_dir = 0;
887 		inode->bi_dir_offset = 0;
888 		inode->bi_flags &= ~BCH_INODE_backptr_untrusted;
889 		*write_inode = true;
890 	}
891 
892 	ret = 0;
893 fsck_err:
894 	bch2_trans_iter_exit(trans, &dirent_iter);
895 	printbuf_exit(&buf);
896 	bch_err_fn(c, ret);
897 	return ret;
898 }
899 
900 static int check_inode(struct btree_trans *trans,
901 		       struct btree_iter *iter,
902 		       struct bkey_s_c k,
903 		       struct bch_inode_unpacked *prev,
904 		       struct snapshots_seen *s,
905 		       bool full)
906 {
907 	struct bch_fs *c = trans->c;
908 	struct bch_inode_unpacked u;
909 	bool do_update = false;
910 	int ret;
911 
912 	ret = check_key_has_snapshot(trans, iter, k);
913 	if (ret < 0)
914 		goto err;
915 	if (ret)
916 		return 0;
917 
918 	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
919 	if (ret)
920 		goto err;
921 
922 	if (!bkey_is_inode(k.k))
923 		return 0;
924 
925 	BUG_ON(bch2_inode_unpack(k, &u));
926 
927 	if (!full &&
928 	    !(u.bi_flags & (BCH_INODE_i_size_dirty|
929 			    BCH_INODE_i_sectors_dirty|
930 			    BCH_INODE_unlinked)))
931 		return 0;
932 
933 	if (prev->bi_inum != u.bi_inum)
934 		*prev = u;
935 
936 	if (fsck_err_on(prev->bi_hash_seed	!= u.bi_hash_seed ||
937 			inode_d_type(prev)	!= inode_d_type(&u),
938 			c, inode_snapshot_mismatch,
939 			"inodes in different snapshots don't match")) {
940 		bch_err(c, "repair not implemented yet");
941 		return -BCH_ERR_fsck_repair_unimplemented;
942 	}
943 
944 	if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) &&
945 	    bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) {
946 		struct bpos new_min_pos;
947 
948 		ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos);
949 		if (ret)
950 			goto err;
951 
952 		u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked;
953 
954 		ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
955 
956 		bch_err_msg(c, ret, "in fsck updating inode");
957 		if (ret)
958 			return ret;
959 
960 		if (!bpos_eq(new_min_pos, POS_MIN))
961 			bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos));
962 		return 0;
963 	}
964 
965 	if (u.bi_flags & BCH_INODE_unlinked) {
966 		ret = check_inode_deleted_list(trans, k.k->p);
967 		if (ret < 0)
968 			return ret;
969 
970 		fsck_err_on(!ret, c, unlinked_inode_not_on_deleted_list,
971 			    "inode %llu:%u unlinked, but not on deleted list",
972 			    u.bi_inum, k.k->p.snapshot);
973 		ret = 0;
974 	}
975 
976 	if (u.bi_flags & BCH_INODE_unlinked &&
977 	    (!c->sb.clean ||
978 	     fsck_err(c, inode_unlinked_but_clean,
979 		      "filesystem marked clean, but inode %llu unlinked",
980 		      u.bi_inum))) {
981 		ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
982 		bch_err_msg(c, ret, "in fsck deleting inode");
983 		return ret;
984 	}
985 
986 	if (u.bi_flags & BCH_INODE_i_size_dirty &&
987 	    (!c->sb.clean ||
988 	     fsck_err(c, inode_i_size_dirty_but_clean,
989 		      "filesystem marked clean, but inode %llu has i_size dirty",
990 		      u.bi_inum))) {
991 		bch_verbose(c, "truncating inode %llu", u.bi_inum);
992 
993 		/*
994 		 * XXX: need to truncate partial blocks too here - or ideally
995 		 * just switch units to bytes and that issue goes away
996 		 */
997 		ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
998 				SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
999 				     iter->pos.snapshot),
1000 				POS(u.bi_inum, U64_MAX),
1001 				0, NULL);
1002 		bch_err_msg(c, ret, "in fsck truncating inode");
1003 		if (ret)
1004 			return ret;
1005 
1006 		/*
1007 		 * We truncated without our normal sector accounting hook, just
1008 		 * make sure we recalculate it:
1009 		 */
1010 		u.bi_flags |= BCH_INODE_i_sectors_dirty;
1011 
1012 		u.bi_flags &= ~BCH_INODE_i_size_dirty;
1013 		do_update = true;
1014 	}
1015 
1016 	if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
1017 	    (!c->sb.clean ||
1018 	     fsck_err(c, inode_i_sectors_dirty_but_clean,
1019 		      "filesystem marked clean, but inode %llu has i_sectors dirty",
1020 		      u.bi_inum))) {
1021 		s64 sectors;
1022 
1023 		bch_verbose(c, "recounting sectors for inode %llu",
1024 			    u.bi_inum);
1025 
1026 		sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
1027 		if (sectors < 0) {
1028 			bch_err_msg(c, sectors, "in fsck recounting inode sectors");
1029 			return sectors;
1030 		}
1031 
1032 		u.bi_sectors = sectors;
1033 		u.bi_flags &= ~BCH_INODE_i_sectors_dirty;
1034 		do_update = true;
1035 	}
1036 
1037 	if (u.bi_flags & BCH_INODE_backptr_untrusted) {
1038 		u.bi_dir = 0;
1039 		u.bi_dir_offset = 0;
1040 		u.bi_flags &= ~BCH_INODE_backptr_untrusted;
1041 		do_update = true;
1042 	}
1043 
1044 	if (u.bi_dir || u.bi_dir_offset) {
1045 		ret = check_inode_dirent_inode(trans, k, &u, k.k->p.snapshot, &do_update);
1046 		if (ret)
1047 			goto err;
1048 	}
1049 
1050 	if (fsck_err_on(u.bi_parent_subvol &&
1051 			(u.bi_subvol == 0 ||
1052 			 u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
1053 			c, inode_bi_parent_nonzero,
1054 			"inode %llu:%u has subvol %u but nonzero parent subvol %u",
1055 			u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
1056 		u.bi_parent_subvol = 0;
1057 		do_update = true;
1058 	}
1059 
1060 	if (u.bi_subvol) {
1061 		struct bch_subvolume s;
1062 
1063 		ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s);
1064 		if (ret && !bch2_err_matches(ret, ENOENT))
1065 			goto err;
1066 
1067 		if (fsck_err_on(ret,
1068 				c, inode_bi_subvol_missing,
1069 				"inode %llu:%u bi_subvol points to missing subvolume %u",
1070 				u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
1071 		    fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
1072 				!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
1073 							   k.k->p.snapshot),
1074 				c, inode_bi_subvol_wrong,
1075 				"inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
1076 				u.bi_inum, k.k->p.snapshot, u.bi_subvol,
1077 				le64_to_cpu(s.inode),
1078 				le32_to_cpu(s.snapshot))) {
1079 			u.bi_subvol = 0;
1080 			u.bi_parent_subvol = 0;
1081 			do_update = true;
1082 		}
1083 	}
1084 
1085 	if (do_update) {
1086 		ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
1087 		bch_err_msg(c, ret, "in fsck updating inode");
1088 		if (ret)
1089 			return ret;
1090 	}
1091 err:
1092 fsck_err:
1093 	bch_err_fn(c, ret);
1094 	return ret;
1095 }
1096 
1097 int bch2_check_inodes(struct bch_fs *c)
1098 {
1099 	bool full = c->opts.fsck;
1100 	struct bch_inode_unpacked prev = { 0 };
1101 	struct snapshots_seen s;
1102 
1103 	snapshots_seen_init(&s);
1104 
1105 	int ret = bch2_trans_run(c,
1106 		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
1107 				POS_MIN,
1108 				BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
1109 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1110 			check_inode(trans, &iter, k, &prev, &s, full)));
1111 
1112 	snapshots_seen_exit(&s);
1113 	bch_err_fn(c, ret);
1114 	return ret;
1115 }
1116 
1117 static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
1118 {
1119 	struct bch_fs *c = trans->c;
1120 	int ret = 0;
1121 	s64 count2;
1122 
1123 	darray_for_each(w->inodes, i) {
1124 		if (i->inode.bi_sectors == i->count)
1125 			continue;
1126 
1127 		count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);
1128 
1129 		if (w->recalculate_sums)
1130 			i->count = count2;
1131 
1132 		if (i->count != count2) {
1133 			bch_err(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
1134 				w->last_pos.inode, i->snapshot, i->count, count2);
1135 			return -BCH_ERR_internal_fsck_err;
1136 		}
1137 
1138 		if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
1139 				c, inode_i_sectors_wrong,
1140 				"inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
1141 				w->last_pos.inode, i->snapshot,
1142 				i->inode.bi_sectors, i->count)) {
1143 			i->inode.bi_sectors = i->count;
1144 			ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
1145 			if (ret)
1146 				break;
1147 		}
1148 	}
1149 fsck_err:
1150 	bch_err_fn(c, ret);
1151 	return ret;
1152 }
1153 
1154 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
1155 {
1156 	u32 restart_count = trans->restart_count;
1157 	return check_i_sectors_notnested(trans, w) ?:
1158 		trans_was_restarted(trans, restart_count);
1159 }
1160 
1161 struct extent_end {
1162 	u32			snapshot;
1163 	u64			offset;
1164 	struct snapshots_seen	seen;
1165 };
1166 
1167 struct extent_ends {
1168 	struct bpos			last_pos;
1169 	DARRAY(struct extent_end)	e;
1170 };
1171 
1172 static void extent_ends_reset(struct extent_ends *extent_ends)
1173 {
1174 	darray_for_each(extent_ends->e, i)
1175 		snapshots_seen_exit(&i->seen);
1176 	extent_ends->e.nr = 0;
1177 }
1178 
1179 static void extent_ends_exit(struct extent_ends *extent_ends)
1180 {
1181 	extent_ends_reset(extent_ends);
1182 	darray_exit(&extent_ends->e);
1183 }
1184 
1185 static void extent_ends_init(struct extent_ends *extent_ends)
1186 {
1187 	memset(extent_ends, 0, sizeof(*extent_ends));
1188 }
1189 
1190 static int extent_ends_at(struct bch_fs *c,
1191 			  struct extent_ends *extent_ends,
1192 			  struct snapshots_seen *seen,
1193 			  struct bkey_s_c k)
1194 {
1195 	struct extent_end *i, n = (struct extent_end) {
1196 		.offset		= k.k->p.offset,
1197 		.snapshot	= k.k->p.snapshot,
1198 		.seen		= *seen,
1199 	};
1200 
1201 	n.seen.ids.data = kmemdup(seen->ids.data,
1202 			      sizeof(seen->ids.data[0]) * seen->ids.size,
1203 			      GFP_KERNEL);
1204 	if (!n.seen.ids.data)
1205 		return -BCH_ERR_ENOMEM_fsck_extent_ends_at;
1206 
1207 	__darray_for_each(extent_ends->e, i) {
1208 		if (i->snapshot == k.k->p.snapshot) {
1209 			snapshots_seen_exit(&i->seen);
1210 			*i = n;
1211 			return 0;
1212 		}
1213 
1214 		if (i->snapshot >= k.k->p.snapshot)
1215 			break;
1216 	}
1217 
1218 	return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
1219 }
1220 
1221 static int overlapping_extents_found(struct btree_trans *trans,
1222 				     enum btree_id btree,
1223 				     struct bpos pos1, struct snapshots_seen *pos1_seen,
1224 				     struct bkey pos2,
1225 				     bool *fixed,
1226 				     struct extent_end *extent_end)
1227 {
1228 	struct bch_fs *c = trans->c;
1229 	struct printbuf buf = PRINTBUF;
1230 	struct btree_iter iter1, iter2 = { NULL };
1231 	struct bkey_s_c k1, k2;
1232 	int ret;
1233 
1234 	BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));
1235 
1236 	bch2_trans_iter_init(trans, &iter1, btree, pos1,
1237 			     BTREE_ITER_ALL_SNAPSHOTS|
1238 			     BTREE_ITER_NOT_EXTENTS);
1239 	k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX));
1240 	ret = bkey_err(k1);
1241 	if (ret)
1242 		goto err;
1243 
1244 	prt_str(&buf, "\n  ");
1245 	bch2_bkey_val_to_text(&buf, c, k1);
1246 
1247 	if (!bpos_eq(pos1, k1.k->p)) {
1248 		prt_str(&buf, "\n  wanted\n  ");
1249 		bch2_bpos_to_text(&buf, pos1);
1250 		prt_str(&buf, "\n  ");
1251 		bch2_bkey_to_text(&buf, &pos2);
1252 
1253 		bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
1254 			__func__, buf.buf);
1255 		ret = -BCH_ERR_internal_fsck_err;
1256 		goto err;
1257 	}
1258 
1259 	bch2_trans_copy_iter(&iter2, &iter1);
1260 
1261 	while (1) {
1262 		bch2_btree_iter_advance(&iter2);
1263 
1264 		k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX));
1265 		ret = bkey_err(k2);
1266 		if (ret)
1267 			goto err;
1268 
1269 		if (bpos_ge(k2.k->p, pos2.p))
1270 			break;
1271 	}
1272 
1273 	prt_str(&buf, "\n  ");
1274 	bch2_bkey_val_to_text(&buf, c, k2);
1275 
1276 	if (bpos_gt(k2.k->p, pos2.p) ||
1277 	    pos2.size != k2.k->size) {
1278 		bch_err(c, "%s: error finding seconding overlapping extent when repairing%s",
1279 			__func__, buf.buf);
1280 		ret = -BCH_ERR_internal_fsck_err;
1281 		goto err;
1282 	}
1283 
1284 	prt_printf(&buf, "\n  overwriting %s extent",
1285 		   pos1.snapshot >= pos2.p.snapshot ? "first" : "second");
1286 
1287 	if (fsck_err(c, extent_overlapping,
1288 		     "overlapping extents%s", buf.buf)) {
1289 		struct btree_iter *old_iter = &iter1;
1290 		struct disk_reservation res = { 0 };
1291 
1292 		if (pos1.snapshot < pos2.p.snapshot) {
1293 			old_iter = &iter2;
1294 			swap(k1, k2);
1295 		}
1296 
1297 		trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);
1298 
1299 		ret =   bch2_trans_update_extent_overwrite(trans, old_iter,
1300 				BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
1301 				k1, k2) ?:
1302 			bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
1303 		bch2_disk_reservation_put(c, &res);
1304 
1305 		if (ret)
1306 			goto err;
1307 
1308 		*fixed = true;
1309 
1310 		if (pos1.snapshot == pos2.p.snapshot) {
1311 			/*
1312 			 * We overwrote the first extent, and did the overwrite
1313 			 * in the same snapshot:
1314 			 */
1315 			extent_end->offset = bkey_start_offset(&pos2);
1316 		} else if (pos1.snapshot > pos2.p.snapshot) {
1317 			/*
1318 			 * We overwrote the first extent in pos2's snapshot:
1319 			 */
1320 			ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
1321 		} else {
1322 			/*
1323 			 * We overwrote the second extent - restart
1324 			 * check_extent() from the top:
1325 			 */
1326 			ret = -BCH_ERR_transaction_restart_nested;
1327 		}
1328 	}
1329 fsck_err:
1330 err:
1331 	bch2_trans_iter_exit(trans, &iter2);
1332 	bch2_trans_iter_exit(trans, &iter1);
1333 	printbuf_exit(&buf);
1334 	return ret;
1335 }
1336 
1337 static int check_overlapping_extents(struct btree_trans *trans,
1338 			      struct snapshots_seen *seen,
1339 			      struct extent_ends *extent_ends,
1340 			      struct bkey_s_c k,
1341 			      u32 equiv,
1342 			      struct btree_iter *iter,
1343 			      bool *fixed)
1344 {
1345 	struct bch_fs *c = trans->c;
1346 	int ret = 0;
1347 
1348 	/* transaction restart, running again */
1349 	if (bpos_eq(extent_ends->last_pos, k.k->p))
1350 		return 0;
1351 
1352 	if (extent_ends->last_pos.inode != k.k->p.inode)
1353 		extent_ends_reset(extent_ends);
1354 
1355 	darray_for_each(extent_ends->e, i) {
1356 		if (i->offset <= bkey_start_offset(k.k))
1357 			continue;
1358 
1359 		if (!ref_visible2(c,
1360 				  k.k->p.snapshot, seen,
1361 				  i->snapshot, &i->seen))
1362 			continue;
1363 
1364 		ret = overlapping_extents_found(trans, iter->btree_id,
1365 						SPOS(iter->pos.inode,
1366 						     i->offset,
1367 						     i->snapshot),
1368 						&i->seen,
1369 						*k.k, fixed, i);
1370 		if (ret)
1371 			goto err;
1372 	}
1373 
1374 	ret = extent_ends_at(c, extent_ends, seen, k);
1375 	if (ret)
1376 		goto err;
1377 
1378 	extent_ends->last_pos = k.k->p;
1379 err:
1380 	return ret;
1381 }
1382 
1383 static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
1384 				struct bkey_s_c k)
1385 {
1386 	struct bch_fs *c = trans->c;
1387 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1388 	struct bch_extent_crc_unpacked crc;
1389 	const union bch_extent_entry *i;
1390 	unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;
1391 
1392 	bkey_for_each_crc(k.k, ptrs, crc, i)
1393 		if (crc_is_encoded(crc) &&
1394 		    crc.uncompressed_size > encoded_extent_max_sectors) {
1395 			struct printbuf buf = PRINTBUF;
1396 
1397 			bch2_bkey_val_to_text(&buf, c, k);
1398 			bch_err(c, "overbig encoded extent, please report this:\n  %s", buf.buf);
1399 			printbuf_exit(&buf);
1400 		}
1401 
1402 	return 0;
1403 }
1404 
1405 static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
1406 			struct bkey_s_c k,
1407 			struct inode_walker *inode,
1408 			struct snapshots_seen *s,
1409 			struct extent_ends *extent_ends)
1410 {
1411 	struct bch_fs *c = trans->c;
1412 	struct inode_walker_entry *i;
1413 	struct printbuf buf = PRINTBUF;
1414 	struct bpos equiv = k.k->p;
1415 	int ret = 0;
1416 
1417 	equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
1418 
1419 	ret = check_key_has_snapshot(trans, iter, k);
1420 	if (ret) {
1421 		ret = ret < 0 ? ret : 0;
1422 		goto out;
1423 	}
1424 
1425 	if (inode->last_pos.inode != k.k->p.inode) {
1426 		ret = check_i_sectors(trans, inode);
1427 		if (ret)
1428 			goto err;
1429 	}
1430 
1431 	i = walk_inode(trans, inode, k);
1432 	ret = PTR_ERR_OR_ZERO(i);
1433 	if (ret)
1434 		goto err;
1435 
1436 	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
1437 	if (ret)
1438 		goto err;
1439 
1440 	if (k.k->type != KEY_TYPE_whiteout) {
1441 		if (fsck_err_on(!i, c, extent_in_missing_inode,
1442 				"extent in missing inode:\n  %s",
1443 				(printbuf_reset(&buf),
1444 				 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1445 			goto delete;
1446 
1447 		if (fsck_err_on(i &&
1448 				!S_ISREG(i->inode.bi_mode) &&
1449 				!S_ISLNK(i->inode.bi_mode),
1450 				c, extent_in_non_reg_inode,
1451 				"extent in non regular inode mode %o:\n  %s",
1452 				i->inode.bi_mode,
1453 				(printbuf_reset(&buf),
1454 				 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1455 			goto delete;
1456 
1457 		ret = check_overlapping_extents(trans, s, extent_ends, k,
1458 						equiv.snapshot, iter,
1459 						&inode->recalculate_sums);
1460 		if (ret)
1461 			goto err;
1462 	}
1463 
1464 	/*
1465 	 * Check inodes in reverse order, from oldest snapshots to newest,
1466 	 * starting from the inode that matches this extent's snapshot. If we
1467 	 * didn't have one, iterate over all inodes:
1468 	 */
1469 	if (!i)
1470 		i = inode->inodes.data + inode->inodes.nr - 1;
1471 
1472 	for (;
1473 	     inode->inodes.data && i >= inode->inodes.data;
1474 	     --i) {
1475 		if (i->snapshot > equiv.snapshot ||
1476 		    !key_visible_in_snapshot(c, s, i->snapshot, equiv.snapshot))
1477 			continue;
1478 
1479 		if (k.k->type != KEY_TYPE_whiteout) {
1480 			if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) &&
1481 					k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
1482 					!bkey_extent_is_reservation(k),
1483 					c, extent_past_end_of_inode,
1484 					"extent type past end of inode %llu:%u, i_size %llu\n  %s",
1485 					i->inode.bi_inum, i->snapshot, i->inode.bi_size,
1486 					(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1487 				struct btree_iter iter2;
1488 
1489 				bch2_trans_copy_iter(&iter2, iter);
1490 				bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
1491 				ret =   bch2_btree_iter_traverse(&iter2) ?:
1492 					bch2_btree_delete_at(trans, &iter2,
1493 						BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1494 				bch2_trans_iter_exit(trans, &iter2);
1495 				if (ret)
1496 					goto err;
1497 
1498 				iter->k.type = KEY_TYPE_whiteout;
1499 			}
1500 
1501 			if (bkey_extent_is_allocation(k.k))
1502 				i->count += k.k->size;
1503 		}
1504 
1505 		i->seen_this_pos = true;
1506 	}
1507 out:
1508 err:
1509 fsck_err:
1510 	printbuf_exit(&buf);
1511 	bch_err_fn(c, ret);
1512 	return ret;
1513 delete:
1514 	ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1515 	goto out;
1516 }
1517 
1518 /*
1519  * Walk extents: verify that extents have a corresponding S_ISREG inode, and
1520  * that i_size an i_sectors are consistent
1521  */
1522 int bch2_check_extents(struct bch_fs *c)
1523 {
1524 	struct inode_walker w = inode_walker_init();
1525 	struct snapshots_seen s;
1526 	struct extent_ends extent_ends;
1527 	struct disk_reservation res = { 0 };
1528 
1529 	snapshots_seen_init(&s);
1530 	extent_ends_init(&extent_ends);
1531 
1532 	int ret = bch2_trans_run(c,
1533 		for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
1534 				POS(BCACHEFS_ROOT_INO, 0),
1535 				BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
1536 				&res, NULL,
1537 				BCH_TRANS_COMMIT_no_enospc, ({
1538 			bch2_disk_reservation_put(c, &res);
1539 			check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
1540 			check_extent_overbig(trans, &iter, k);
1541 		})) ?:
1542 		check_i_sectors_notnested(trans, &w));
1543 
1544 	bch2_disk_reservation_put(c, &res);
1545 	extent_ends_exit(&extent_ends);
1546 	inode_walker_exit(&w);
1547 	snapshots_seen_exit(&s);
1548 
1549 	bch_err_fn(c, ret);
1550 	return ret;
1551 }
1552 
1553 int bch2_check_indirect_extents(struct bch_fs *c)
1554 {
1555 	struct disk_reservation res = { 0 };
1556 
1557 	int ret = bch2_trans_run(c,
1558 		for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
1559 				POS_MIN,
1560 				BTREE_ITER_PREFETCH, k,
1561 				&res, NULL,
1562 				BCH_TRANS_COMMIT_no_enospc, ({
1563 			bch2_disk_reservation_put(c, &res);
1564 			check_extent_overbig(trans, &iter, k);
1565 		})));
1566 
1567 	bch2_disk_reservation_put(c, &res);
1568 	bch_err_fn(c, ret);
1569 	return ret;
1570 }
1571 
1572 static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w)
1573 {
1574 	struct bch_fs *c = trans->c;
1575 	int ret = 0;
1576 	s64 count2;
1577 
1578 	darray_for_each(w->inodes, i) {
1579 		if (i->inode.bi_nlink == i->count)
1580 			continue;
1581 
1582 		count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot);
1583 		if (count2 < 0)
1584 			return count2;
1585 
1586 		if (i->count != count2) {
1587 			bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu",
1588 				i->count, count2);
1589 			i->count = count2;
1590 			if (i->inode.bi_nlink == i->count)
1591 				continue;
1592 		}
1593 
1594 		if (fsck_err_on(i->inode.bi_nlink != i->count,
1595 				c, inode_dir_wrong_nlink,
1596 				"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
1597 				w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
1598 			i->inode.bi_nlink = i->count;
1599 			ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
1600 			if (ret)
1601 				break;
1602 		}
1603 	}
1604 fsck_err:
1605 	bch_err_fn(c, ret);
1606 	return ret;
1607 }
1608 
1609 static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
1610 {
1611 	u32 restart_count = trans->restart_count;
1612 	return check_subdir_count_notnested(trans, w) ?:
1613 		trans_was_restarted(trans, restart_count);
1614 }
1615 
1616 static int check_dirent_inode_dirent(struct btree_trans *trans,
1617 				   struct btree_iter *iter,
1618 				   struct bkey_s_c_dirent d,
1619 				   struct bch_inode_unpacked *target,
1620 				   u32 target_snapshot)
1621 {
1622 	struct bch_fs *c = trans->c;
1623 	struct printbuf buf = PRINTBUF;
1624 	int ret = 0;
1625 
1626 	if (inode_points_to_dirent(target, d))
1627 		return 0;
1628 
1629 	if (!target->bi_dir &&
1630 	    !target->bi_dir_offset) {
1631 		target->bi_dir		= d.k->p.inode;
1632 		target->bi_dir_offset	= d.k->p.offset;
1633 		return __bch2_fsck_write_inode(trans, target, target_snapshot);
1634 	}
1635 
1636 	struct btree_iter bp_iter = { NULL };
1637 	struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
1638 			      SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
1639 	ret = bkey_err(bp_dirent);
1640 	if (ret && !bch2_err_matches(ret, ENOENT))
1641 		goto err;
1642 
1643 	bool backpointer_exists = !ret;
1644 	ret = 0;
1645 
1646 	if (fsck_err_on(!backpointer_exists,
1647 			c, inode_wrong_backpointer,
1648 			"inode %llu:%u has wrong backpointer:\n"
1649 			"got       %llu:%llu\n"
1650 			"should be %llu:%llu",
1651 			target->bi_inum, target_snapshot,
1652 			target->bi_dir,
1653 			target->bi_dir_offset,
1654 			d.k->p.inode,
1655 			d.k->p.offset)) {
1656 		target->bi_dir		= d.k->p.inode;
1657 		target->bi_dir_offset	= d.k->p.offset;
1658 		ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
1659 		goto out;
1660 	}
1661 
1662 	bch2_bkey_val_to_text(&buf, c, d.s_c);
1663 	prt_newline(&buf);
1664 	if (backpointer_exists)
1665 		bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
1666 
1667 	if (fsck_err_on(backpointer_exists &&
1668 			(S_ISDIR(target->bi_mode) ||
1669 			 target->bi_subvol),
1670 			c, inode_dir_multiple_links,
1671 			"%s %llu:%u with multiple links\n%s",
1672 			S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
1673 			target->bi_inum, target_snapshot, buf.buf)) {
1674 		ret = __remove_dirent(trans, d.k->p);
1675 		goto out;
1676 	}
1677 
1678 	/*
1679 	 * hardlinked file with nlink 0:
1680 	 * We're just adjusting nlink here so check_nlinks() will pick
1681 	 * it up, it ignores inodes with nlink 0
1682 	 */
1683 	if (fsck_err_on(backpointer_exists && !target->bi_nlink,
1684 			c, inode_multiple_links_but_nlink_0,
1685 			"inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
1686 			target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
1687 		target->bi_nlink++;
1688 		target->bi_flags &= ~BCH_INODE_unlinked;
1689 		ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
1690 		if (ret)
1691 			goto err;
1692 	}
1693 out:
1694 err:
1695 fsck_err:
1696 	bch2_trans_iter_exit(trans, &bp_iter);
1697 	printbuf_exit(&buf);
1698 	bch_err_fn(c, ret);
1699 	return ret;
1700 }
1701 
1702 static int check_dirent_target(struct btree_trans *trans,
1703 			       struct btree_iter *iter,
1704 			       struct bkey_s_c_dirent d,
1705 			       struct bch_inode_unpacked *target,
1706 			       u32 target_snapshot)
1707 {
1708 	struct bch_fs *c = trans->c;
1709 	struct bkey_i_dirent *n;
1710 	struct printbuf buf = PRINTBUF;
1711 	int ret = 0;
1712 
1713 	ret = check_dirent_inode_dirent(trans, iter, d, target, target_snapshot);
1714 	if (ret)
1715 		goto err;
1716 
1717 	if (fsck_err_on(d.v->d_type != inode_d_type(target),
1718 			c, dirent_d_type_wrong,
1719 			"incorrect d_type: got %s, should be %s:\n%s",
1720 			bch2_d_type_str(d.v->d_type),
1721 			bch2_d_type_str(inode_d_type(target)),
1722 			(printbuf_reset(&buf),
1723 			 bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
1724 		n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
1725 		ret = PTR_ERR_OR_ZERO(n);
1726 		if (ret)
1727 			goto err;
1728 
1729 		bkey_reassemble(&n->k_i, d.s_c);
1730 		n->v.d_type = inode_d_type(target);
1731 		if (n->v.d_type == DT_SUBVOL) {
1732 			n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
1733 			n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
1734 		} else {
1735 			n->v.d_inum = cpu_to_le64(target->bi_inum);
1736 		}
1737 
1738 		ret = bch2_trans_update(trans, iter, &n->k_i, 0);
1739 		if (ret)
1740 			goto err;
1741 
1742 		d = dirent_i_to_s_c(n);
1743 	}
1744 err:
1745 fsck_err:
1746 	printbuf_exit(&buf);
1747 	bch_err_fn(c, ret);
1748 	return ret;
1749 }
1750 
1751 /* find a subvolume that's a descendent of @snapshot: */
1752 static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
1753 {
1754 	struct btree_iter iter;
1755 	struct bkey_s_c k;
1756 	int ret;
1757 
1758 	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) {
1759 		if (k.k->type != KEY_TYPE_subvolume)
1760 			continue;
1761 
1762 		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
1763 		if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) {
1764 			bch2_trans_iter_exit(trans, &iter);
1765 			*subvolid = k.k->p.offset;
1766 			goto found;
1767 		}
1768 	}
1769 	if (!ret)
1770 		ret = -ENOENT;
1771 found:
1772 	bch2_trans_iter_exit(trans, &iter);
1773 	return ret;
1774 }
1775 
1776 static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
1777 				  struct bkey_s_c_dirent d)
1778 {
1779 	struct bch_fs *c = trans->c;
1780 	struct btree_iter subvol_iter = {};
1781 	struct bch_inode_unpacked subvol_root;
1782 	u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
1783 	u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
1784 	u32 parent_snapshot;
1785 	u64 parent_inum;
1786 	struct printbuf buf = PRINTBUF;
1787 	int ret = 0;
1788 
1789 	ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum);
1790 	if (ret && !bch2_err_matches(ret, ENOENT))
1791 		return ret;
1792 
1793 	if (fsck_err_on(ret, c, dirent_to_missing_parent_subvol,
1794 			"dirent parent_subvol points to missing subvolume\n%s",
1795 			(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
1796 	    fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
1797 			c, dirent_not_visible_in_parent_subvol,
1798 			"dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
1799 			parent_snapshot,
1800 			(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
1801 		u32 new_parent_subvol;
1802 		ret = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
1803 		if (ret)
1804 			goto err;
1805 
1806 		struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
1807 		ret = PTR_ERR_OR_ZERO(new_dirent);
1808 		if (ret)
1809 			goto err;
1810 
1811 		new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol);
1812 	}
1813 
1814 	struct bkey_s_c_subvolume s =
1815 		bch2_bkey_get_iter_typed(trans, &subvol_iter,
1816 					 BTREE_ID_subvolumes, POS(0, target_subvol),
1817 					 0, subvolume);
1818 	ret = bkey_err(s.s_c);
1819 	if (ret && !bch2_err_matches(ret, ENOENT))
1820 		return ret;
1821 
1822 	if (ret) {
1823 		if (fsck_err(c, dirent_to_missing_subvol,
1824 			     "dirent points to missing subvolume\n%s",
1825 			     (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
1826 			return __remove_dirent(trans, d.k->p);
1827 		ret = 0;
1828 		goto out;
1829 	}
1830 
1831 	if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol,
1832 			c, subvol_fs_path_parent_wrong,
1833 			"subvol with wrong fs_path_parent, should be be %u\n%s",
1834 			parent_subvol,
1835 			(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
1836 		struct bkey_i_subvolume *n =
1837 			bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
1838 		ret = PTR_ERR_OR_ZERO(n);
1839 		if (ret)
1840 			goto err;
1841 
1842 		n->v.fs_path_parent = cpu_to_le32(parent_subvol);
1843 	}
1844 
1845 	u64 target_inum = le64_to_cpu(s.v->inode);
1846 	u32 target_snapshot = le32_to_cpu(s.v->snapshot);
1847 
1848 	ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot);
1849 	if (ret && !bch2_err_matches(ret, ENOENT))
1850 		return ret;
1851 
1852 	if (fsck_err_on(parent_subvol != subvol_root.bi_parent_subvol,
1853 			c, inode_bi_parent_wrong,
1854 			"subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
1855 			target_inum,
1856 			subvol_root.bi_parent_subvol, parent_subvol)) {
1857 		subvol_root.bi_parent_subvol = parent_subvol;
1858 		ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot);
1859 		if (ret)
1860 			return ret;
1861 	}
1862 
1863 	ret = check_dirent_target(trans, iter, d, &subvol_root,
1864 				  target_snapshot);
1865 	if (ret)
1866 		return ret;
1867 out:
1868 err:
1869 fsck_err:
1870 	bch2_trans_iter_exit(trans, &subvol_iter);
1871 	printbuf_exit(&buf);
1872 	return ret;
1873 }
1874 
1875 static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
1876 			struct bkey_s_c k,
1877 			struct bch_hash_info *hash_info,
1878 			struct inode_walker *dir,
1879 			struct inode_walker *target,
1880 			struct snapshots_seen *s)
1881 {
1882 	struct bch_fs *c = trans->c;
1883 	struct bkey_s_c_dirent d;
1884 	struct inode_walker_entry *i;
1885 	struct printbuf buf = PRINTBUF;
1886 	struct bpos equiv;
1887 	int ret = 0;
1888 
1889 	ret = check_key_has_snapshot(trans, iter, k);
1890 	if (ret) {
1891 		ret = ret < 0 ? ret : 0;
1892 		goto out;
1893 	}
1894 
1895 	equiv = k.k->p;
1896 	equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
1897 
1898 	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
1899 	if (ret)
1900 		goto err;
1901 
1902 	if (k.k->type == KEY_TYPE_whiteout)
1903 		goto out;
1904 
1905 	if (dir->last_pos.inode != k.k->p.inode) {
1906 		ret = check_subdir_count(trans, dir);
1907 		if (ret)
1908 			goto err;
1909 	}
1910 
1911 	BUG_ON(!btree_iter_path(trans, iter)->should_be_locked);
1912 
1913 	i = walk_inode(trans, dir, k);
1914 	ret = PTR_ERR_OR_ZERO(i);
1915 	if (ret < 0)
1916 		goto err;
1917 
1918 	if (dir->first_this_inode && dir->inodes.nr)
1919 		*hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
1920 	dir->first_this_inode = false;
1921 
1922 	if (fsck_err_on(!i, c, dirent_in_missing_dir_inode,
1923 			"dirent in nonexisting directory:\n%s",
1924 			(printbuf_reset(&buf),
1925 			 bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1926 		ret = bch2_btree_delete_at(trans, iter,
1927 				BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1928 		goto out;
1929 	}
1930 
1931 	if (!i)
1932 		goto out;
1933 
1934 	if (fsck_err_on(!S_ISDIR(i->inode.bi_mode),
1935 			c, dirent_in_non_dir_inode,
1936 			"dirent in non directory inode type %s:\n%s",
1937 			bch2_d_type_str(inode_d_type(&i->inode)),
1938 			(printbuf_reset(&buf),
1939 			 bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1940 		ret = bch2_btree_delete_at(trans, iter, 0);
1941 		goto out;
1942 	}
1943 
1944 	ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k);
1945 	if (ret < 0)
1946 		goto err;
1947 	if (ret) {
1948 		/* dirent has been deleted */
1949 		ret = 0;
1950 		goto out;
1951 	}
1952 
1953 	if (k.k->type != KEY_TYPE_dirent)
1954 		goto out;
1955 
1956 	d = bkey_s_c_to_dirent(k);
1957 
1958 	if (d.v->d_type == DT_SUBVOL) {
1959 		ret = check_dirent_to_subvol(trans, iter, d);
1960 		if (ret)
1961 			goto err;
1962 	} else {
1963 		ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
1964 		if (ret)
1965 			goto err;
1966 
1967 		if (fsck_err_on(!target->inodes.nr,
1968 				c, dirent_to_missing_inode,
1969 				"dirent points to missing inode: (equiv %u)\n%s",
1970 				equiv.snapshot,
1971 				(printbuf_reset(&buf),
1972 				 bch2_bkey_val_to_text(&buf, c, k),
1973 				 buf.buf))) {
1974 			ret = __remove_dirent(trans, d.k->p);
1975 			if (ret)
1976 				goto err;
1977 		}
1978 
1979 		darray_for_each(target->inodes, i) {
1980 			ret = check_dirent_target(trans, iter, d,
1981 						  &i->inode, i->snapshot);
1982 			if (ret)
1983 				goto err;
1984 		}
1985 
1986 		if (d.v->d_type == DT_DIR)
1987 			for_each_visible_inode(c, s, dir, equiv.snapshot, i)
1988 				i->count++;
1989 	}
1990 out:
1991 err:
1992 fsck_err:
1993 	printbuf_exit(&buf);
1994 	bch_err_fn(c, ret);
1995 	return ret;
1996 }
1997 
1998 /*
1999  * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
2000  * validate d_type
2001  */
2002 int bch2_check_dirents(struct bch_fs *c)
2003 {
2004 	struct inode_walker dir = inode_walker_init();
2005 	struct inode_walker target = inode_walker_init();
2006 	struct snapshots_seen s;
2007 	struct bch_hash_info hash_info;
2008 
2009 	snapshots_seen_init(&s);
2010 
2011 	int ret = bch2_trans_run(c,
2012 		for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
2013 				POS(BCACHEFS_ROOT_INO, 0),
2014 				BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
2015 				k,
2016 				NULL, NULL,
2017 				BCH_TRANS_COMMIT_no_enospc,
2018 			check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
2019 		check_subdir_count_notnested(trans, &dir));
2020 
2021 	snapshots_seen_exit(&s);
2022 	inode_walker_exit(&dir);
2023 	inode_walker_exit(&target);
2024 	bch_err_fn(c, ret);
2025 	return ret;
2026 }
2027 
2028 static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
2029 		       struct bkey_s_c k,
2030 		       struct bch_hash_info *hash_info,
2031 		       struct inode_walker *inode)
2032 {
2033 	struct bch_fs *c = trans->c;
2034 	struct inode_walker_entry *i;
2035 	int ret;
2036 
2037 	ret = check_key_has_snapshot(trans, iter, k);
2038 	if (ret < 0)
2039 		return ret;
2040 	if (ret)
2041 		return 0;
2042 
2043 	i = walk_inode(trans, inode, k);
2044 	ret = PTR_ERR_OR_ZERO(i);
2045 	if (ret)
2046 		return ret;
2047 
2048 	if (inode->first_this_inode && inode->inodes.nr)
2049 		*hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode);
2050 	inode->first_this_inode = false;
2051 
2052 	if (fsck_err_on(!i, c, xattr_in_missing_inode,
2053 			"xattr for missing inode %llu",
2054 			k.k->p.inode))
2055 		return bch2_btree_delete_at(trans, iter, 0);
2056 
2057 	if (!i)
2058 		return 0;
2059 
2060 	ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
2061 fsck_err:
2062 	bch_err_fn(c, ret);
2063 	return ret;
2064 }
2065 
2066 /*
2067  * Walk xattrs: verify that they all have a corresponding inode
2068  */
2069 int bch2_check_xattrs(struct bch_fs *c)
2070 {
2071 	struct inode_walker inode = inode_walker_init();
2072 	struct bch_hash_info hash_info;
2073 	int ret = 0;
2074 
2075 	ret = bch2_trans_run(c,
2076 		for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
2077 			POS(BCACHEFS_ROOT_INO, 0),
2078 			BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
2079 			k,
2080 			NULL, NULL,
2081 			BCH_TRANS_COMMIT_no_enospc,
2082 		check_xattr(trans, &iter, k, &hash_info, &inode)));
2083 	bch_err_fn(c, ret);
2084 	return ret;
2085 }
2086 
2087 static int check_root_trans(struct btree_trans *trans)
2088 {
2089 	struct bch_fs *c = trans->c;
2090 	struct bch_inode_unpacked root_inode;
2091 	u32 snapshot;
2092 	u64 inum;
2093 	int ret;
2094 
2095 	ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
2096 	if (ret && !bch2_err_matches(ret, ENOENT))
2097 		return ret;
2098 
2099 	if (mustfix_fsck_err_on(ret, c, root_subvol_missing,
2100 				"root subvol missing")) {
2101 		struct bkey_i_subvolume root_subvol;
2102 
2103 		snapshot	= U32_MAX;
2104 		inum		= BCACHEFS_ROOT_INO;
2105 
2106 		bkey_subvolume_init(&root_subvol.k_i);
2107 		root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL;
2108 		root_subvol.v.flags	= 0;
2109 		root_subvol.v.snapshot	= cpu_to_le32(snapshot);
2110 		root_subvol.v.inode	= cpu_to_le64(inum);
2111 		ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol.k_i, 0);
2112 		bch_err_msg(c, ret, "writing root subvol");
2113 		if (ret)
2114 			goto err;
2115 	}
2116 
2117 	ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
2118 	if (ret && !bch2_err_matches(ret, ENOENT))
2119 		return ret;
2120 
2121 	if (mustfix_fsck_err_on(ret, c, root_dir_missing,
2122 				"root directory missing") ||
2123 	    mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
2124 				c, root_inode_not_dir,
2125 				"root inode not a directory")) {
2126 		bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
2127 				0, NULL);
2128 		root_inode.bi_inum = inum;
2129 
2130 		ret = __bch2_fsck_write_inode(trans, &root_inode, snapshot);
2131 		bch_err_msg(c, ret, "writing root inode");
2132 	}
2133 err:
2134 fsck_err:
2135 	return ret;
2136 }
2137 
2138 /* Get root directory, create if it doesn't exist: */
2139 int bch2_check_root(struct bch_fs *c)
2140 {
2141 	int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2142 		check_root_trans(trans));
2143 	bch_err_fn(c, ret);
2144 	return ret;
2145 }
2146 
2147 typedef DARRAY(u32) darray_u32;
2148 
2149 static bool darray_u32_has(darray_u32 *d, u32 v)
2150 {
2151 	darray_for_each(*d, i)
2152 		if (*i == v)
2153 			return true;
2154 	return false;
2155 }
2156 
2157 /*
2158  * We've checked that inode backpointers point to valid dirents; here, it's
2159  * sufficient to check that the subvolume root has a dirent:
2160  */
2161 static int subvol_has_dirent(struct btree_trans *trans, struct bkey_s_c_subvolume s)
2162 {
2163 	struct bch_inode_unpacked inode;
2164 	int ret = bch2_inode_find_by_inum_trans(trans,
2165 				(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
2166 				&inode);
2167 	if (ret)
2168 		return ret;
2169 
2170 	return inode.bi_dir != 0;
2171 }
2172 
2173 static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
2174 {
2175 	struct bch_fs *c = trans->c;
2176 	struct btree_iter parent_iter = {};
2177 	darray_u32 subvol_path = {};
2178 	struct printbuf buf = PRINTBUF;
2179 	int ret = 0;
2180 
2181 	if (k.k->type != KEY_TYPE_subvolume)
2182 		return 0;
2183 
2184 	while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) {
2185 		ret = darray_push(&subvol_path, k.k->p.offset);
2186 		if (ret)
2187 			goto err;
2188 
2189 		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
2190 
2191 		ret = subvol_has_dirent(trans, s);
2192 		if (ret < 0)
2193 			break;
2194 
2195 		if (fsck_err_on(!ret,
2196 				c, subvol_unreachable,
2197 				"unreachable subvolume %s",
2198 				(bch2_bkey_val_to_text(&buf, c, s.s_c),
2199 				 buf.buf))) {
2200 			ret = reattach_subvol(trans, s);
2201 			break;
2202 		}
2203 
2204 		u32 parent = le32_to_cpu(s.v->fs_path_parent);
2205 
2206 		if (darray_u32_has(&subvol_path, parent)) {
2207 			if (fsck_err(c, subvol_loop, "subvolume loop"))
2208 				ret = reattach_subvol(trans, s);
2209 			break;
2210 		}
2211 
2212 		bch2_trans_iter_exit(trans, &parent_iter);
2213 		bch2_trans_iter_init(trans, &parent_iter,
2214 				     BTREE_ID_subvolumes, POS(0, parent), 0);
2215 		k = bch2_btree_iter_peek_slot(&parent_iter);
2216 		ret = bkey_err(k);
2217 		if (ret)
2218 			goto err;
2219 
2220 		if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
2221 				c, subvol_unreachable,
2222 				"unreachable subvolume %s",
2223 				(bch2_bkey_val_to_text(&buf, c, s.s_c),
2224 				 buf.buf))) {
2225 			ret = reattach_subvol(trans, s);
2226 			break;
2227 		}
2228 	}
2229 fsck_err:
2230 err:
2231 	printbuf_exit(&buf);
2232 	darray_exit(&subvol_path);
2233 	bch2_trans_iter_exit(trans, &parent_iter);
2234 	return ret;
2235 }
2236 
2237 int bch2_check_subvolume_structure(struct bch_fs *c)
2238 {
2239 	int ret = bch2_trans_run(c,
2240 		for_each_btree_key_commit(trans, iter,
2241 				BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
2242 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2243 			check_subvol_path(trans, &iter, k)));
2244 	bch_err_fn(c, ret);
2245 	return ret;
2246 }
2247 
2248 struct pathbuf_entry {
2249 	u64	inum;
2250 	u32	snapshot;
2251 };
2252 
2253 typedef DARRAY(struct pathbuf_entry) pathbuf;
2254 
2255 static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
2256 {
2257 	darray_for_each(*p, i)
2258 		if (i->inum	== inum &&
2259 		    i->snapshot	== snapshot)
2260 			return true;
2261 	return false;
2262 }
2263 
2264 /*
2265  * Check that a given inode is reachable from its subvolume root - we already
2266  * verified subvolume connectivity:
2267  *
2268  * XXX: we should also be verifying that inodes are in the right subvolumes
2269  */
2270 static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k)
2271 {
2272 	struct bch_fs *c = trans->c;
2273 	struct btree_iter inode_iter = {};
2274 	struct bch_inode_unpacked inode;
2275 	struct printbuf buf = PRINTBUF;
2276 	u32 snapshot = bch2_snapshot_equiv(c, inode_k.k->p.snapshot);
2277 	int ret = 0;
2278 
2279 	p->nr = 0;
2280 
2281 	BUG_ON(bch2_inode_unpack(inode_k, &inode));
2282 
2283 	while (!inode.bi_subvol) {
2284 		struct btree_iter dirent_iter;
2285 		struct bkey_s_c_dirent d;
2286 		u32 parent_snapshot = snapshot;
2287 
2288 		d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot);
2289 		ret = bkey_err(d.s_c);
2290 		if (ret && !bch2_err_matches(ret, ENOENT))
2291 			break;
2292 
2293 		if (!ret && !dirent_points_to_inode(d, &inode)) {
2294 			bch2_trans_iter_exit(trans, &dirent_iter);
2295 			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
2296 		}
2297 
2298 		if (bch2_err_matches(ret, ENOENT)) {
2299 			ret = 0;
2300 			if (fsck_err(c, inode_unreachable,
2301 				     "unreachable inode\n%s",
2302 				     (printbuf_reset(&buf),
2303 				      bch2_bkey_val_to_text(&buf, c, inode_k),
2304 				      buf.buf)))
2305 				ret = reattach_inode(trans, &inode, snapshot);
2306 			goto out;
2307 		}
2308 
2309 		bch2_trans_iter_exit(trans, &dirent_iter);
2310 
2311 		if (!S_ISDIR(inode.bi_mode))
2312 			break;
2313 
2314 		ret = darray_push(p, ((struct pathbuf_entry) {
2315 			.inum		= inode.bi_inum,
2316 			.snapshot	= snapshot,
2317 		}));
2318 		if (ret)
2319 			return ret;
2320 
2321 		snapshot = parent_snapshot;
2322 
2323 		bch2_trans_iter_exit(trans, &inode_iter);
2324 		inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
2325 					     SPOS(0, inode.bi_dir, snapshot), 0);
2326 		ret = bkey_err(inode_k) ?:
2327 			!bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
2328 			: bch2_inode_unpack(inode_k, &inode);
2329 		if (ret) {
2330 			/* Should have been caught in dirents pass */
2331 			if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
2332 				bch_err(c, "error looking up parent directory: %i", ret);
2333 			break;
2334 		}
2335 
2336 		snapshot = inode_k.k->p.snapshot;
2337 
2338 		if (path_is_dup(p, inode.bi_inum, snapshot)) {
2339 			/* XXX print path */
2340 			bch_err(c, "directory structure loop");
2341 
2342 			darray_for_each(*p, i)
2343 				pr_err("%llu:%u", i->inum, i->snapshot);
2344 			pr_err("%llu:%u", inode.bi_inum, snapshot);
2345 
2346 			if (fsck_err(c, dir_loop, "directory structure loop")) {
2347 				ret = remove_backpointer(trans, &inode);
2348 				bch_err_msg(c, ret, "removing dirent");
2349 				if (ret)
2350 					break;
2351 
2352 				ret = reattach_inode(trans, &inode, snapshot);
2353 				bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
2354 			}
2355 			break;
2356 		}
2357 	}
2358 out:
2359 fsck_err:
2360 	bch2_trans_iter_exit(trans, &inode_iter);
2361 	printbuf_exit(&buf);
2362 	bch_err_fn(c, ret);
2363 	return ret;
2364 }
2365 
2366 /*
2367  * Check for unreachable inodes, as well as loops in the directory structure:
2368  * After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's
2369  * unreachable:
2370  */
2371 int bch2_check_directory_structure(struct bch_fs *c)
2372 {
2373 	pathbuf path = { 0, };
2374 	int ret;
2375 
2376 	ret = bch2_trans_run(c,
2377 		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
2378 					  BTREE_ITER_INTENT|
2379 					  BTREE_ITER_PREFETCH|
2380 					  BTREE_ITER_ALL_SNAPSHOTS, k,
2381 					  NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
2382 			if (!bkey_is_inode(k.k))
2383 				continue;
2384 
2385 			if (bch2_inode_flags(k) & BCH_INODE_unlinked)
2386 				continue;
2387 
2388 			check_path(trans, &path, k);
2389 		})));
2390 	darray_exit(&path);
2391 
2392 	bch_err_fn(c, ret);
2393 	return ret;
2394 }
2395 
2396 struct nlink_table {
2397 	size_t		nr;
2398 	size_t		size;
2399 
2400 	struct nlink {
2401 		u64	inum;
2402 		u32	snapshot;
2403 		u32	count;
2404 	}		*d;
2405 };
2406 
2407 static int add_nlink(struct bch_fs *c, struct nlink_table *t,
2408 		     u64 inum, u32 snapshot)
2409 {
2410 	if (t->nr == t->size) {
2411 		size_t new_size = max_t(size_t, 128UL, t->size * 2);
2412 		void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);
2413 
2414 		if (!d) {
2415 			bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
2416 				new_size);
2417 			return -BCH_ERR_ENOMEM_fsck_add_nlink;
2418 		}
2419 
2420 		if (t->d)
2421 			memcpy(d, t->d, t->size * sizeof(t->d[0]));
2422 		kvfree(t->d);
2423 
2424 		t->d = d;
2425 		t->size = new_size;
2426 	}
2427 
2428 
2429 	t->d[t->nr++] = (struct nlink) {
2430 		.inum		= inum,
2431 		.snapshot	= snapshot,
2432 	};
2433 
2434 	return 0;
2435 }
2436 
2437 static int nlink_cmp(const void *_l, const void *_r)
2438 {
2439 	const struct nlink *l = _l;
2440 	const struct nlink *r = _r;
2441 
2442 	return cmp_int(l->inum, r->inum);
2443 }
2444 
2445 static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
2446 		     struct nlink_table *links,
2447 		     u64 range_start, u64 range_end, u64 inum, u32 snapshot)
2448 {
2449 	struct nlink *link, key = {
2450 		.inum = inum, .snapshot = U32_MAX,
2451 	};
2452 
2453 	if (inum < range_start || inum >= range_end)
2454 		return;
2455 
2456 	link = __inline_bsearch(&key, links->d, links->nr,
2457 				sizeof(links->d[0]), nlink_cmp);
2458 	if (!link)
2459 		return;
2460 
2461 	while (link > links->d && link[0].inum == link[-1].inum)
2462 		--link;
2463 
2464 	for (; link < links->d + links->nr && link->inum == inum; link++)
2465 		if (ref_visible(c, s, snapshot, link->snapshot)) {
2466 			link->count++;
2467 			if (link->snapshot >= snapshot)
2468 				break;
2469 		}
2470 }
2471 
2472 noinline_for_stack
2473 static int check_nlinks_find_hardlinks(struct bch_fs *c,
2474 				       struct nlink_table *t,
2475 				       u64 start, u64 *end)
2476 {
2477 	int ret = bch2_trans_run(c,
2478 		for_each_btree_key(trans, iter, BTREE_ID_inodes,
2479 				   POS(0, start),
2480 				   BTREE_ITER_INTENT|
2481 				   BTREE_ITER_PREFETCH|
2482 				   BTREE_ITER_ALL_SNAPSHOTS, k, ({
2483 			if (!bkey_is_inode(k.k))
2484 				continue;
2485 
2486 			/* Should never fail, checked by bch2_inode_invalid: */
2487 			struct bch_inode_unpacked u;
2488 			BUG_ON(bch2_inode_unpack(k, &u));
2489 
2490 			/*
2491 			 * Backpointer and directory structure checks are sufficient for
2492 			 * directories, since they can't have hardlinks:
2493 			 */
2494 			if (S_ISDIR(u.bi_mode))
2495 				continue;
2496 
2497 			if (!u.bi_nlink)
2498 				continue;
2499 
2500 			ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
2501 			if (ret) {
2502 				*end = k.k->p.offset;
2503 				ret = 0;
2504 				break;
2505 			}
2506 			0;
2507 		})));
2508 
2509 	bch_err_fn(c, ret);
2510 	return ret;
2511 }
2512 
2513 noinline_for_stack
2514 static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
2515 				     u64 range_start, u64 range_end)
2516 {
2517 	struct snapshots_seen s;
2518 
2519 	snapshots_seen_init(&s);
2520 
2521 	int ret = bch2_trans_run(c,
2522 		for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
2523 				   BTREE_ITER_INTENT|
2524 				   BTREE_ITER_PREFETCH|
2525 				   BTREE_ITER_ALL_SNAPSHOTS, k, ({
2526 			ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
2527 			if (ret)
2528 				break;
2529 
2530 			if (k.k->type == KEY_TYPE_dirent) {
2531 				struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
2532 
2533 				if (d.v->d_type != DT_DIR &&
2534 				    d.v->d_type != DT_SUBVOL)
2535 					inc_link(c, &s, links, range_start, range_end,
2536 						 le64_to_cpu(d.v->d_inum),
2537 						 bch2_snapshot_equiv(c, d.k->p.snapshot));
2538 			}
2539 			0;
2540 		})));
2541 
2542 	snapshots_seen_exit(&s);
2543 
2544 	bch_err_fn(c, ret);
2545 	return ret;
2546 }
2547 
2548 static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
2549 				     struct bkey_s_c k,
2550 				     struct nlink_table *links,
2551 				     size_t *idx, u64 range_end)
2552 {
2553 	struct bch_fs *c = trans->c;
2554 	struct bch_inode_unpacked u;
2555 	struct nlink *link = &links->d[*idx];
2556 	int ret = 0;
2557 
2558 	if (k.k->p.offset >= range_end)
2559 		return 1;
2560 
2561 	if (!bkey_is_inode(k.k))
2562 		return 0;
2563 
2564 	BUG_ON(bch2_inode_unpack(k, &u));
2565 
2566 	if (S_ISDIR(u.bi_mode))
2567 		return 0;
2568 
2569 	if (!u.bi_nlink)
2570 		return 0;
2571 
2572 	while ((cmp_int(link->inum, k.k->p.offset) ?:
2573 		cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
2574 		BUG_ON(*idx == links->nr);
2575 		link = &links->d[++*idx];
2576 	}
2577 
2578 	if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
2579 			c, inode_wrong_nlink,
2580 			"inode %llu type %s has wrong i_nlink (%u, should be %u)",
2581 			u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
2582 			bch2_inode_nlink_get(&u), link->count)) {
2583 		bch2_inode_nlink_set(&u, link->count);
2584 		ret = __bch2_fsck_write_inode(trans, &u, k.k->p.snapshot);
2585 	}
2586 fsck_err:
2587 	return ret;
2588 }
2589 
2590 noinline_for_stack
2591 static int check_nlinks_update_hardlinks(struct bch_fs *c,
2592 			       struct nlink_table *links,
2593 			       u64 range_start, u64 range_end)
2594 {
2595 	size_t idx = 0;
2596 
2597 	int ret = bch2_trans_run(c,
2598 		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
2599 				POS(0, range_start),
2600 				BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
2601 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2602 			check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
2603 	if (ret < 0) {
2604 		bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
2605 		return ret;
2606 	}
2607 
2608 	return 0;
2609 }
2610 
2611 int bch2_check_nlinks(struct bch_fs *c)
2612 {
2613 	struct nlink_table links = { 0 };
2614 	u64 this_iter_range_start, next_iter_range_start = 0;
2615 	int ret = 0;
2616 
2617 	do {
2618 		this_iter_range_start = next_iter_range_start;
2619 		next_iter_range_start = U64_MAX;
2620 
2621 		ret = check_nlinks_find_hardlinks(c, &links,
2622 						  this_iter_range_start,
2623 						  &next_iter_range_start);
2624 
2625 		ret = check_nlinks_walk_dirents(c, &links,
2626 					  this_iter_range_start,
2627 					  next_iter_range_start);
2628 		if (ret)
2629 			break;
2630 
2631 		ret = check_nlinks_update_hardlinks(c, &links,
2632 					 this_iter_range_start,
2633 					 next_iter_range_start);
2634 		if (ret)
2635 			break;
2636 
2637 		links.nr = 0;
2638 	} while (next_iter_range_start != U64_MAX);
2639 
2640 	kvfree(links.d);
2641 	bch_err_fn(c, ret);
2642 	return ret;
2643 }
2644 
2645 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
2646 			     struct bkey_s_c k)
2647 {
2648 	struct bkey_s_c_reflink_p p;
2649 	struct bkey_i_reflink_p *u;
2650 
2651 	if (k.k->type != KEY_TYPE_reflink_p)
2652 		return 0;
2653 
2654 	p = bkey_s_c_to_reflink_p(k);
2655 
2656 	if (!p.v->front_pad && !p.v->back_pad)
2657 		return 0;
2658 
2659 	u = bch2_trans_kmalloc(trans, sizeof(*u));
2660 	int ret = PTR_ERR_OR_ZERO(u);
2661 	if (ret)
2662 		return ret;
2663 
2664 	bkey_reassemble(&u->k_i, k);
2665 	u->v.front_pad	= 0;
2666 	u->v.back_pad	= 0;
2667 
2668 	return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN);
2669 }
2670 
2671 int bch2_fix_reflink_p(struct bch_fs *c)
2672 {
2673 	if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
2674 		return 0;
2675 
2676 	int ret = bch2_trans_run(c,
2677 		for_each_btree_key_commit(trans, iter,
2678 				BTREE_ID_extents, POS_MIN,
2679 				BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|
2680 				BTREE_ITER_ALL_SNAPSHOTS, k,
2681 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2682 			fix_reflink_p_key(trans, &iter, k)));
2683 	bch_err_fn(c, ret);
2684 	return ret;
2685 }
2686