xref: /linux/fs/bcachefs/fsck.c (revision 5f30ee493044e9ea3a46167e5597a96f5c302adb)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_cache.h"
6 #include "btree_update.h"
7 #include "buckets.h"
8 #include "darray.h"
9 #include "dirent.h"
10 #include "error.h"
11 #include "fs-common.h"
12 #include "fsck.h"
13 #include "inode.h"
14 #include "keylist.h"
15 #include "recovery_passes.h"
16 #include "snapshot.h"
17 #include "super.h"
18 #include "xattr.h"
19 
20 #include <linux/bsearch.h>
21 #include <linux/dcache.h> /* struct qstr */
22 
23 /*
24  * XXX: this is handling transaction restarts without returning
25  * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
26  */
27 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
28 				    u32 snapshot)
29 {
30 	u64 sectors = 0;
31 
32 	int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
33 				SPOS(inum, 0, snapshot),
34 				POS(inum, U64_MAX),
35 				0, k, ({
36 		if (bkey_extent_is_allocation(k.k))
37 			sectors += k.k->size;
38 		0;
39 	}));
40 
41 	return ret ?: sectors;
42 }
43 
44 static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
45 				    u32 snapshot)
46 {
47 	u64 subdirs = 0;
48 
49 	int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_dirents,
50 				    SPOS(inum, 0, snapshot),
51 				    POS(inum, U64_MAX),
52 				    0, k, ({
53 		if (k.k->type == KEY_TYPE_dirent &&
54 		    bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
55 			subdirs++;
56 		0;
57 	}));
58 
59 	return ret ?: subdirs;
60 }
61 
62 static int subvol_lookup(struct btree_trans *trans, u32 subvol,
63 			 u32 *snapshot, u64 *inum)
64 {
65 	struct bch_subvolume s;
66 	int ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
67 
68 	*snapshot = le32_to_cpu(s.snapshot);
69 	*inum = le64_to_cpu(s.inode);
70 	return ret;
71 }
72 
73 static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
74 			      struct bch_inode_unpacked *inode)
75 {
76 	struct btree_iter iter;
77 	struct bkey_s_c k;
78 	int ret;
79 
80 	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr),
81 				     BTREE_ITER_all_snapshots, k, ret) {
82 		if (k.k->p.offset != inode_nr)
83 			break;
84 		if (!bkey_is_inode(k.k))
85 			continue;
86 		ret = bch2_inode_unpack(k, inode);
87 		goto found;
88 	}
89 	ret = -BCH_ERR_ENOENT_inode;
90 found:
91 	bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
92 	bch2_trans_iter_exit(trans, &iter);
93 	return ret;
94 }
95 
96 static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
97 			struct bch_inode_unpacked *inode,
98 			u32 *snapshot)
99 {
100 	struct btree_iter iter;
101 	struct bkey_s_c k;
102 	int ret;
103 
104 	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
105 			       SPOS(0, inode_nr, *snapshot), 0);
106 	ret = bkey_err(k);
107 	if (ret)
108 		goto err;
109 
110 	ret = bkey_is_inode(k.k)
111 		? bch2_inode_unpack(k, inode)
112 		: -BCH_ERR_ENOENT_inode;
113 	if (!ret)
114 		*snapshot = iter.pos.snapshot;
115 err:
116 	bch2_trans_iter_exit(trans, &iter);
117 	return ret;
118 }
119 
/*
 * Look up dirent @name in directory @dir within @snapshot via the dirent
 * hash table.
 *
 * On success, fills *target with the dirent's target inode number and *type
 * with its d_type, and returns 0; otherwise returns the hash-lookup error.
 *
 * NOTE(review): on the error path the iterator is not exited here —
 * presumably bch2_hash_lookup_in_snapshot() cleans it up on failure; verify.
 * The slot is also re-peeked and converted to a dirent without re-checking
 * the key type — assumed guaranteed by the successful hash lookup.
 */
static int lookup_dirent_in_snapshot(struct btree_trans *trans,
			   struct bch_hash_info hash_info,
			   subvol_inum dir, struct qstr *name,
			   u64 *target, unsigned *type, u32 snapshot)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
							 &hash_info, dir, name, 0, snapshot);
	int ret = bkey_err(k);
	if (ret)
		return ret;

	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
	*target = le64_to_cpu(d.v->d_inum);
	*type = d.v->d_type;
	bch2_trans_iter_exit(trans, &iter);
	return 0;
}
138 
/*
 * Delete the dirent at @pos from the dirents hash table.
 *
 * The directory inode (first version found in any snapshot) is looked up
 * first, because hash-table deletion needs the directory's hash seed/type.
 */
static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bch_inode_unpacked dir_inode;
	struct bch_hash_info dir_hash_info;
	int ret;

	ret = lookup_first_inode(trans, pos.inode, &dir_inode);
	if (ret)
		goto err;

	dir_hash_info = bch2_hash_info_init(c, &dir_inode);

	bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent);

	/* traverse first so the hash delete operates on a valid iterator */
	ret =   bch2_btree_iter_traverse(&iter) ?:
		bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
				    &dir_hash_info, &iter,
				    BTREE_UPDATE_internal_snapshot_node);
	bch2_trans_iter_exit(trans, &iter);
err:
	bch_err_fn(c, ret);
	return ret;
}
164 
165 /* Get lost+found, create if it doesn't exist: */
/*
 * Get lost+found for @snapshot's tree, creating it if it doesn't exist.
 *
 * Walks up from @snapshot to its snapshot tree and master subvolume, finds
 * that subvolume's root inode, then looks for a "lost+found" dirent in the
 * root directory. If missing, a fresh lost+found directory inode and dirent
 * are created.
 *
 * @reattaching_inum is used as a repair fallback: if the master subvolume's
 * root inode field is zero, it's pointed at the inode being reattached.
 *
 * Returns 0 with *lostfound filled in, or a lookup/creation error.
 */
static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
			    struct bch_inode_unpacked *lostfound,
			    u64 reattaching_inum)
{
	struct bch_fs *c = trans->c;
	struct qstr lostfound_str = QSTR("lost+found");
	u64 inum = 0;
	unsigned d_type = 0;
	int ret;

	struct bch_snapshot_tree st;
	ret = bch2_snapshot_tree_lookup(trans,
			bch2_snapshot_tree(c, snapshot), &st);
	if (ret)
		return ret;

	subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) };

	struct bch_subvolume subvol;
	ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol),
				 false, 0, &subvol);
	bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u",
		    le32_to_cpu(st.master_subvol), snapshot);
	if (ret)
		return ret;

	if (!subvol.inode) {
		/*
		 * Repair: the subvolume key is missing its root inode; point
		 * it at the inode we're reattaching.
		 */
		struct btree_iter iter;
		struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter,
				BTREE_ID_subvolumes, POS(0, le32_to_cpu(st.master_subvol)),
				0, subvolume);
		ret = PTR_ERR_OR_ZERO(subvol);
		if (ret)
			return ret;

		subvol->v.inode = cpu_to_le64(reattaching_inum);
		bch2_trans_iter_exit(trans, &iter);
	}

	root_inum.inum = le64_to_cpu(subvol.inode);

	struct bch_inode_unpacked root_inode;
	struct bch_hash_info root_hash_info;
	u32 root_inode_snapshot = snapshot;
	ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot);
	bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
		    root_inum.inum, le32_to_cpu(st.master_subvol));
	if (ret)
		return ret;

	root_hash_info = bch2_hash_info_init(c, &root_inode);

	ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
			      &lostfound_str, &inum, &d_type, snapshot);
	if (bch2_err_matches(ret, ENOENT))
		goto create_lostfound;

	bch_err_fn(c, ret);
	if (ret)
		return ret;

	if (d_type != DT_DIR) {
		bch_err(c, "error looking up lost+found: not a directory");
		return -BCH_ERR_ENOENT_not_directory;
	}

	/*
	 * The bch2_check_dirents pass has already run, dangling dirents
	 * shouldn't exist here:
	 */
	ret = lookup_inode(trans, inum, lostfound, &snapshot);
	bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
		    inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
	return ret;

create_lostfound:
	/*
	 * XXX: we could have a nicer log message here  if we had a nice way to
	 * walk backpointers to print a path
	 */
	bch_notice(c, "creating lost+found in snapshot %u", le32_to_cpu(st.root_snapshot));

	u64 now = bch2_current_time(c);
	struct btree_iter lostfound_iter = { NULL };
	u64 cpu = raw_smp_processor_id();

	/* new directory inode: mode 0700, parented at the root inode */
	bch2_inode_init_early(c, lostfound);
	bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
	lostfound->bi_dir = root_inode.bi_inum;

	/* the new subdirectory bumps the root's link count */
	root_inode.bi_nlink++;

	ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
	if (ret)
		goto err;

	bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot);
	ret = bch2_btree_iter_traverse(&lostfound_iter);
	if (ret)
		goto err;

	/* create the "lost+found" dirent, then write the new inode */
	ret =   bch2_dirent_create_snapshot(trans,
				0, root_inode.bi_inum, snapshot, &root_hash_info,
				mode_to_type(lostfound->bi_mode),
				&lostfound_str,
				lostfound->bi_inum,
				&lostfound->bi_dir_offset,
				STR_HASH_must_create) ?:
		bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
				       BTREE_UPDATE_internal_snapshot_node);
err:
	bch_err_msg(c, ret, "creating lost+found");
	bch2_trans_iter_exit(trans, &lostfound_iter);
	return ret;
}
281 
/*
 * Reattach an orphaned inode by creating a dirent for it in lost+found.
 *
 * Subvolume root inodes are reattached as "subvol-N" under the root
 * subvolume's lost+found; ordinary inodes as "<inum>" under the lost+found
 * of their own snapshot. The inode's backpointer (bi_dir/bi_dir_offset) is
 * updated to the new dirent and the inode rewritten.
 */
static int reattach_inode(struct btree_trans *trans,
			  struct bch_inode_unpacked *inode,
			  u32 inode_snapshot)
{
	struct bch_hash_info dir_hash;
	struct bch_inode_unpacked lostfound;
	char name_buf[20];
	struct qstr name;
	u64 dir_offset = 0;
	u32 dirent_snapshot = inode_snapshot;
	int ret;

	if (inode->bi_subvol) {
		/* subvolume roots get reparented under the root subvolume */
		inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;

		u64 root_inum;
		ret = subvol_lookup(trans, inode->bi_parent_subvol,
				    &dirent_snapshot, &root_inum);
		if (ret)
			return ret;

		snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
	} else {
		snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
	}

	ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
	if (ret)
		return ret;

	if (S_ISDIR(inode->bi_mode)) {
		/* a reattached directory adds a link to lost+found */
		lostfound.bi_nlink++;

		ret = __bch2_fsck_write_inode(trans, &lostfound, U32_MAX);
		if (ret)
			return ret;
	}

	dir_hash = bch2_hash_info_init(trans->c, &lostfound);

	name = (struct qstr) QSTR(name_buf);

	ret = bch2_dirent_create_snapshot(trans,
				inode->bi_parent_subvol, lostfound.bi_inum,
				dirent_snapshot,
				&dir_hash,
				inode_d_type(inode),
				&name,
				inode->bi_subvol ?: inode->bi_inum,
				&dir_offset,
				STR_HASH_must_create);
	if (ret)
		return ret;

	/* point the inode's backpointer at the new dirent */
	inode->bi_dir		= lostfound.bi_inum;
	inode->bi_dir_offset	= dir_offset;

	return __bch2_fsck_write_inode(trans, inode, inode_snapshot);
}
341 
/*
 * Delete the dirent that @inode's backpointer (bi_dir, bi_dir_offset)
 * points at. Returns an error if the dirent doesn't exist or the delete
 * fails.
 */
static int remove_backpointer(struct btree_trans *trans,
			      struct bch_inode_unpacked *inode)
{
	struct btree_iter iter;
	struct bkey_s_c_dirent d;
	int ret;

	d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents,
				     POS(inode->bi_dir, inode->bi_dir_offset), 0,
				     dirent);
	ret =   bkey_err(d) ?:
		__remove_dirent(trans, d.k->p);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
357 
/*
 * Reattach an orphaned subvolume: remove its root inode's stale dirent and
 * create a new one in lost+found (via reattach_inode()).
 */
static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s)
{
	struct bch_fs *c = trans->c;

	struct bch_inode_unpacked inode;
	int ret = bch2_inode_find_by_inum_trans(trans,
				(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
				&inode);
	if (ret)
		return ret;

	ret = remove_backpointer(trans, &inode);
	bch_err_msg(c, ret, "removing dirent");
	if (ret)
		return ret;

	ret = reattach_inode(trans, &inode, le32_to_cpu(s.v->snapshot));
	bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
	return ret;
}
378 
/*
 * Recreate a missing subvolume key for @subvolid with snapshot @snapshotid
 * and root inode @inum.
 *
 * If @inum is zero (called from check_dirents, where the subvolume's root
 * inode is known to be missing), a fresh root directory inode is created
 * first. The snapshot key is then pointed back at the subvolume, and the
 * snapshot tree's master_subvol is filled in if unset.
 *
 * Only leaf snapshots are supported; interior nodes return
 * -BCH_ERR_fsck_repair_unimplemented.
 */
static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
{
	struct bch_fs *c = trans->c;

	if (!bch2_snapshot_is_leaf(c, snapshotid)) {
		bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
		return -BCH_ERR_fsck_repair_unimplemented;
	}

	/*
	 * If inum isn't set, that means we're being called from check_dirents,
	 * not check_inodes - the root of this subvolume doesn't exist or we
	 * would have found it there:
	 */
	if (!inum) {
		struct btree_iter inode_iter = {};
		struct bch_inode_unpacked new_inode;
		u64 cpu = raw_smp_processor_id();

		bch2_inode_init_early(c, &new_inode);
		bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);

		new_inode.bi_subvol = subvolid;

		int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
			  bch2_btree_iter_traverse(&inode_iter) ?:
			  bch2_inode_write(trans, &inode_iter, &new_inode);
		bch2_trans_iter_exit(trans, &inode_iter);
		if (ret)
			return ret;

		inum = new_inode.bi_inum;
	}

	bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);

	struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
	int ret = PTR_ERR_OR_ZERO(new_subvol);
	if (ret)
		return ret;

	bkey_subvolume_init(&new_subvol->k_i);
	new_subvol->k.p.offset	= subvolid;
	new_subvol->v.snapshot	= cpu_to_le32(snapshotid);
	new_subvol->v.inode	= cpu_to_le64(inum);
	ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
	if (ret)
		return ret;

	/* link the snapshot key back to the reconstructed subvolume */
	struct btree_iter iter;
	struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_snapshots, POS(0, snapshotid),
			0, snapshot);
	ret = PTR_ERR_OR_ZERO(s);
	bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
	if (ret)
		return ret;

	u32 snapshot_tree = le32_to_cpu(s->v.tree);

	s->v.subvol = cpu_to_le32(subvolid);
	SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
	bch2_trans_iter_exit(trans, &iter);

	/* @iter is deliberately reused for the snapshot tree lookup below */
	struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
			0, snapshot_tree);
	ret = PTR_ERR_OR_ZERO(st);
	bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
	if (ret)
		return ret;

	if (!st->v.master_subvol)
		st->v.master_subvol = cpu_to_le32(subvolid);

	bch2_trans_iter_exit(trans, &iter);
	return 0;
}
457 
458 static int reconstruct_inode(struct btree_trans *trans, u32 snapshot, u64 inum, u64 size, unsigned mode)
459 {
460 	struct bch_fs *c = trans->c;
461 	struct bch_inode_unpacked new_inode;
462 
463 	bch2_inode_init_early(c, &new_inode);
464 	bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, mode|0755, 0, NULL);
465 	new_inode.bi_size = size;
466 	new_inode.bi_inum = inum;
467 
468 	return __bch2_fsck_write_inode(trans, &new_inode, snapshot);
469 }
470 
/*
 * Reconstruct a missing regular-file inode for @inum in @snapshot, sizing it
 * from the end of the last extent found for that inode.
 *
 * NOTE(review): if no extent exists at all, peek_prev() may yield a null key
 * and the k.k->p.offset dereference below would crash — presumably callers
 * only invoke this when extents for @inum exist; verify.
 */
static int reconstruct_reg_inode(struct btree_trans *trans, u32 snapshot, u64 inum)
{
	struct btree_iter iter = {};

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
	struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter);
	bch2_trans_iter_exit(trans, &iter);
	int ret = bkey_err(k);
	if (ret)
		return ret;

	/* extent offsets are in 512-byte sectors; convert to bytes */
	return reconstruct_inode(trans, snapshot, inum, k.k->p.offset << 9, S_IFREG);
}
484 
/*
 * Tracks which snapshot IDs have been seen at the current key position while
 * walking a btree in natural key order.
 */
struct snapshots_seen {
	struct bpos			pos;	/* position @ids applies to */
	snapshot_id_list		ids;	/* sorted list of snapshot IDs */
};

/* Free the snapshot ID list */
static inline void snapshots_seen_exit(struct snapshots_seen *s)
{
	darray_exit(&s->ids);
}

/* Zero-initialize; the embedded darray starts empty */
static inline void snapshots_seen_init(struct snapshots_seen *s)
{
	memset(s, 0, sizeof(*s));
}
499 
500 static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
501 {
502 	u32 *i;
503 	__darray_for_each(s->ids, i) {
504 		if (*i == id)
505 			return 0;
506 		if (*i > id)
507 			break;
508 	}
509 
510 	int ret = darray_insert_item(&s->ids, i - s->ids.data, id);
511 	if (ret)
512 		bch_err(c, "error reallocating snapshots_seen table (size %zu)",
513 			s->ids.size);
514 	return ret;
515 }
516 
/*
 * Record that @pos's snapshot has been seen: reset the seen-list whenever we
 * move to a new key position, then add pos.snapshot (deduplicated).
 */
static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
				 enum btree_id btree_id, struct bpos pos)
{
	if (!bkey_eq(s->pos, pos))
		s->ids.nr = 0;
	s->pos = pos;

	return snapshot_list_add_nodup(c, &s->ids, pos.snapshot);
}
526 
527 /**
528  * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
529  * and @ancestor hasn't been overwritten in @seen
530  *
531  * @c:		filesystem handle
532  * @seen:	list of snapshot ids already seen at current position
533  * @id:		descendent snapshot id
534  * @ancestor:	ancestor snapshot id
535  *
536  * Returns:	whether key in @ancestor snapshot is visible in @id snapshot
537  */
538 static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
539 				    u32 id, u32 ancestor)
540 {
541 	ssize_t i;
542 
543 	EBUG_ON(id > ancestor);
544 
545 	/* @ancestor should be the snapshot most recently added to @seen */
546 	EBUG_ON(ancestor != seen->pos.snapshot);
547 	EBUG_ON(ancestor != darray_last(seen->ids));
548 
549 	if (id == ancestor)
550 		return true;
551 
552 	if (!bch2_snapshot_is_ancestor(c, id, ancestor))
553 		return false;
554 
555 	/*
556 	 * We know that @id is a descendant of @ancestor, we're checking if
557 	 * we've seen a key that overwrote @ancestor - i.e. also a descendent of
558 	 * @ascestor and with @id as a descendent.
559 	 *
560 	 * But we already know that we're scanning IDs between @id and @ancestor
561 	 * numerically, since snapshot ID lists are kept sorted, so if we find
562 	 * an id that's an ancestor of @id we're done:
563 	 */
564 
565 	for (i = seen->ids.nr - 2;
566 	     i >= 0 && seen->ids.data[i] >= id;
567 	     --i)
568 		if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]))
569 			return false;
570 
571 	return true;
572 }
573 
574 /**
575  * ref_visible - given a key with snapshot id @src that points to a key with
576  * snapshot id @dst, test whether there is some snapshot in which @dst is
577  * visible.
578  *
579  * @c:		filesystem handle
580  * @s:		list of snapshot IDs already seen at @src
581  * @src:	snapshot ID of src key
582  * @dst:	snapshot ID of dst key
583  * Returns:	true if there is some snapshot in which @dst is visible
584  *
585  * Assumes we're visiting @src keys in natural key order
586  */
587 static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
588 			u32 src, u32 dst)
589 {
590 	return dst <= src
591 		? key_visible_in_snapshot(c, s, dst, src)
592 		: bch2_snapshot_is_ancestor(c, src, dst);
593 }
594 
595 static int ref_visible2(struct bch_fs *c,
596 			u32 src, struct snapshots_seen *src_seen,
597 			u32 dst, struct snapshots_seen *dst_seen)
598 {
599 	if (dst > src) {
600 		swap(dst, src);
601 		swap(dst_seen, src_seen);
602 	}
603 	return key_visible_in_snapshot(c, src_seen, dst, src);
604 }
605 
/*
 * Iterate @_i over the inode versions in walker @_w that are visible in
 * @_snapshot, given the snapshot IDs already seen in @_s. Relies on
 * (_w)->inodes being sorted by snapshot ID.
 */
#define for_each_visible_inode(_c, _s, _w, _snapshot, _i)				\
	for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr &&	\
	     (_i)->snapshot <= (_snapshot); _i++)					\
		if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))

/* One version of an inode, as it exists in a particular snapshot */
struct inode_walker_entry {
	struct bch_inode_unpacked inode;
	u32			snapshot;
	bool			seen_this_pos;	/* key seen at walker's last_pos */
	u64			count;		/* caller-defined accumulator (e.g. sectors) */
};

/* Walks all snapshot versions of one inode number at a time */
struct inode_walker {
	bool				first_this_inode;
	bool				recalculate_sums;
	struct bpos			last_pos;	/* last key position processed */

	DARRAY(struct inode_walker_entry) inodes;
};
625 
/* Free the walker's inode list */
static void inode_walker_exit(struct inode_walker *w)
{
	darray_exit(&w->inodes);
}

/* Return an empty, zero-initialized walker */
static struct inode_walker inode_walker_init(void)
{
	return (struct inode_walker) { 0, };
}
635 
/*
 * Unpack @inode and append it to the walker's list.
 *
 * BUG_ON() unpack failure: callers only pass keys for which bkey_is_inode()
 * holds, and those must unpack. Returns 0 or an allocation error from
 * darray_push().
 */
static int add_inode(struct bch_fs *c, struct inode_walker *w,
		     struct bkey_s_c inode)
{
	struct bch_inode_unpacked u;

	BUG_ON(bch2_inode_unpack(inode, &u));

	return darray_push(&w->inodes, ((struct inode_walker_entry) {
		.inode		= u,
		.snapshot	= inode.k->p.snapshot,
	}));
}
648 
649 static int get_inodes_all_snapshots(struct btree_trans *trans,
650 				    struct inode_walker *w, u64 inum)
651 {
652 	struct bch_fs *c = trans->c;
653 	struct btree_iter iter;
654 	struct bkey_s_c k;
655 	int ret;
656 
657 	w->recalculate_sums = false;
658 	w->inodes.nr = 0;
659 
660 	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
661 				     BTREE_ITER_all_snapshots, k, ret) {
662 		if (k.k->p.offset != inum)
663 			break;
664 
665 		if (bkey_is_inode(k.k))
666 			add_inode(c, w, k);
667 	}
668 	bch2_trans_iter_exit(trans, &iter);
669 
670 	if (ret)
671 		return ret;
672 
673 	w->first_this_inode = true;
674 	return 0;
675 }
676 
/*
 * Find the walker entry whose snapshot is visible from @k's snapshot.
 *
 * If the match is an ancestor snapshot (not an exact match) and @k isn't a
 * whiteout, that's unexpected — the inode should have been updated whenever
 * a key in it was — so a copy of the ancestor's entry is inserted for @k's
 * snapshot, keeping the list sorted by snapshot ID.
 *
 * Returns the matching entry, NULL if no version of the inode is visible,
 * or an ERR_PTR on allocation failure.
 */
static struct inode_walker_entry *
lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k)
{
	bool is_whiteout = k.k->type == KEY_TYPE_whiteout;

	struct inode_walker_entry *i;
	__darray_for_each(w->inodes, i)
		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot))
			goto found;

	return NULL;
found:
	BUG_ON(k.k->p.snapshot > i->snapshot);

	if (k.k->p.snapshot != i->snapshot && !is_whiteout) {
		struct inode_walker_entry new = *i;

		new.snapshot = k.k->p.snapshot;
		new.count = 0;

		struct printbuf buf = PRINTBUF;
		bch2_bkey_val_to_text(&buf, c, k);

		bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
			 "unexpected because we should always update the inode when we update a key in that inode\n"
			 "%s",
			 w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf);
		printbuf_exit(&buf);

		/* back up to the sorted insertion point for the new snapshot */
		while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot)
			--i;

		size_t pos = i - w->inodes.data;
		int ret = darray_insert_item(&w->inodes, pos, new);
		if (ret)
			return ERR_PTR(ret);

		/* the insert may have reallocated the darray; recompute i */
		i = w->inodes.data + pos;
	}

	return i;
}
719 
/*
 * Advance the walker to key @k, (re)loading all snapshot versions of the
 * inode when we move to a new inode number, and return the entry visible
 * from @k's snapshot (NULL or ERR_PTR on failure, as with
 * lookup_inode_for_snapshot()).
 */
static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
					     struct inode_walker *w,
					     struct bkey_s_c k)
{
	if (w->last_pos.inode != k.k->p.inode) {
		int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
		if (ret)
			return ERR_PTR(ret);
	} else if (bkey_cmp(w->last_pos, k.k->p)) {
		/* same inode, new position: reset per-position flags */
		darray_for_each(w->inodes, i)
			i->seen_this_pos = false;
	}

	w->last_pos = k.k->p;

	return lookup_inode_for_snapshot(trans->c, w, k);
}
737 
738 static int get_visible_inodes(struct btree_trans *trans,
739 			      struct inode_walker *w,
740 			      struct snapshots_seen *s,
741 			      u64 inum)
742 {
743 	struct bch_fs *c = trans->c;
744 	struct btree_iter iter;
745 	struct bkey_s_c k;
746 	int ret;
747 
748 	w->inodes.nr = 0;
749 
750 	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
751 			   BTREE_ITER_all_snapshots, k, ret) {
752 		if (k.k->p.offset != inum)
753 			break;
754 
755 		if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot))
756 			continue;
757 
758 		if (bkey_is_inode(k.k))
759 			add_inode(c, w, k);
760 
761 		if (k.k->p.snapshot >= s->pos.snapshot)
762 			break;
763 	}
764 	bch2_trans_iter_exit(trans, &iter);
765 
766 	return ret;
767 }
768 
/*
 * Re-insert a hash-table key that's at the wrong offset: delete it at its
 * current position and re-hash it into the correct slot, then commit.
 */
static int hash_redo_key(struct btree_trans *trans,
			 const struct bch_hash_desc desc,
			 struct bch_hash_info *hash_info,
			 struct btree_iter *k_iter, struct bkey_s_c k)
{
	struct bkey_i *delete;
	struct bkey_i *tmp;

	delete = bch2_trans_kmalloc(trans, sizeof(*delete));
	if (IS_ERR(delete))
		return PTR_ERR(delete);

	/* mutable copy of @k to reinsert at the correct hash position */
	tmp = bch2_bkey_make_mut_noupdate(trans, k);
	if (IS_ERR(tmp))
		return PTR_ERR(tmp);

	bkey_init(&delete->k);
	delete->k.p = k_iter->pos;
	return  bch2_btree_iter_traverse(k_iter) ?:
		bch2_trans_update(trans, k_iter, delete, 0) ?:
		bch2_hash_set_in_snapshot(trans, desc, hash_info,
				       (subvol_inum) { 0, k.k->p.inode },
				       k.k->p.snapshot, tmp,
				       STR_HASH_must_create|
				       BTREE_UPDATE_internal_snapshot_node) ?:
		bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
796 
/*
 * Verify that @hash_k sits at (or after) the slot its hash maps to, with no
 * duplicate between the hashed slot and its actual position.
 *
 * If the key is before its hashed slot, or an empty slot is found before
 * reaching it, the key is at the wrong offset and gets re-hashed via
 * hash_redo_key(). Duplicate keys are deleted.
 *
 * Returns 0 if OK, >0 if a duplicate was deleted,
 * -BCH_ERR_transaction_restart_nested after a successful re-hash, or a
 * negative error.
 */
static int hash_check_key(struct btree_trans *trans,
			  const struct bch_hash_desc desc,
			  struct bch_hash_info *hash_info,
			  struct btree_iter *k_iter, struct bkey_s_c hash_k)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter = { NULL };
	struct printbuf buf = PRINTBUF;
	struct bkey_s_c k;
	u64 hash;
	int ret = 0;

	if (hash_k.k->type != desc.key_type)
		return 0;

	hash = desc.hash_bkey(hash_info, hash_k);

	/* key exactly at its hashed slot: nothing to check */
	if (likely(hash == hash_k.k->p.offset))
		return 0;

	/* key before its hashed slot is always wrong */
	if (hash_k.k->p.offset < hash)
		goto bad_hash;

	/* scan from the hashed slot forward to the key's actual position */
	for_each_btree_key_norestart(trans, iter, desc.btree_id,
				     SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
				     BTREE_ITER_slots, k, ret) {
		if (bkey_eq(k.k->p, hash_k.k->p))
			break;

		if (fsck_err_on(k.k->type == desc.key_type &&
				!desc.cmp_bkey(k, hash_k), c,
				hash_table_key_duplicate,
				"duplicate hash table keys:\n%s",
				(printbuf_reset(&buf),
				 bch2_bkey_val_to_text(&buf, c, hash_k),
				 buf.buf))) {
			ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
			break;
		}

		/* a hole before the key means the probe chain is broken */
		if (bkey_deleted(k.k)) {
			bch2_trans_iter_exit(trans, &iter);
			goto bad_hash;
		}
	}
out:
	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&buf);
	return ret;
bad_hash:
	if (fsck_err(c, hash_table_key_wrong_offset,
		     "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
		     bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
		     (printbuf_reset(&buf),
		      bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
		ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
		bch_err_fn(c, ret);
		if (ret)
			return ret;
		ret = -BCH_ERR_transaction_restart_nested;
	}
fsck_err:
	goto out;
}
861 
/* Fetch the dirent at @pos; k field is an ERR_PTR on failure. */
static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
						struct btree_iter *iter,
						struct bpos pos)
{
	return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
}
868 
/*
 * Fetch the dirent that @inode's backpointer refers to. For subvolume root
 * inodes, *snapshot is first resolved via the parent subvolume, since the
 * dirent lives in the parent's snapshot.
 */
static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
					       struct btree_iter *iter,
					       struct bch_inode_unpacked *inode,
					       u32 *snapshot)
{
	if (inode->bi_subvol) {
		u64 inum;
		int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
		if (ret)
			return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) });
	}

	return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
}
883 
884 static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
885 				   struct bkey_s_c_dirent d)
886 {
887 	return  inode->bi_dir		== d.k->p.inode &&
888 		inode->bi_dir_offset	== d.k->p.offset;
889 }
890 
891 static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
892 				   struct bch_inode_unpacked *inode)
893 {
894 	return d.v->d_type == DT_SUBVOL
895 		? le32_to_cpu(d.v->d_child_subvol)	== inode->bi_subvol
896 		: le64_to_cpu(d.v->d_inum)		== inode->bi_inum;
897 }
898 
/*
 * Returns 1 if inode position @p is on the deleted-inodes list, 0 if not,
 * or a negative error.
 */
static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
	int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
907 
/*
 * Verify that @inode's backpointer refers to a dirent that points back at
 * this inode.
 *
 * If the dirent is missing or points elsewhere, the backpointer fields are
 * cleared and *write_inode is set so the caller rewrites the inode; the real
 * backpointer will be restored by check_dirents(), or the inode reattached
 * by check_path().
 */
static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c inode_k,
				    struct bch_inode_unpacked *inode,
				    u32 inode_snapshot, bool *write_inode)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;

	struct btree_iter dirent_iter = {};
	struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
	int ret = bkey_err(d);
	/* ENOENT (dangling backpointer) is handled below, not returned */
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (fsck_err_on(ret,
			c, inode_points_to_missing_dirent,
			"inode points to missing dirent\n%s",
			(bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) ||
	    fsck_err_on(!ret && !dirent_points_to_inode(d, inode),
			c, inode_points_to_wrong_dirent,
			"inode points to dirent that does not point back:\n%s",
			(bch2_bkey_val_to_text(&buf, c, inode_k),
			 prt_newline(&buf),
			 bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
		/*
		 * We just clear the backpointer fields for now. If we find a
		 * dirent that points to this inode in check_dirents(), we'll
		 * update it then; then when we get to check_path() if the
		 * backpointer is still 0 we'll reattach it.
		 */
		inode->bi_dir = 0;
		inode->bi_dir_offset = 0;
		inode->bi_flags &= ~BCH_INODE_backptr_untrusted;
		*write_inode = true;
	}

	ret = 0;
fsck_err:
	bch2_trans_iter_exit(trans, &dirent_iter);
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}
950 
951 static int check_inode(struct btree_trans *trans,
952 		       struct btree_iter *iter,
953 		       struct bkey_s_c k,
954 		       struct bch_inode_unpacked *prev,
955 		       struct snapshots_seen *s,
956 		       bool full)
957 {
958 	struct bch_fs *c = trans->c;
959 	struct bch_inode_unpacked u;
960 	bool do_update = false;
961 	int ret;
962 
963 	ret = bch2_check_key_has_snapshot(trans, iter, k);
964 	if (ret < 0)
965 		goto err;
966 	if (ret)
967 		return 0;
968 
969 	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
970 	if (ret)
971 		goto err;
972 
973 	if (!bkey_is_inode(k.k))
974 		return 0;
975 
976 	BUG_ON(bch2_inode_unpack(k, &u));
977 
978 	if (!full &&
979 	    !(u.bi_flags & (BCH_INODE_i_size_dirty|
980 			    BCH_INODE_i_sectors_dirty|
981 			    BCH_INODE_unlinked)))
982 		return 0;
983 
984 	if (prev->bi_inum != u.bi_inum)
985 		*prev = u;
986 
987 	if (fsck_err_on(prev->bi_hash_seed	!= u.bi_hash_seed ||
988 			inode_d_type(prev)	!= inode_d_type(&u),
989 			c, inode_snapshot_mismatch,
990 			"inodes in different snapshots don't match")) {
991 		bch_err(c, "repair not implemented yet");
992 		return -BCH_ERR_fsck_repair_unimplemented;
993 	}
994 
995 	if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) &&
996 	    bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) {
997 		struct bpos new_min_pos;
998 
999 		ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos);
1000 		if (ret)
1001 			goto err;
1002 
1003 		u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked;
1004 
1005 		ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
1006 
1007 		bch_err_msg(c, ret, "in fsck updating inode");
1008 		if (ret)
1009 			return ret;
1010 
1011 		if (!bpos_eq(new_min_pos, POS_MIN))
1012 			bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos));
1013 		return 0;
1014 	}
1015 
1016 	if (u.bi_flags & BCH_INODE_unlinked) {
1017 		ret = check_inode_deleted_list(trans, k.k->p);
1018 		if (ret < 0)
1019 			return ret;
1020 
1021 		fsck_err_on(!ret, c, unlinked_inode_not_on_deleted_list,
1022 			    "inode %llu:%u unlinked, but not on deleted list",
1023 			    u.bi_inum, k.k->p.snapshot);
1024 		ret = 0;
1025 	}
1026 
1027 	if (u.bi_flags & BCH_INODE_unlinked &&
1028 	    (!c->sb.clean ||
1029 	     fsck_err(c, inode_unlinked_but_clean,
1030 		      "filesystem marked clean, but inode %llu unlinked",
1031 		      u.bi_inum))) {
1032 		ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
1033 		bch_err_msg(c, ret, "in fsck deleting inode");
1034 		return ret;
1035 	}
1036 
1037 	if (u.bi_flags & BCH_INODE_i_size_dirty &&
1038 	    (!c->sb.clean ||
1039 	     fsck_err(c, inode_i_size_dirty_but_clean,
1040 		      "filesystem marked clean, but inode %llu has i_size dirty",
1041 		      u.bi_inum))) {
1042 		bch_verbose(c, "truncating inode %llu", u.bi_inum);
1043 
1044 		/*
1045 		 * XXX: need to truncate partial blocks too here - or ideally
1046 		 * just switch units to bytes and that issue goes away
1047 		 */
1048 		ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
1049 				SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
1050 				     iter->pos.snapshot),
1051 				POS(u.bi_inum, U64_MAX),
1052 				0, NULL);
1053 		bch_err_msg(c, ret, "in fsck truncating inode");
1054 		if (ret)
1055 			return ret;
1056 
1057 		/*
1058 		 * We truncated without our normal sector accounting hook, just
1059 		 * make sure we recalculate it:
1060 		 */
1061 		u.bi_flags |= BCH_INODE_i_sectors_dirty;
1062 
1063 		u.bi_flags &= ~BCH_INODE_i_size_dirty;
1064 		do_update = true;
1065 	}
1066 
1067 	if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
1068 	    (!c->sb.clean ||
1069 	     fsck_err(c, inode_i_sectors_dirty_but_clean,
1070 		      "filesystem marked clean, but inode %llu has i_sectors dirty",
1071 		      u.bi_inum))) {
1072 		s64 sectors;
1073 
1074 		bch_verbose(c, "recounting sectors for inode %llu",
1075 			    u.bi_inum);
1076 
1077 		sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
1078 		if (sectors < 0) {
1079 			bch_err_msg(c, sectors, "in fsck recounting inode sectors");
1080 			return sectors;
1081 		}
1082 
1083 		u.bi_sectors = sectors;
1084 		u.bi_flags &= ~BCH_INODE_i_sectors_dirty;
1085 		do_update = true;
1086 	}
1087 
1088 	if (u.bi_flags & BCH_INODE_backptr_untrusted) {
1089 		u.bi_dir = 0;
1090 		u.bi_dir_offset = 0;
1091 		u.bi_flags &= ~BCH_INODE_backptr_untrusted;
1092 		do_update = true;
1093 	}
1094 
1095 	if (u.bi_dir || u.bi_dir_offset) {
1096 		ret = check_inode_dirent_inode(trans, k, &u, k.k->p.snapshot, &do_update);
1097 		if (ret)
1098 			goto err;
1099 	}
1100 
1101 	if (fsck_err_on(u.bi_parent_subvol &&
1102 			(u.bi_subvol == 0 ||
1103 			 u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
1104 			c, inode_bi_parent_nonzero,
1105 			"inode %llu:%u has subvol %u but nonzero parent subvol %u",
1106 			u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
1107 		u.bi_parent_subvol = 0;
1108 		do_update = true;
1109 	}
1110 
1111 	if (u.bi_subvol) {
1112 		struct bch_subvolume s;
1113 
1114 		ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s);
1115 		if (ret && !bch2_err_matches(ret, ENOENT))
1116 			goto err;
1117 
1118 		if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
1119 			ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum);
1120 			goto do_update;
1121 		}
1122 
1123 		if (fsck_err_on(ret,
1124 				c, inode_bi_subvol_missing,
1125 				"inode %llu:%u bi_subvol points to missing subvolume %u",
1126 				u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
1127 		    fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
1128 				!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
1129 							   k.k->p.snapshot),
1130 				c, inode_bi_subvol_wrong,
1131 				"inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
1132 				u.bi_inum, k.k->p.snapshot, u.bi_subvol,
1133 				le64_to_cpu(s.inode),
1134 				le32_to_cpu(s.snapshot))) {
1135 			u.bi_subvol = 0;
1136 			u.bi_parent_subvol = 0;
1137 			do_update = true;
1138 		}
1139 	}
1140 do_update:
1141 	if (do_update) {
1142 		ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
1143 		bch_err_msg(c, ret, "in fsck updating inode");
1144 		if (ret)
1145 			return ret;
1146 	}
1147 err:
1148 fsck_err:
1149 	bch_err_fn(c, ret);
1150 	return ret;
1151 }
1152 
/*
 * Walk the inodes btree across all snapshots, checking/repairing each inode
 * with check_inode(); @prev carries the previously seen inode so versions of
 * the same inode in different snapshots can be compared, @s tracks snapshots
 * seen so far.
 */
int bch2_check_inodes(struct bch_fs *c)
{
	/* full fsck mode enables the more expensive checks in check_inode() */
	bool full = c->opts.fsck;
	struct bch_inode_unpacked prev = { 0 };
	struct snapshots_seen s;

	snapshots_seen_init(&s);

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
				POS_MIN,
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_inode(trans, &iter, k, &prev, &s, full)));

	snapshots_seen_exit(&s);
	bch_err_fn(c, ret);
	return ret;
}
1172 
1173 static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
1174 {
1175 	struct bch_fs *c = trans->c;
1176 	int ret = 0;
1177 	s64 count2;
1178 
1179 	darray_for_each(w->inodes, i) {
1180 		if (i->inode.bi_sectors == i->count)
1181 			continue;
1182 
1183 		count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);
1184 
1185 		if (w->recalculate_sums)
1186 			i->count = count2;
1187 
1188 		if (i->count != count2) {
1189 			bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
1190 					    w->last_pos.inode, i->snapshot, i->count, count2);
1191 			return -BCH_ERR_internal_fsck_err;
1192 		}
1193 
1194 		if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
1195 				c, inode_i_sectors_wrong,
1196 				"inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
1197 				w->last_pos.inode, i->snapshot,
1198 				i->inode.bi_sectors, i->count)) {
1199 			i->inode.bi_sectors = i->count;
1200 			ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
1201 			if (ret)
1202 				break;
1203 		}
1204 	}
1205 fsck_err:
1206 	bch_err_fn(c, ret);
1207 	return ret;
1208 }
1209 
1210 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
1211 {
1212 	u32 restart_count = trans->restart_count;
1213 	return check_i_sectors_notnested(trans, w) ?:
1214 		trans_was_restarted(trans, restart_count);
1215 }
1216 
/*
 * Records where an extent ends in a given snapshot, along with a copy of the
 * snapshots that were visible when it was processed - used for detecting
 * overlapping extents:
 */
struct extent_end {
	u32			snapshot;
	u64			offset;
	struct snapshots_seen	seen;
};

/* Extent ends for the current inode: one entry per snapshot, sorted by snapshot */
struct extent_ends {
	struct bpos			last_pos;	/* position of last extent processed */
	DARRAY(struct extent_end)	e;
};
1227 
1228 static void extent_ends_reset(struct extent_ends *extent_ends)
1229 {
1230 	darray_for_each(extent_ends->e, i)
1231 		snapshots_seen_exit(&i->seen);
1232 	extent_ends->e.nr = 0;
1233 }
1234 
/* Tear down @extent_ends completely: free per-entry state, then the darray itself */
static void extent_ends_exit(struct extent_ends *extent_ends)
{
	extent_ends_reset(extent_ends);
	darray_exit(&extent_ends->e);
}
1240 
/* Zero-initialize; extent_ends_exit() is safe to call at any point afterwards */
static void extent_ends_init(struct extent_ends *extent_ends)
{
	memset(extent_ends, 0, sizeof(*extent_ends));
}
1245 
/*
 * Record where extent @k ends for later overlap checks against subsequent
 * extents in the same inode.  One entry is kept per snapshot, sorted by
 * snapshot id; a private copy is made of @seen (the snapshots visible when
 * @k was processed).
 */
static int extent_ends_at(struct bch_fs *c,
			  struct extent_ends *extent_ends,
			  struct snapshots_seen *seen,
			  struct bkey_s_c k)
{
	struct extent_end *i, n = (struct extent_end) {
		.offset		= k.k->p.offset,
		.snapshot	= k.k->p.snapshot,
		.seen		= *seen,
	};

	/* deep-copy the snapshot id list so @n owns its own allocation: */
	n.seen.ids.data = kmemdup(seen->ids.data,
			      sizeof(seen->ids.data[0]) * seen->ids.size,
			      GFP_KERNEL);
	if (!n.seen.ids.data)
		return -BCH_ERR_ENOMEM_fsck_extent_ends_at;

	__darray_for_each(extent_ends->e, i) {
		/* existing entry for this snapshot - replace it in place: */
		if (i->snapshot == k.k->p.snapshot) {
			snapshots_seen_exit(&i->seen);
			*i = n;
			return 0;
		}

		/* found the insertion point that keeps the array sorted: */
		if (i->snapshot >= k.k->p.snapshot)
			break;
	}

	return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
}
1276 
/*
 * A pair of overlapping extents was detected at (@pos1, @pos2): re-find both
 * keys and repair by overwriting the extent in the older snapshot (or the
 * first extent, when the snapshots are equal) with the other.
 *
 * @fixed is set if a repair was done; @extent_end is updated so the caller's
 * overlap bookkeeping stays correct after the overwrite.
 *
 * Returns -BCH_ERR_transaction_restart_nested when the second extent was
 * modified, as the caller must then reprocess it from the top.
 */
static int overlapping_extents_found(struct btree_trans *trans,
				     enum btree_id btree,
				     struct bpos pos1, struct snapshots_seen *pos1_seen,
				     struct bkey pos2,
				     bool *fixed,
				     struct extent_end *extent_end)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	struct btree_iter iter1, iter2 = { NULL };
	struct bkey_s_c k1, k2;
	int ret;

	/* caller guarantees the first extent starts strictly before the second: */
	BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));

	/* re-lookup the first extent by its exact position: */
	bch2_trans_iter_init(trans, &iter1, btree, pos1,
			     BTREE_ITER_all_snapshots|
			     BTREE_ITER_not_extents);
	k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX));
	ret = bkey_err(k1);
	if (ret)
		goto err;

	prt_str(&buf, "\n  ");
	bch2_bkey_val_to_text(&buf, c, k1);

	if (!bpos_eq(pos1, k1.k->p)) {
		/* the extent we were told about is no longer there - can't repair: */
		prt_str(&buf, "\n  wanted\n  ");
		bch2_bpos_to_text(&buf, pos1);
		prt_str(&buf, "\n  ");
		bch2_bkey_to_text(&buf, &pos2);

		bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
			__func__, buf.buf);
		ret = -BCH_ERR_internal_fsck_err;
		goto err;
	}

	/* walk forward from the first extent to re-find the second: */
	bch2_trans_copy_iter(&iter2, &iter1);

	while (1) {
		bch2_btree_iter_advance(&iter2);

		k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX));
		ret = bkey_err(k2);
		if (ret)
			goto err;

		if (bpos_ge(k2.k->p, pos2.p))
			break;
	}

	prt_str(&buf, "\n  ");
	bch2_bkey_val_to_text(&buf, c, k2);

	if (bpos_gt(k2.k->p, pos2.p) ||
	    pos2.size != k2.k->size) {
		/* second extent changed since it was reported - can't repair: */
		bch_err(c, "%s: error finding seconding overlapping extent when repairing%s",
			__func__, buf.buf);
		ret = -BCH_ERR_internal_fsck_err;
		goto err;
	}

	prt_printf(&buf, "\n  overwriting %s extent",
		   pos1.snapshot >= pos2.p.snapshot ? "first" : "second");

	if (fsck_err(c, extent_overlapping,
		     "overlapping extents%s", buf.buf)) {
		/* overwrite whichever extent belongs to the older snapshot: */
		struct btree_iter *old_iter = &iter1;
		struct disk_reservation res = { 0 };

		if (pos1.snapshot < pos2.p.snapshot) {
			old_iter = &iter2;
			swap(k1, k2);
		}

		trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);

		ret =   bch2_trans_update_extent_overwrite(trans, old_iter,
				BTREE_UPDATE_internal_snapshot_node,
				k1, k2) ?:
			bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
		bch2_disk_reservation_put(c, &res);

		if (ret)
			goto err;

		*fixed = true;

		if (pos1.snapshot == pos2.p.snapshot) {
			/*
			 * We overwrote the first extent, and did the overwrite
			 * in the same snapshot:
			 */
			extent_end->offset = bkey_start_offset(&pos2);
		} else if (pos1.snapshot > pos2.p.snapshot) {
			/*
			 * We overwrote the first extent in pos2's snapshot:
			 */
			ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
		} else {
			/*
			 * We overwrote the second extent - restart
			 * check_extent() from the top:
			 */
			ret = -BCH_ERR_transaction_restart_nested;
		}
	}
fsck_err:
err:
	bch2_trans_iter_exit(trans, &iter2);
	bch2_trans_iter_exit(trans, &iter1);
	printbuf_exit(&buf);
	return ret;
}
1392 
/*
 * Check extent @k against the recorded ends of previously seen extents in
 * the same inode; any recorded end past @k's start in a mutually visible
 * snapshot is an overlap, repaired via overlapping_extents_found().
 */
static int check_overlapping_extents(struct btree_trans *trans,
			      struct snapshots_seen *seen,
			      struct extent_ends *extent_ends,
			      struct bkey_s_c k,
			      struct btree_iter *iter,
			      bool *fixed)
{
	struct bch_fs *c = trans->c;
	int ret = 0;

	/* transaction restart, running again */
	if (bpos_eq(extent_ends->last_pos, k.k->p))
		return 0;

	/* new inode - the previous inode's extent ends are irrelevant: */
	if (extent_ends->last_pos.inode != k.k->p.inode)
		extent_ends_reset(extent_ends);

	darray_for_each(extent_ends->e, i) {
		if (i->offset <= bkey_start_offset(k.k))
			continue;

		/* only an overlap if the two extents are visible to each other: */
		if (!ref_visible2(c,
				  k.k->p.snapshot, seen,
				  i->snapshot, &i->seen))
			continue;

		ret = overlapping_extents_found(trans, iter->btree_id,
						SPOS(iter->pos.inode,
						     i->offset,
						     i->snapshot),
						&i->seen,
						*k.k, fixed, i);
		if (ret)
			goto err;
	}

	extent_ends->last_pos = k.k->p;
err:
	return ret;
}
1433 
1434 static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
1435 				struct bkey_s_c k)
1436 {
1437 	struct bch_fs *c = trans->c;
1438 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1439 	struct bch_extent_crc_unpacked crc;
1440 	const union bch_extent_entry *i;
1441 	unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;
1442 
1443 	bkey_for_each_crc(k.k, ptrs, crc, i)
1444 		if (crc_is_encoded(crc) &&
1445 		    crc.uncompressed_size > encoded_extent_max_sectors) {
1446 			struct printbuf buf = PRINTBUF;
1447 
1448 			bch2_bkey_val_to_text(&buf, c, k);
1449 			bch_err(c, "overbig encoded extent, please report this:\n  %s", buf.buf);
1450 			printbuf_exit(&buf);
1451 		}
1452 
1453 	return 0;
1454 }
1455 
/*
 * Check a single extent: verify it has a valid snapshot, belongs to a
 * regular file or symlink inode, doesn't overlap other visible extents, and
 * doesn't extend past i_size; also accumulates sector counts into @inode for
 * the later i_sectors consistency check.
 */
static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
			struct bkey_s_c k,
			struct inode_walker *inode,
			struct snapshots_seen *s,
			struct extent_ends *extent_ends)
{
	struct bch_fs *c = trans->c;
	struct inode_walker_entry *i;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret) {
		/* positive return means the key was deleted - not an error: */
		ret = ret < 0 ? ret : 0;
		goto out;
	}

	if (inode->last_pos.inode != k.k->p.inode) {
		/* finished the previous inode - verify its i_sectors now: */
		ret = check_i_sectors(trans, inode);
		if (ret)
			goto err;
	}

	i = walk_inode(trans, inode, k);
	ret = PTR_ERR_OR_ZERO(i);
	if (ret)
		goto err;

	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
	if (ret)
		goto err;

	if (k.k->type != KEY_TYPE_whiteout) {
		/* no inode, but the inodes btree was damaged: reconstruct it */
		if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
			ret =   reconstruct_reg_inode(trans, k.k->p.snapshot, k.k->p.inode) ?:
				bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
			if (ret)
				goto err;

			inode->last_pos.inode--;
			ret = -BCH_ERR_transaction_restart_nested;
			goto err;
		}

		if (fsck_err_on(!i, c, extent_in_missing_inode,
				"extent in missing inode:\n  %s",
				(printbuf_reset(&buf),
				 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
			goto delete;

		if (fsck_err_on(i &&
				!S_ISREG(i->inode.bi_mode) &&
				!S_ISLNK(i->inode.bi_mode),
				c, extent_in_non_reg_inode,
				"extent in non regular inode mode %o:\n  %s",
				i->inode.bi_mode,
				(printbuf_reset(&buf),
				 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
			goto delete;

		ret = check_overlapping_extents(trans, s, extent_ends, k, iter,
						&inode->recalculate_sums);
		if (ret)
			goto err;
	}

	/*
	 * Check inodes in reverse order, from oldest snapshots to newest,
	 * starting from the inode that matches this extent's snapshot. If we
	 * didn't have one, iterate over all inodes:
	 */
	if (!i)
		i = inode->inodes.data + inode->inodes.nr - 1;

	for (;
	     inode->inodes.data && i >= inode->inodes.data;
	     --i) {
		if (i->snapshot > k.k->p.snapshot ||
		    !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
			continue;

		if (k.k->type != KEY_TYPE_whiteout) {
			if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) &&
					k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
					!bkey_extent_is_reservation(k),
					c, extent_past_end_of_inode,
					"extent type past end of inode %llu:%u, i_size %llu\n  %s",
					i->inode.bi_inum, i->snapshot, i->inode.bi_size,
					(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
				struct btree_iter iter2;

				/* delete the extent in this inode's snapshot only: */
				bch2_trans_copy_iter(&iter2, iter);
				bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
				ret =   bch2_btree_iter_traverse(&iter2) ?:
					bch2_btree_delete_at(trans, &iter2,
						BTREE_UPDATE_internal_snapshot_node);
				bch2_trans_iter_exit(trans, &iter2);
				if (ret)
					goto err;

				iter->k.type = KEY_TYPE_whiteout;
			}

			/* accumulate for the i_sectors consistency check: */
			if (bkey_extent_is_allocation(k.k))
				i->count += k.k->size;
		}

		i->seen_this_pos = true;
	}

	if (k.k->type != KEY_TYPE_whiteout) {
		/* remember where this extent ends, for overlap detection: */
		ret = extent_ends_at(c, extent_ends, s, k);
		if (ret)
			goto err;
	}
out:
err:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
delete:
	ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
	goto out;
}
1581 
1582 /*
1583  * Walk extents: verify that extents have a corresponding S_ISREG inode, and
1584  * that i_size an i_sectors are consistent
1585  */
1586 int bch2_check_extents(struct bch_fs *c)
1587 {
1588 	struct inode_walker w = inode_walker_init();
1589 	struct snapshots_seen s;
1590 	struct extent_ends extent_ends;
1591 	struct disk_reservation res = { 0 };
1592 
1593 	snapshots_seen_init(&s);
1594 	extent_ends_init(&extent_ends);
1595 
1596 	int ret = bch2_trans_run(c,
1597 		for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
1598 				POS(BCACHEFS_ROOT_INO, 0),
1599 				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
1600 				&res, NULL,
1601 				BCH_TRANS_COMMIT_no_enospc, ({
1602 			bch2_disk_reservation_put(c, &res);
1603 			check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
1604 			check_extent_overbig(trans, &iter, k);
1605 		})) ?:
1606 		check_i_sectors_notnested(trans, &w));
1607 
1608 	bch2_disk_reservation_put(c, &res);
1609 	extent_ends_exit(&extent_ends);
1610 	inode_walker_exit(&w);
1611 	snapshots_seen_exit(&s);
1612 
1613 	bch_err_fn(c, ret);
1614 	return ret;
1615 }
1616 
/* Walk the reflink btree, reporting overbig encoded indirect extents */
int bch2_check_indirect_extents(struct bch_fs *c)
{
	struct disk_reservation res = { 0 };

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
				POS_MIN,
				BTREE_ITER_prefetch, k,
				&res, NULL,
				BCH_TRANS_COMMIT_no_enospc, ({
			/* drop the previous iteration's reservation before rechecking: */
			bch2_disk_reservation_put(c, &res);
			check_extent_overbig(trans, &iter, k);
		})));

	bch2_disk_reservation_put(c, &res);
	bch_err_fn(c, ret);
	return ret;
}
1635 
/*
 * Verify each walked directory inode's i_nlink against the subdirectory
 * count accumulated while walking its dirents; recount on mismatch and
 * repair the inode when the recount confirms i_nlink is wrong.
 */
static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w)
{
	struct bch_fs *c = trans->c;
	int ret = 0;
	s64 count2;

	darray_for_each(w->inodes, i) {
		if (i->inode.bi_nlink == i->count)
			continue;

		count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot);
		/* negative return is an error code - propagate it: */
		if (count2 < 0)
			return count2;

		if (i->count != count2) {
			/* our walk miscounted - trust the recount: */
			bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
					    w->last_pos.inode, i->snapshot, i->count, count2);
			i->count = count2;
			if (i->inode.bi_nlink == i->count)
				continue;
		}

		if (fsck_err_on(i->inode.bi_nlink != i->count,
				c, inode_dir_wrong_nlink,
				"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
				w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
			i->inode.bi_nlink = i->count;
			ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
			if (ret)
				break;
		}
	}
fsck_err:
	bch_err_fn(c, ret);
	return ret;
}
1672 
1673 static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
1674 {
1675 	u32 restart_count = trans->restart_count;
1676 	return check_subdir_count_notnested(trans, w) ?:
1677 		trans_was_restarted(trans, restart_count);
1678 }
1679 
/*
 * Check that the inode a dirent points to points back at this dirent via
 * bi_dir/bi_dir_offset, repairing whichever side is wrong; also handles
 * directories/subvolumes reached by more than one dirent (not allowed) and
 * hardlinked files incorrectly flagged unlinked.
 */
noinline_for_stack
static int check_dirent_inode_dirent(struct btree_trans *trans,
				   struct btree_iter *iter,
				   struct bkey_s_c_dirent d,
				   struct bch_inode_unpacked *target,
				   u32 target_snapshot)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	/* backpointer already correct - nothing to do: */
	if (inode_points_to_dirent(target, d))
		return 0;

	if (bch2_inode_should_have_bp(target) &&
	    !fsck_err(c, inode_wrong_backpointer,
		      "dirent points to inode that does not point back:\n  %s",
		      (bch2_bkey_val_to_text(&buf, c, d.s_c),
		       prt_printf(&buf, "\n  "),
		       bch2_inode_unpacked_to_text(&buf, target),
		       buf.buf)))
		goto out_noiter;

	/* inode has no backpointer at all - just set it to this dirent: */
	if (!target->bi_dir &&
	    !target->bi_dir_offset) {
		target->bi_dir		= d.k->p.inode;
		target->bi_dir_offset	= d.k->p.offset;
		return __bch2_fsck_write_inode(trans, target, target_snapshot);
	}

	/* look up the dirent the inode's backpointer actually names: */
	struct btree_iter bp_iter = { NULL };
	struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
			      SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
	ret = bkey_err(bp_dirent);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	bool backpointer_exists = !ret;
	ret = 0;

	/* backpointer names a nonexistent dirent - repoint it at this one: */
	if (fsck_err_on(!backpointer_exists,
			c, inode_wrong_backpointer,
			"inode %llu:%u has wrong backpointer:\n"
			"got       %llu:%llu\n"
			"should be %llu:%llu",
			target->bi_inum, target_snapshot,
			target->bi_dir,
			target->bi_dir_offset,
			d.k->p.inode,
			d.k->p.offset)) {
		target->bi_dir		= d.k->p.inode;
		target->bi_dir_offset	= d.k->p.offset;
		ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
		goto out;
	}

	bch2_bkey_val_to_text(&buf, c, d.s_c);
	prt_newline(&buf);
	if (backpointer_exists)
		bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);

	/* directories and subvolumes may only have a single link: */
	if (fsck_err_on(backpointer_exists &&
			(S_ISDIR(target->bi_mode) ||
			 target->bi_subvol),
			c, inode_dir_multiple_links,
			"%s %llu:%u with multiple links\n%s",
			S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
			target->bi_inum, target_snapshot, buf.buf)) {
		ret = __remove_dirent(trans, d.k->p);
		goto out;
	}

	/*
	 * hardlinked file with nlink 0:
	 * We're just adjusting nlink here so check_nlinks() will pick
	 * it up, it ignores inodes with nlink 0
	 */
	if (fsck_err_on(backpointer_exists && !target->bi_nlink,
			c, inode_multiple_links_but_nlink_0,
			"inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
			target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
		target->bi_nlink++;
		target->bi_flags &= ~BCH_INODE_unlinked;
		ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
		if (ret)
			goto err;
	}
out:
err:
fsck_err:
	bch2_trans_iter_exit(trans, &bp_iter);
out_noiter:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}
1776 
/*
 * Check a dirent against the inode it points to: verify the inode's
 * backpointer via check_dirent_inode_dirent(), then repair the dirent's
 * d_type if it doesn't match the inode.
 */
noinline_for_stack
static int check_dirent_target(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct bkey_s_c_dirent d,
			       struct bch_inode_unpacked *target,
			       u32 target_snapshot)
{
	struct bch_fs *c = trans->c;
	struct bkey_i_dirent *n;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	ret = check_dirent_inode_dirent(trans, iter, d, target, target_snapshot);
	if (ret)
		goto err;

	if (fsck_err_on(d.v->d_type != inode_d_type(target),
			c, dirent_d_type_wrong,
			"incorrect d_type: got %s, should be %s:\n%s",
			bch2_d_type_str(d.v->d_type),
			bch2_d_type_str(inode_d_type(target)),
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
		n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
		ret = PTR_ERR_OR_ZERO(n);
		if (ret)
			goto err;

		bkey_reassemble(&n->k_i, d.s_c);
		n->v.d_type = inode_d_type(target);
		/* subvol dirents store subvolume ids rather than an inode number: */
		if (n->v.d_type == DT_SUBVOL) {
			n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
			n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
		} else {
			n->v.d_inum = cpu_to_le64(target->bi_inum);
		}

		ret = bch2_trans_update(trans, iter, &n->k_i, 0);
		if (ret)
			goto err;

		/* keep @d pointing at the updated key: */
		d = dirent_i_to_s_c(n);
	}
err:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}
1826 
1827 /* find a subvolume that's a descendent of @snapshot: */
1828 static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
1829 {
1830 	struct btree_iter iter;
1831 	struct bkey_s_c k;
1832 	int ret;
1833 
1834 	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) {
1835 		if (k.k->type != KEY_TYPE_subvolume)
1836 			continue;
1837 
1838 		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
1839 		if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) {
1840 			bch2_trans_iter_exit(trans, &iter);
1841 			*subvolid = k.k->p.offset;
1842 			goto found;
1843 		}
1844 	}
1845 	if (!ret)
1846 		ret = -ENOENT;
1847 found:
1848 	bch2_trans_iter_exit(trans, &iter);
1849 	return ret;
1850 }
1851 
1852 noinline_for_stack
1853 static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
1854 				  struct bkey_s_c_dirent d)
1855 {
1856 	struct bch_fs *c = trans->c;
1857 	struct btree_iter subvol_iter = {};
1858 	struct bch_inode_unpacked subvol_root;
1859 	u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
1860 	u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
1861 	u32 parent_snapshot;
1862 	u32 new_parent_subvol = 0;
1863 	u64 parent_inum;
1864 	struct printbuf buf = PRINTBUF;
1865 	int ret = 0;
1866 
1867 	ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum);
1868 	if (ret && !bch2_err_matches(ret, ENOENT))
1869 		return ret;
1870 
1871 	if (ret ||
1872 	    (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) {
1873 		int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
1874 		if (ret2 && !bch2_err_matches(ret, ENOENT))
1875 			return ret2;
1876 	}
1877 
1878 	if (ret &&
1879 	    !new_parent_subvol &&
1880 	    (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
1881 		/*
1882 		 * Couldn't find a subvol for dirent's snapshot - but we lost
1883 		 * subvols, so we need to reconstruct:
1884 		 */
1885 		ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0);
1886 		if (ret)
1887 			return ret;
1888 
1889 		parent_snapshot = d.k->p.snapshot;
1890 	}
1891 
1892 	if (fsck_err_on(ret, c, dirent_to_missing_parent_subvol,
1893 			"dirent parent_subvol points to missing subvolume\n%s",
1894 			(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
1895 	    fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
1896 			c, dirent_not_visible_in_parent_subvol,
1897 			"dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
1898 			parent_snapshot,
1899 			(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
1900 		if (!new_parent_subvol) {
1901 			bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
1902 			return -BCH_ERR_fsck_repair_unimplemented;
1903 		}
1904 
1905 		struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
1906 		ret = PTR_ERR_OR_ZERO(new_dirent);
1907 		if (ret)
1908 			goto err;
1909 
1910 		new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol);
1911 	}
1912 
1913 	struct bkey_s_c_subvolume s =
1914 		bch2_bkey_get_iter_typed(trans, &subvol_iter,
1915 					 BTREE_ID_subvolumes, POS(0, target_subvol),
1916 					 0, subvolume);
1917 	ret = bkey_err(s.s_c);
1918 	if (ret && !bch2_err_matches(ret, ENOENT))
1919 		return ret;
1920 
1921 	if (ret) {
1922 		if (fsck_err(c, dirent_to_missing_subvol,
1923 			     "dirent points to missing subvolume\n%s",
1924 			     (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
1925 			return __remove_dirent(trans, d.k->p);
1926 		ret = 0;
1927 		goto out;
1928 	}
1929 
1930 	if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol,
1931 			c, subvol_fs_path_parent_wrong,
1932 			"subvol with wrong fs_path_parent, should be be %u\n%s",
1933 			parent_subvol,
1934 			(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
1935 		struct bkey_i_subvolume *n =
1936 			bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
1937 		ret = PTR_ERR_OR_ZERO(n);
1938 		if (ret)
1939 			goto err;
1940 
1941 		n->v.fs_path_parent = cpu_to_le32(parent_subvol);
1942 	}
1943 
1944 	u64 target_inum = le64_to_cpu(s.v->inode);
1945 	u32 target_snapshot = le32_to_cpu(s.v->snapshot);
1946 
1947 	ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot);
1948 	if (ret && !bch2_err_matches(ret, ENOENT))
1949 		goto err;
1950 
1951 	if (ret) {
1952 		bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
1953 		ret = -BCH_ERR_fsck_repair_unimplemented;
1954 		ret = 0;
1955 		goto err;
1956 	}
1957 
1958 	if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol,
1959 			c, inode_bi_parent_wrong,
1960 			"subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
1961 			target_inum,
1962 			subvol_root.bi_parent_subvol, parent_subvol)) {
1963 		subvol_root.bi_parent_subvol = parent_subvol;
1964 		ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot);
1965 		if (ret)
1966 			goto err;
1967 	}
1968 
1969 	ret = check_dirent_target(trans, iter, d, &subvol_root,
1970 				  target_snapshot);
1971 	if (ret)
1972 		goto err;
1973 out:
1974 err:
1975 fsck_err:
1976 	bch2_trans_iter_exit(trans, &subvol_iter);
1977 	printbuf_exit(&buf);
1978 	return ret;
1979 }
1980 
/*
 * Check a single dirent: verify its hash/location, that it lives in an
 * existing directory inode, and that its target (inode or subvolume) exists
 * and points back correctly; also counts subdirectories for the parent's
 * i_nlink check.
 */
static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
			struct bkey_s_c k,
			struct bch_hash_info *hash_info,
			struct inode_walker *dir,
			struct inode_walker *target,
			struct snapshots_seen *s)
{
	struct bch_fs *c = trans->c;
	struct inode_walker_entry *i;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret) {
		/* positive return means the key was deleted - not an error: */
		ret = ret < 0 ? ret : 0;
		goto out;
	}

	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
	if (ret)
		goto err;

	if (k.k->type == KEY_TYPE_whiteout)
		goto out;

	if (dir->last_pos.inode != k.k->p.inode) {
		/* finished the previous directory - verify its subdir count: */
		ret = check_subdir_count(trans, dir);
		if (ret)
			goto err;
	}

	BUG_ON(!btree_iter_path(trans, iter)->should_be_locked);

	i = walk_inode(trans, dir, k);
	ret = PTR_ERR_OR_ZERO(i);
	if (ret < 0)
		goto err;

	/* hash info comes from the directory inode; set up once per directory: */
	if (dir->first_this_inode && dir->inodes.nr)
		*hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
	dir->first_this_inode = false;

	/* no directory inode, but the inodes btree was damaged: reconstruct it */
	if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
		ret =   reconstruct_inode(trans, k.k->p.snapshot, k.k->p.inode, 0, S_IFDIR) ?:
			bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
		if (ret)
			goto err;

		dir->last_pos.inode--;
		ret = -BCH_ERR_transaction_restart_nested;
		goto err;
	}

	if (fsck_err_on(!i, c, dirent_in_missing_dir_inode,
			"dirent in nonexisting directory:\n%s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		ret = bch2_btree_delete_at(trans, iter,
				BTREE_UPDATE_internal_snapshot_node);
		goto out;
	}

	if (!i)
		goto out;

	if (fsck_err_on(!S_ISDIR(i->inode.bi_mode),
			c, dirent_in_non_dir_inode,
			"dirent in non directory inode type %s:\n%s",
			bch2_d_type_str(inode_d_type(&i->inode)),
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		ret = bch2_btree_delete_at(trans, iter, 0);
		goto out;
	}

	ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k);
	if (ret < 0)
		goto err;
	if (ret) {
		/* dirent has been deleted */
		ret = 0;
		goto out;
	}

	if (k.k->type != KEY_TYPE_dirent)
		goto out;

	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);

	if (d.v->d_type == DT_SUBVOL) {
		ret = check_dirent_to_subvol(trans, iter, d);
		if (ret)
			goto err;
	} else {
		ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
		if (ret)
			goto err;

		if (fsck_err_on(!target->inodes.nr,
				c, dirent_to_missing_inode,
				"dirent points to missing inode:\n%s",
				(printbuf_reset(&buf),
				 bch2_bkey_val_to_text(&buf, c, k),
				 buf.buf))) {
			ret = __remove_dirent(trans, d.k->p);
			if (ret)
				goto err;
		}

		/* check the dirent against every visible version of its target: */
		darray_for_each(target->inodes, i) {
			ret = check_dirent_target(trans, iter, d,
						  &i->inode, i->snapshot);
			if (ret)
				goto err;
		}

		/* count subdirectories for the parent's i_nlink check: */
		if (d.v->d_type == DT_DIR)
			for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
				i->count++;
	}
out:
err:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}
2108 
2109 /*
2110  * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
2111  * validate d_type
2112  */
int bch2_check_dirents(struct bch_fs *c)
{
	struct inode_walker dir = inode_walker_init();
	struct inode_walker target = inode_walker_init();
	struct snapshots_seen s;
	struct bch_hash_info hash_info;

	snapshots_seen_init(&s);

	/*
	 * Walk every dirent in every snapshot, committing repairs as we go;
	 * then run a final subdirectory-count check for the last directory
	 * seen, since check_dirent() only checks counts when the inode
	 * number changes.
	 */
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
				POS(BCACHEFS_ROOT_INO, 0),
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
				k,
				NULL, NULL,
				BCH_TRANS_COMMIT_no_enospc,
			check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
		check_subdir_count_notnested(trans, &dir));

	snapshots_seen_exit(&s);
	inode_walker_exit(&dir);
	inode_walker_exit(&target);
	bch_err_fn(c, ret);
	return ret;
}
2138 
/*
 * Check a single xattr key: verify its snapshot is valid, that the inode it
 * belongs to exists, and that it's at the correct hashed position.
 */
static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
		       struct bkey_s_c k,
		       struct bch_hash_info *hash_info,
		       struct inode_walker *inode)
{
	struct bch_fs *c = trans->c;
	struct inode_walker_entry *i;
	int ret;

	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret < 0)
		return ret;
	if (ret)
		/* >0: nothing further to check for this key */
		return 0;

	i = walk_inode(trans, inode, k);
	ret = PTR_ERR_OR_ZERO(i);
	if (ret)
		return ret;

	/* On moving to a new inode, recompute the hash seed for hash checks: */
	if (inode->first_this_inode && inode->inodes.nr)
		*hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode);
	inode->first_this_inode = false;

	if (fsck_err_on(!i, c, xattr_in_missing_inode,
			"xattr for missing inode %llu",
			k.k->p.inode))
		return bch2_btree_delete_at(trans, iter, 0);

	if (!i)
		return 0;

	ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
fsck_err:
	bch_err_fn(c, ret);
	return ret;
}
2176 
2177 /*
2178  * Walk xattrs: verify that they all have a corresponding inode
2179  */
int bch2_check_xattrs(struct bch_fs *c)
{
	struct inode_walker inode = inode_walker_init();
	struct bch_hash_info hash_info;
	int ret = 0;

	/* Walk every xattr in every snapshot, committing repairs as we go: */
	ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
			POS(BCACHEFS_ROOT_INO, 0),
			BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
			k,
			NULL, NULL,
			BCH_TRANS_COMMIT_no_enospc,
		check_xattr(trans, &iter, k, &hash_info, &inode)));
	bch_err_fn(c, ret);
	return ret;
}
2197 
/*
 * Verify that the root subvolume and the root directory inode exist,
 * recreating them if missing or invalid.
 */
static int check_root_trans(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct bch_inode_unpacked root_inode;
	u32 snapshot;
	u64 inum;
	int ret;

	ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (mustfix_fsck_err_on(ret, c, root_subvol_missing,
				"root subvol missing")) {
		struct bkey_i_subvolume *root_subvol =
			bch2_trans_kmalloc(trans, sizeof(*root_subvol));
		ret = PTR_ERR_OR_ZERO(root_subvol);
		if (ret)
			goto err;

		/*
		 * Recreate the root subvolume pointing at the root inode;
		 * these values are also used below if the root inode itself
		 * needs to be recreated:
		 */
		snapshot	= U32_MAX;
		inum		= BCACHEFS_ROOT_INO;

		bkey_subvolume_init(&root_subvol->k_i);
		root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL;
		root_subvol->v.flags	= 0;
		root_subvol->v.snapshot	= cpu_to_le32(snapshot);
		root_subvol->v.inode	= cpu_to_le64(inum);
		ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0);
		bch_err_msg(c, ret, "writing root subvol");
		if (ret)
			goto err;
	}

	ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (mustfix_fsck_err_on(ret, c, root_dir_missing,
				"root directory missing") ||
	    mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
				c, root_inode_not_dir,
				"root inode not a directory")) {
		bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
				0, NULL);
		root_inode.bi_inum = inum;

		ret = __bch2_fsck_write_inode(trans, &root_inode, snapshot);
		bch_err_msg(c, ret, "writing root inode");
	}
err:
fsck_err:
	return ret;
}
2252 
2253 /* Get root directory, create if it doesn't exist: */
int bch2_check_root(struct bch_fs *c)
{
	/* Single transaction; check_root_trans() does its own repairs: */
	int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		check_root_trans(trans));
	bch_err_fn(c, ret);
	return ret;
}
2261 
2262 typedef DARRAY(u32) darray_u32;
2263 
2264 static bool darray_u32_has(darray_u32 *d, u32 v)
2265 {
2266 	darray_for_each(*d, i)
2267 		if (*i == v)
2268 			return true;
2269 	return false;
2270 }
2271 
2272 /*
2273  * We've checked that inode backpointers point to valid dirents; here, it's
2274  * sufficient to check that the subvolume root has a dirent:
2275  */
2276 static int subvol_has_dirent(struct btree_trans *trans, struct bkey_s_c_subvolume s)
2277 {
2278 	struct bch_inode_unpacked inode;
2279 	int ret = bch2_inode_find_by_inum_trans(trans,
2280 				(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
2281 				&inode);
2282 	if (ret)
2283 		return ret;
2284 
2285 	return inode.bi_dir != 0;
2286 }
2287 
2288 static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
2289 {
2290 	struct bch_fs *c = trans->c;
2291 	struct btree_iter parent_iter = {};
2292 	darray_u32 subvol_path = {};
2293 	struct printbuf buf = PRINTBUF;
2294 	int ret = 0;
2295 
2296 	if (k.k->type != KEY_TYPE_subvolume)
2297 		return 0;
2298 
2299 	while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) {
2300 		ret = darray_push(&subvol_path, k.k->p.offset);
2301 		if (ret)
2302 			goto err;
2303 
2304 		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
2305 
2306 		ret = subvol_has_dirent(trans, s);
2307 		if (ret < 0)
2308 			break;
2309 
2310 		if (fsck_err_on(!ret,
2311 				c, subvol_unreachable,
2312 				"unreachable subvolume %s",
2313 				(bch2_bkey_val_to_text(&buf, c, s.s_c),
2314 				 buf.buf))) {
2315 			ret = reattach_subvol(trans, s);
2316 			break;
2317 		}
2318 
2319 		u32 parent = le32_to_cpu(s.v->fs_path_parent);
2320 
2321 		if (darray_u32_has(&subvol_path, parent)) {
2322 			if (fsck_err(c, subvol_loop, "subvolume loop"))
2323 				ret = reattach_subvol(trans, s);
2324 			break;
2325 		}
2326 
2327 		bch2_trans_iter_exit(trans, &parent_iter);
2328 		bch2_trans_iter_init(trans, &parent_iter,
2329 				     BTREE_ID_subvolumes, POS(0, parent), 0);
2330 		k = bch2_btree_iter_peek_slot(&parent_iter);
2331 		ret = bkey_err(k);
2332 		if (ret)
2333 			goto err;
2334 
2335 		if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
2336 				c, subvol_unreachable,
2337 				"unreachable subvolume %s",
2338 				(bch2_bkey_val_to_text(&buf, c, s.s_c),
2339 				 buf.buf))) {
2340 			ret = reattach_subvol(trans, s);
2341 			break;
2342 		}
2343 	}
2344 fsck_err:
2345 err:
2346 	printbuf_exit(&buf);
2347 	darray_exit(&subvol_path);
2348 	bch2_trans_iter_exit(trans, &parent_iter);
2349 	return ret;
2350 }
2351 
int bch2_check_subvolume_structure(struct bch_fs *c)
{
	/* Check every subvolume is reachable from the root subvolume: */
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
				BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_subvol_path(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}
2362 
/* One ancestor on the path from an inode up to its subvolume root: */
struct pathbuf_entry {
	u64	inum;
	u32	snapshot;
};

typedef DARRAY(struct pathbuf_entry) pathbuf;
2369 
2370 static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
2371 {
2372 	darray_for_each(*p, i)
2373 		if (i->inum	== inum &&
2374 		    i->snapshot	== snapshot)
2375 			return true;
2376 	return false;
2377 }
2378 
2379 /*
2380  * Check that a given inode is reachable from its subvolume root - we already
2381  * verified subvolume connectivity:
2382  *
2383  * XXX: we should also be verifying that inodes are in the right subvolumes
2384  */
2385 static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k)
2386 {
2387 	struct bch_fs *c = trans->c;
2388 	struct btree_iter inode_iter = {};
2389 	struct bch_inode_unpacked inode;
2390 	struct printbuf buf = PRINTBUF;
2391 	u32 snapshot = inode_k.k->p.snapshot;
2392 	int ret = 0;
2393 
2394 	p->nr = 0;
2395 
2396 	BUG_ON(bch2_inode_unpack(inode_k, &inode));
2397 
2398 	while (!inode.bi_subvol) {
2399 		struct btree_iter dirent_iter;
2400 		struct bkey_s_c_dirent d;
2401 		u32 parent_snapshot = snapshot;
2402 
2403 		d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot);
2404 		ret = bkey_err(d.s_c);
2405 		if (ret && !bch2_err_matches(ret, ENOENT))
2406 			break;
2407 
2408 		if (!ret && !dirent_points_to_inode(d, &inode)) {
2409 			bch2_trans_iter_exit(trans, &dirent_iter);
2410 			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
2411 		}
2412 
2413 		if (bch2_err_matches(ret, ENOENT)) {
2414 			ret = 0;
2415 			if (fsck_err(c, inode_unreachable,
2416 				     "unreachable inode\n%s",
2417 				     (printbuf_reset(&buf),
2418 				      bch2_bkey_val_to_text(&buf, c, inode_k),
2419 				      buf.buf)))
2420 				ret = reattach_inode(trans, &inode, snapshot);
2421 			goto out;
2422 		}
2423 
2424 		bch2_trans_iter_exit(trans, &dirent_iter);
2425 
2426 		if (!S_ISDIR(inode.bi_mode))
2427 			break;
2428 
2429 		ret = darray_push(p, ((struct pathbuf_entry) {
2430 			.inum		= inode.bi_inum,
2431 			.snapshot	= snapshot,
2432 		}));
2433 		if (ret)
2434 			return ret;
2435 
2436 		snapshot = parent_snapshot;
2437 
2438 		bch2_trans_iter_exit(trans, &inode_iter);
2439 		inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
2440 					     SPOS(0, inode.bi_dir, snapshot), 0);
2441 		ret = bkey_err(inode_k) ?:
2442 			!bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
2443 			: bch2_inode_unpack(inode_k, &inode);
2444 		if (ret) {
2445 			/* Should have been caught in dirents pass */
2446 			if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
2447 				bch_err(c, "error looking up parent directory: %i", ret);
2448 			break;
2449 		}
2450 
2451 		snapshot = inode_k.k->p.snapshot;
2452 
2453 		if (path_is_dup(p, inode.bi_inum, snapshot)) {
2454 			/* XXX print path */
2455 			bch_err(c, "directory structure loop");
2456 
2457 			darray_for_each(*p, i)
2458 				pr_err("%llu:%u", i->inum, i->snapshot);
2459 			pr_err("%llu:%u", inode.bi_inum, snapshot);
2460 
2461 			if (fsck_err(c, dir_loop, "directory structure loop")) {
2462 				ret = remove_backpointer(trans, &inode);
2463 				bch_err_msg(c, ret, "removing dirent");
2464 				if (ret)
2465 					break;
2466 
2467 				ret = reattach_inode(trans, &inode, snapshot);
2468 				bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
2469 			}
2470 			break;
2471 		}
2472 	}
2473 out:
2474 fsck_err:
2475 	bch2_trans_iter_exit(trans, &inode_iter);
2476 	printbuf_exit(&buf);
2477 	bch_err_fn(c, ret);
2478 	return ret;
2479 }
2480 
2481 /*
2482  * Check for unreachable inodes, as well as loops in the directory structure:
2483  * After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's
2484  * unreachable:
2485  */
int bch2_check_directory_structure(struct bch_fs *c)
{
	pathbuf path = { 0, };
	int ret;

	ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
					  BTREE_ITER_intent|
					  BTREE_ITER_prefetch|
					  BTREE_ITER_all_snapshots, k,
					  NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
			if (!bkey_is_inode(k.k))
				continue;

			/* Unlinked inodes are legitimately unreachable: */
			if (bch2_inode_flags(k) & BCH_INODE_unlinked)
				continue;

			check_path(trans, &path, k);
		})));
	darray_exit(&path);

	bch_err_fn(c, ret);
	return ret;
}
2510 
/*
 * Table of hardlink candidates (non-directory inodes with nonzero nlink),
 * appended in btree key order so lookups can use bsearch:
 */
struct nlink_table {
	size_t		nr;	/* entries in use */
	size_t		size;	/* allocated capacity */

	struct nlink {
		u64	inum;
		u32	snapshot;
		u32	count;	/* dirent references observed so far */
	}		*d;
};
2521 
2522 static int add_nlink(struct bch_fs *c, struct nlink_table *t,
2523 		     u64 inum, u32 snapshot)
2524 {
2525 	if (t->nr == t->size) {
2526 		size_t new_size = max_t(size_t, 128UL, t->size * 2);
2527 		void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);
2528 
2529 		if (!d) {
2530 			bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
2531 				new_size);
2532 			return -BCH_ERR_ENOMEM_fsck_add_nlink;
2533 		}
2534 
2535 		if (t->d)
2536 			memcpy(d, t->d, t->size * sizeof(t->d[0]));
2537 		kvfree(t->d);
2538 
2539 		t->d = d;
2540 		t->size = new_size;
2541 	}
2542 
2543 
2544 	t->d[t->nr++] = (struct nlink) {
2545 		.inum		= inum,
2546 		.snapshot	= snapshot,
2547 	};
2548 
2549 	return 0;
2550 }
2551 
2552 static int nlink_cmp(const void *_l, const void *_r)
2553 {
2554 	const struct nlink *l = _l;
2555 	const struct nlink *r = _r;
2556 
2557 	return cmp_int(l->inum, r->inum);
2558 }
2559 
/*
 * For a dirent in @snapshot pointing at @inum, bump the refcount of the
 * matching link-table entries whose snapshots can see the dirent.  Inodes
 * outside [range_start, range_end) aren't in the table this pass.
 */
static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
		     struct nlink_table *links,
		     u64 range_start, u64 range_end, u64 inum, u32 snapshot)
{
	struct nlink *link, key = {
		.inum = inum, .snapshot = U32_MAX,
	};

	if (inum < range_start || inum >= range_end)
		return;

	link = __inline_bsearch(&key, links->d, links->nr,
				sizeof(links->d[0]), nlink_cmp);
	if (!link)
		return;

	/*
	 * nlink_cmp() only compares inum, so bsearch may land on any entry
	 * with this inode number; rewind to the first one:
	 */
	while (link > links->d && link[0].inum == link[-1].inum)
		--link;

	for (; link < links->d + links->nr && link->inum == inum; link++)
		if (ref_visible(c, s, snapshot, link->snapshot)) {
			link->count++;
			if (link->snapshot >= snapshot)
				break;
		}
}
2586 
/*
 * Pass 1 of check_nlinks: collect all hardlink-candidate inodes starting at
 * @start into @t.  If the table can't grow (allocation failure), record
 * where we stopped in *end and return 0, so the caller can process the
 * range [start, *end) and resume from *end on the next pass.
 */
noinline_for_stack
static int check_nlinks_find_hardlinks(struct bch_fs *c,
				       struct nlink_table *t,
				       u64 start, u64 *end)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_inodes,
				   POS(0, start),
				   BTREE_ITER_intent|
				   BTREE_ITER_prefetch|
				   BTREE_ITER_all_snapshots, k, ({
			if (!bkey_is_inode(k.k))
				continue;

			/* Should never fail, checked by bch2_inode_invalid: */
			struct bch_inode_unpacked u;
			BUG_ON(bch2_inode_unpack(k, &u));

			/*
			 * Backpointer and directory structure checks are sufficient for
			 * directories, since they can't have hardlinks:
			 */
			if (S_ISDIR(u.bi_mode))
				continue;

			if (!u.bi_nlink)
				continue;

			ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
			if (ret) {
				*end = k.k->p.offset;
				ret = 0;
				break;
			}
			0;
		})));

	bch_err_fn(c, ret);
	return ret;
}
2627 
/*
 * Pass 2 of check_nlinks: walk all dirents, counting references to each
 * link-table entry in [range_start, range_end).  Dirents to directories and
 * subvolumes are skipped - those can't be hardlinks.
 */
noinline_for_stack
static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
				     u64 range_start, u64 range_end)
{
	struct snapshots_seen s;

	snapshots_seen_init(&s);

	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
				   BTREE_ITER_intent|
				   BTREE_ITER_prefetch|
				   BTREE_ITER_all_snapshots, k, ({
			ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
			if (ret)
				break;

			if (k.k->type == KEY_TYPE_dirent) {
				struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);

				if (d.v->d_type != DT_DIR &&
				    d.v->d_type != DT_SUBVOL)
					inc_link(c, &s, links, range_start, range_end,
						 le64_to_cpu(d.v->d_inum), d.k->p.snapshot);
			}
			0;
		})));

	snapshots_seen_exit(&s);

	bch_err_fn(c, ret);
	return ret;
}
2661 
/*
 * Compare one inode's nlink against the dirent count accumulated in @links,
 * rewriting the inode if they disagree.  Returns 1 once we pass the end of
 * the current range, telling the caller to stop iterating.
 */
static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
				     struct bkey_s_c k,
				     struct nlink_table *links,
				     size_t *idx, u64 range_end)
{
	struct bch_fs *c = trans->c;
	struct bch_inode_unpacked u;
	struct nlink *link = &links->d[*idx];
	int ret = 0;

	if (k.k->p.offset >= range_end)
		return 1;

	if (!bkey_is_inode(k.k))
		return 0;

	BUG_ON(bch2_inode_unpack(k, &u));

	/* Directories weren't counted - they can't be hardlinked: */
	if (S_ISDIR(u.bi_mode))
		return 0;

	if (!u.bi_nlink)
		return 0;

	/*
	 * Advance through the table to this inode's entry; it must exist,
	 * since pass 1 added every candidate in this range:
	 */
	while ((cmp_int(link->inum, k.k->p.offset) ?:
		cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
		BUG_ON(*idx == links->nr);
		link = &links->d[++*idx];
	}

	if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
			c, inode_wrong_nlink,
			"inode %llu type %s has wrong i_nlink (%u, should be %u)",
			u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
			bch2_inode_nlink_get(&u), link->count)) {
		bch2_inode_nlink_set(&u, link->count);
		ret = __bch2_fsck_write_inode(trans, &u, k.k->p.snapshot);
	}
fsck_err:
	return ret;
}
2703 
/*
 * Pass 3 of check_nlinks: walk inodes in [range_start, range_end) and fix
 * any whose nlink disagrees with the counted dirent references.  A positive
 * return from check_nlinks_update_inode() just means we hit range_end, so
 * only negative values are errors.
 */
noinline_for_stack
static int check_nlinks_update_hardlinks(struct bch_fs *c,
			       struct nlink_table *links,
			       u64 range_start, u64 range_end)
{
	size_t idx = 0;

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
				POS(0, range_start),
				BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
	if (ret < 0) {
		bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
		return ret;
	}

	return 0;
}
2724 
2725 int bch2_check_nlinks(struct bch_fs *c)
2726 {
2727 	struct nlink_table links = { 0 };
2728 	u64 this_iter_range_start, next_iter_range_start = 0;
2729 	int ret = 0;
2730 
2731 	do {
2732 		this_iter_range_start = next_iter_range_start;
2733 		next_iter_range_start = U64_MAX;
2734 
2735 		ret = check_nlinks_find_hardlinks(c, &links,
2736 						  this_iter_range_start,
2737 						  &next_iter_range_start);
2738 
2739 		ret = check_nlinks_walk_dirents(c, &links,
2740 					  this_iter_range_start,
2741 					  next_iter_range_start);
2742 		if (ret)
2743 			break;
2744 
2745 		ret = check_nlinks_update_hardlinks(c, &links,
2746 					 this_iter_range_start,
2747 					 next_iter_range_start);
2748 		if (ret)
2749 			break;
2750 
2751 		links.nr = 0;
2752 	} while (next_iter_range_start != U64_MAX);
2753 
2754 	kvfree(links.d);
2755 	bch_err_fn(c, ret);
2756 	return ret;
2757 }
2758 
2759 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
2760 			     struct bkey_s_c k)
2761 {
2762 	struct bkey_s_c_reflink_p p;
2763 	struct bkey_i_reflink_p *u;
2764 
2765 	if (k.k->type != KEY_TYPE_reflink_p)
2766 		return 0;
2767 
2768 	p = bkey_s_c_to_reflink_p(k);
2769 
2770 	if (!p.v->front_pad && !p.v->back_pad)
2771 		return 0;
2772 
2773 	u = bch2_trans_kmalloc(trans, sizeof(*u));
2774 	int ret = PTR_ERR_OR_ZERO(u);
2775 	if (ret)
2776 		return ret;
2777 
2778 	bkey_reassemble(&u->k_i, k);
2779 	u->v.front_pad	= 0;
2780 	u->v.back_pad	= 0;
2781 
2782 	return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun);
2783 }
2784 
int bch2_fix_reflink_p(struct bch_fs *c)
{
	/* One-time repair; already done on new-enough filesystems: */
	if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
		return 0;

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
				BTREE_ID_extents, POS_MIN,
				BTREE_ITER_intent|BTREE_ITER_prefetch|
				BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			fix_reflink_p_key(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}
2800