xref: /linux/fs/bcachefs/fsck.c (revision f7d7ccf92f2b9398781f791b4af1a74a9f65b5c3)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_cache.h"
6 #include "btree_update.h"
7 #include "buckets.h"
8 #include "darray.h"
9 #include "dirent.h"
10 #include "error.h"
11 #include "fs-common.h"
12 #include "fsck.h"
13 #include "inode.h"
14 #include "keylist.h"
15 #include "recovery_passes.h"
16 #include "snapshot.h"
17 #include "super.h"
18 #include "xattr.h"
19 
20 #include <linux/bsearch.h>
21 #include <linux/dcache.h> /* struct qstr */
22 
23 /*
24  * XXX: this is handling transaction restarts without returning
25  * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
26  */
27 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
28 				    u32 snapshot)
29 {
30 	u64 sectors = 0;
31 
32 	int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
33 				SPOS(inum, 0, snapshot),
34 				POS(inum, U64_MAX),
35 				0, k, ({
36 		if (bkey_extent_is_allocation(k.k))
37 			sectors += k.k->size;
38 		0;
39 	}));
40 
41 	return ret ?: sectors;
42 }
43 
44 static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
45 				    u32 snapshot)
46 {
47 	u64 subdirs = 0;
48 
49 	int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_dirents,
50 				    SPOS(inum, 0, snapshot),
51 				    POS(inum, U64_MAX),
52 				    0, k, ({
53 		if (k.k->type == KEY_TYPE_dirent &&
54 		    bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
55 			subdirs++;
56 		0;
57 	}));
58 
59 	return ret ?: subdirs;
60 }
61 
62 static int subvol_lookup(struct btree_trans *trans, u32 subvol,
63 			 u32 *snapshot, u64 *inum)
64 {
65 	struct bch_subvolume s;
66 	int ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
67 
68 	*snapshot = le32_to_cpu(s.snapshot);
69 	*inum = le64_to_cpu(s.inode);
70 	return ret;
71 }
72 
73 static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
74 			      struct bch_inode_unpacked *inode)
75 {
76 	struct btree_iter iter;
77 	struct bkey_s_c k;
78 	int ret;
79 
80 	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr),
81 				     BTREE_ITER_all_snapshots, k, ret) {
82 		if (k.k->p.offset != inode_nr)
83 			break;
84 		if (!bkey_is_inode(k.k))
85 			continue;
86 		ret = bch2_inode_unpack(k, inode);
87 		goto found;
88 	}
89 	ret = -BCH_ERR_ENOENT_inode;
90 found:
91 	bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
92 	bch2_trans_iter_exit(trans, &iter);
93 	return ret;
94 }
95 
96 static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
97 			struct bch_inode_unpacked *inode,
98 			u32 *snapshot)
99 {
100 	struct btree_iter iter;
101 	struct bkey_s_c k;
102 	int ret;
103 
104 	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
105 			       SPOS(0, inode_nr, *snapshot), 0);
106 	ret = bkey_err(k);
107 	if (ret)
108 		goto err;
109 
110 	ret = bkey_is_inode(k.k)
111 		? bch2_inode_unpack(k, inode)
112 		: -BCH_ERR_ENOENT_inode;
113 	if (!ret)
114 		*snapshot = iter.pos.snapshot;
115 err:
116 	bch2_trans_iter_exit(trans, &iter);
117 	return ret;
118 }
119 
120 static int lookup_dirent_in_snapshot(struct btree_trans *trans,
121 			   struct bch_hash_info hash_info,
122 			   subvol_inum dir, struct qstr *name,
123 			   u64 *target, unsigned *type, u32 snapshot)
124 {
125 	struct btree_iter iter;
126 	struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
127 							 &hash_info, dir, name, 0, snapshot);
128 	int ret = bkey_err(k);
129 	if (ret)
130 		return ret;
131 
132 	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
133 	*target = le64_to_cpu(d.v->d_inum);
134 	*type = d.v->d_type;
135 	bch2_trans_iter_exit(trans, &iter);
136 	return 0;
137 }
138 
139 static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
140 {
141 	struct bch_fs *c = trans->c;
142 	struct btree_iter iter;
143 	struct bch_inode_unpacked dir_inode;
144 	struct bch_hash_info dir_hash_info;
145 	int ret;
146 
147 	ret = lookup_first_inode(trans, pos.inode, &dir_inode);
148 	if (ret)
149 		goto err;
150 
151 	dir_hash_info = bch2_hash_info_init(c, &dir_inode);
152 
153 	bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent);
154 
155 	ret =   bch2_btree_iter_traverse(&iter) ?:
156 		bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
157 				    &dir_hash_info, &iter,
158 				    BTREE_UPDATE_internal_snapshot_node);
159 	bch2_trans_iter_exit(trans, &iter);
160 err:
161 	bch_err_fn(c, ret);
162 	return ret;
163 }
164 
165 /* Get lost+found, create if it doesn't exist: */
166 static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
167 			    struct bch_inode_unpacked *lostfound,
168 			    u64 reattaching_inum)
169 {
170 	struct bch_fs *c = trans->c;
171 	struct qstr lostfound_str = QSTR("lost+found");
172 	u64 inum = 0;
173 	unsigned d_type = 0;
174 	int ret;
175 
176 	struct bch_snapshot_tree st;
177 	ret = bch2_snapshot_tree_lookup(trans,
178 			bch2_snapshot_tree(c, snapshot), &st);
179 	if (ret)
180 		return ret;
181 
182 	subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) };
183 
184 	struct bch_subvolume subvol;
185 	ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol),
186 				 false, 0, &subvol);
187 	bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u",
188 		    le32_to_cpu(st.master_subvol), snapshot);
189 	if (ret)
190 		return ret;
191 
192 	if (!subvol.inode) {
193 		struct btree_iter iter;
194 		struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter,
195 				BTREE_ID_subvolumes, POS(0, le32_to_cpu(st.master_subvol)),
196 				0, subvolume);
197 		ret = PTR_ERR_OR_ZERO(subvol);
198 		if (ret)
199 			return ret;
200 
201 		subvol->v.inode = cpu_to_le64(reattaching_inum);
202 		bch2_trans_iter_exit(trans, &iter);
203 	}
204 
205 	root_inum.inum = le64_to_cpu(subvol.inode);
206 
207 	struct bch_inode_unpacked root_inode;
208 	struct bch_hash_info root_hash_info;
209 	u32 root_inode_snapshot = snapshot;
210 	ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot);
211 	bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
212 		    root_inum.inum, le32_to_cpu(st.master_subvol));
213 	if (ret)
214 		return ret;
215 
216 	root_hash_info = bch2_hash_info_init(c, &root_inode);
217 
218 	ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
219 			      &lostfound_str, &inum, &d_type, snapshot);
220 	if (bch2_err_matches(ret, ENOENT))
221 		goto create_lostfound;
222 
223 	bch_err_fn(c, ret);
224 	if (ret)
225 		return ret;
226 
227 	if (d_type != DT_DIR) {
228 		bch_err(c, "error looking up lost+found: not a directory");
229 		return -BCH_ERR_ENOENT_not_directory;
230 	}
231 
232 	/*
233 	 * The bch2_check_dirents pass has already run, dangling dirents
234 	 * shouldn't exist here:
235 	 */
236 	ret = lookup_inode(trans, inum, lostfound, &snapshot);
237 	bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
238 		    inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
239 	return ret;
240 
241 create_lostfound:
242 	/*
243 	 * XXX: we could have a nicer log message here  if we had a nice way to
244 	 * walk backpointers to print a path
245 	 */
246 	bch_notice(c, "creating lost+found in snapshot %u", le32_to_cpu(st.root_snapshot));
247 
248 	u64 now = bch2_current_time(c);
249 	struct btree_iter lostfound_iter = { NULL };
250 	u64 cpu = raw_smp_processor_id();
251 
252 	bch2_inode_init_early(c, lostfound);
253 	bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
254 	lostfound->bi_dir = root_inode.bi_inum;
255 
256 	root_inode.bi_nlink++;
257 
258 	ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
259 	if (ret)
260 		goto err;
261 
262 	bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot);
263 	ret = bch2_btree_iter_traverse(&lostfound_iter);
264 	if (ret)
265 		goto err;
266 
267 	ret =   bch2_dirent_create_snapshot(trans,
268 				0, root_inode.bi_inum, snapshot, &root_hash_info,
269 				mode_to_type(lostfound->bi_mode),
270 				&lostfound_str,
271 				lostfound->bi_inum,
272 				&lostfound->bi_dir_offset,
273 				STR_HASH_must_create) ?:
274 		bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
275 				       BTREE_UPDATE_internal_snapshot_node);
276 err:
277 	bch_err_msg(c, ret, "creating lost+found");
278 	bch2_trans_iter_exit(trans, &lostfound_iter);
279 	return ret;
280 }
281 
282 static int reattach_inode(struct btree_trans *trans,
283 			  struct bch_inode_unpacked *inode,
284 			  u32 inode_snapshot)
285 {
286 	struct bch_fs *c = trans->c;
287 	struct bch_hash_info dir_hash;
288 	struct bch_inode_unpacked lostfound;
289 	char name_buf[20];
290 	struct qstr name;
291 	u64 dir_offset = 0;
292 	u32 dirent_snapshot = inode_snapshot;
293 	int ret;
294 
295 	if (inode->bi_subvol) {
296 		inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;
297 
298 		u64 root_inum;
299 		ret = subvol_lookup(trans, inode->bi_parent_subvol,
300 				    &dirent_snapshot, &root_inum);
301 		if (ret)
302 			return ret;
303 
304 		snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
305 	} else {
306 		snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
307 	}
308 
309 	ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
310 	if (ret)
311 		return ret;
312 
313 	if (S_ISDIR(inode->bi_mode)) {
314 		lostfound.bi_nlink++;
315 
316 		ret = __bch2_fsck_write_inode(trans, &lostfound, U32_MAX);
317 		if (ret)
318 			return ret;
319 	}
320 
321 	dir_hash = bch2_hash_info_init(c, &lostfound);
322 
323 	name = (struct qstr) QSTR(name_buf);
324 
325 	ret = bch2_dirent_create_snapshot(trans,
326 				inode->bi_parent_subvol, lostfound.bi_inum,
327 				dirent_snapshot,
328 				&dir_hash,
329 				inode_d_type(inode),
330 				&name,
331 				inode->bi_subvol ?: inode->bi_inum,
332 				&dir_offset,
333 				STR_HASH_must_create);
334 	if (ret) {
335 		bch_err_msg(c, ret, "error creating dirent");
336 		return ret;
337 	}
338 
339 	inode->bi_dir		= lostfound.bi_inum;
340 	inode->bi_dir_offset	= dir_offset;
341 
342 	return __bch2_fsck_write_inode(trans, inode, inode_snapshot);
343 }
344 
345 static int remove_backpointer(struct btree_trans *trans,
346 			      struct bch_inode_unpacked *inode)
347 {
348 	struct btree_iter iter;
349 	struct bkey_s_c_dirent d;
350 	int ret;
351 
352 	d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents,
353 				     POS(inode->bi_dir, inode->bi_dir_offset), 0,
354 				     dirent);
355 	ret =   bkey_err(d) ?:
356 		__remove_dirent(trans, d.k->p);
357 	bch2_trans_iter_exit(trans, &iter);
358 	return ret;
359 }
360 
361 static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s)
362 {
363 	struct bch_fs *c = trans->c;
364 
365 	struct bch_inode_unpacked inode;
366 	int ret = bch2_inode_find_by_inum_trans(trans,
367 				(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
368 				&inode);
369 	if (ret)
370 		return ret;
371 
372 	ret = remove_backpointer(trans, &inode);
373 	bch_err_msg(c, ret, "removing dirent");
374 	if (ret)
375 		return ret;
376 
377 	ret = reattach_inode(trans, &inode, le32_to_cpu(s.v->snapshot));
378 	bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
379 	return ret;
380 }
381 
382 static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
383 {
384 	struct bch_fs *c = trans->c;
385 
386 	if (!bch2_snapshot_is_leaf(c, snapshotid)) {
387 		bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
388 		return -BCH_ERR_fsck_repair_unimplemented;
389 	}
390 
391 	/*
392 	 * If inum isn't set, that means we're being called from check_dirents,
393 	 * not check_inodes - the root of this subvolume doesn't exist or we
394 	 * would have found it there:
395 	 */
396 	if (!inum) {
397 		struct btree_iter inode_iter = {};
398 		struct bch_inode_unpacked new_inode;
399 		u64 cpu = raw_smp_processor_id();
400 
401 		bch2_inode_init_early(c, &new_inode);
402 		bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);
403 
404 		new_inode.bi_subvol = subvolid;
405 
406 		int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
407 			  bch2_btree_iter_traverse(&inode_iter) ?:
408 			  bch2_inode_write(trans, &inode_iter, &new_inode);
409 		bch2_trans_iter_exit(trans, &inode_iter);
410 		if (ret)
411 			return ret;
412 
413 		inum = new_inode.bi_inum;
414 	}
415 
416 	bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);
417 
418 	struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
419 	int ret = PTR_ERR_OR_ZERO(new_subvol);
420 	if (ret)
421 		return ret;
422 
423 	bkey_subvolume_init(&new_subvol->k_i);
424 	new_subvol->k.p.offset	= subvolid;
425 	new_subvol->v.snapshot	= cpu_to_le32(snapshotid);
426 	new_subvol->v.inode	= cpu_to_le64(inum);
427 	ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
428 	if (ret)
429 		return ret;
430 
431 	struct btree_iter iter;
432 	struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
433 			BTREE_ID_snapshots, POS(0, snapshotid),
434 			0, snapshot);
435 	ret = PTR_ERR_OR_ZERO(s);
436 	bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
437 	if (ret)
438 		return ret;
439 
440 	u32 snapshot_tree = le32_to_cpu(s->v.tree);
441 
442 	s->v.subvol = cpu_to_le32(subvolid);
443 	SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
444 	bch2_trans_iter_exit(trans, &iter);
445 
446 	struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
447 			BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
448 			0, snapshot_tree);
449 	ret = PTR_ERR_OR_ZERO(st);
450 	bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
451 	if (ret)
452 		return ret;
453 
454 	if (!st->v.master_subvol)
455 		st->v.master_subvol = cpu_to_le32(subvolid);
456 
457 	bch2_trans_iter_exit(trans, &iter);
458 	return 0;
459 }
460 
461 static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum)
462 {
463 	struct bch_fs *c = trans->c;
464 	unsigned i_mode = S_IFREG;
465 	u64 i_size = 0;
466 
467 	switch (btree) {
468 	case BTREE_ID_extents: {
469 		struct btree_iter iter = {};
470 
471 		bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
472 		struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter);
473 		bch2_trans_iter_exit(trans, &iter);
474 		int ret = bkey_err(k);
475 		if (ret)
476 			return ret;
477 
478 		i_size = k.k->p.offset << 9;
479 		break;
480 	}
481 	case BTREE_ID_dirents:
482 		i_mode = S_IFDIR;
483 		break;
484 	case BTREE_ID_xattrs:
485 		break;
486 	default:
487 		BUG();
488 	}
489 
490 	struct bch_inode_unpacked new_inode;
491 	bch2_inode_init_early(c, &new_inode);
492 	bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL);
493 	new_inode.bi_size = i_size;
494 	new_inode.bi_inum = inum;
495 
496 	return __bch2_fsck_write_inode(trans, &new_inode, snapshot);
497 }
498 
499 struct snapshots_seen {
500 	struct bpos			pos;
501 	snapshot_id_list		ids;
502 };
503 
504 static inline void snapshots_seen_exit(struct snapshots_seen *s)
505 {
506 	darray_exit(&s->ids);
507 }
508 
509 static inline void snapshots_seen_init(struct snapshots_seen *s)
510 {
511 	memset(s, 0, sizeof(*s));
512 }
513 
514 static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
515 {
516 	u32 *i;
517 	__darray_for_each(s->ids, i) {
518 		if (*i == id)
519 			return 0;
520 		if (*i > id)
521 			break;
522 	}
523 
524 	int ret = darray_insert_item(&s->ids, i - s->ids.data, id);
525 	if (ret)
526 		bch_err(c, "error reallocating snapshots_seen table (size %zu)",
527 			s->ids.size);
528 	return ret;
529 }
530 
531 static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
532 				 enum btree_id btree_id, struct bpos pos)
533 {
534 	if (!bkey_eq(s->pos, pos))
535 		s->ids.nr = 0;
536 	s->pos = pos;
537 
538 	return snapshot_list_add_nodup(c, &s->ids, pos.snapshot);
539 }
540 
541 /**
542  * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
543  * and @ancestor hasn't been overwritten in @seen
544  *
545  * @c:		filesystem handle
546  * @seen:	list of snapshot ids already seen at current position
547  * @id:		descendent snapshot id
548  * @ancestor:	ancestor snapshot id
549  *
550  * Returns:	whether key in @ancestor snapshot is visible in @id snapshot
551  */
552 static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
553 				    u32 id, u32 ancestor)
554 {
555 	ssize_t i;
556 
557 	EBUG_ON(id > ancestor);
558 
559 	/* @ancestor should be the snapshot most recently added to @seen */
560 	EBUG_ON(ancestor != seen->pos.snapshot);
561 	EBUG_ON(ancestor != darray_last(seen->ids));
562 
563 	if (id == ancestor)
564 		return true;
565 
566 	if (!bch2_snapshot_is_ancestor(c, id, ancestor))
567 		return false;
568 
569 	/*
570 	 * We know that @id is a descendant of @ancestor, we're checking if
571 	 * we've seen a key that overwrote @ancestor - i.e. also a descendent of
572 	 * @ascestor and with @id as a descendent.
573 	 *
574 	 * But we already know that we're scanning IDs between @id and @ancestor
575 	 * numerically, since snapshot ID lists are kept sorted, so if we find
576 	 * an id that's an ancestor of @id we're done:
577 	 */
578 
579 	for (i = seen->ids.nr - 2;
580 	     i >= 0 && seen->ids.data[i] >= id;
581 	     --i)
582 		if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]))
583 			return false;
584 
585 	return true;
586 }
587 
588 /**
589  * ref_visible - given a key with snapshot id @src that points to a key with
590  * snapshot id @dst, test whether there is some snapshot in which @dst is
591  * visible.
592  *
593  * @c:		filesystem handle
594  * @s:		list of snapshot IDs already seen at @src
595  * @src:	snapshot ID of src key
596  * @dst:	snapshot ID of dst key
597  * Returns:	true if there is some snapshot in which @dst is visible
598  *
599  * Assumes we're visiting @src keys in natural key order
600  */
601 static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
602 			u32 src, u32 dst)
603 {
604 	return dst <= src
605 		? key_visible_in_snapshot(c, s, dst, src)
606 		: bch2_snapshot_is_ancestor(c, src, dst);
607 }
608 
609 static int ref_visible2(struct bch_fs *c,
610 			u32 src, struct snapshots_seen *src_seen,
611 			u32 dst, struct snapshots_seen *dst_seen)
612 {
613 	if (dst > src) {
614 		swap(dst, src);
615 		swap(dst_seen, src_seen);
616 	}
617 	return key_visible_in_snapshot(c, src_seen, dst, src);
618 }
619 
620 #define for_each_visible_inode(_c, _s, _w, _snapshot, _i)				\
621 	for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr &&	\
622 	     (_i)->snapshot <= (_snapshot); _i++)					\
623 		if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
624 
625 struct inode_walker_entry {
626 	struct bch_inode_unpacked inode;
627 	u32			snapshot;
628 	bool			seen_this_pos;
629 	u64			count;
630 };
631 
632 struct inode_walker {
633 	bool				first_this_inode;
634 	bool				recalculate_sums;
635 	struct bpos			last_pos;
636 
637 	DARRAY(struct inode_walker_entry) inodes;
638 };
639 
640 static void inode_walker_exit(struct inode_walker *w)
641 {
642 	darray_exit(&w->inodes);
643 }
644 
645 static struct inode_walker inode_walker_init(void)
646 {
647 	return (struct inode_walker) { 0, };
648 }
649 
650 static int add_inode(struct bch_fs *c, struct inode_walker *w,
651 		     struct bkey_s_c inode)
652 {
653 	struct bch_inode_unpacked u;
654 
655 	BUG_ON(bch2_inode_unpack(inode, &u));
656 
657 	return darray_push(&w->inodes, ((struct inode_walker_entry) {
658 		.inode		= u,
659 		.snapshot	= inode.k->p.snapshot,
660 	}));
661 }
662 
663 static int get_inodes_all_snapshots(struct btree_trans *trans,
664 				    struct inode_walker *w, u64 inum)
665 {
666 	struct bch_fs *c = trans->c;
667 	struct btree_iter iter;
668 	struct bkey_s_c k;
669 	int ret;
670 
671 	w->recalculate_sums = false;
672 	w->inodes.nr = 0;
673 
674 	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
675 				     BTREE_ITER_all_snapshots, k, ret) {
676 		if (k.k->p.offset != inum)
677 			break;
678 
679 		if (bkey_is_inode(k.k))
680 			add_inode(c, w, k);
681 	}
682 	bch2_trans_iter_exit(trans, &iter);
683 
684 	if (ret)
685 		return ret;
686 
687 	w->first_this_inode = true;
688 	return 0;
689 }
690 
691 static struct inode_walker_entry *
692 lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k)
693 {
694 	bool is_whiteout = k.k->type == KEY_TYPE_whiteout;
695 
696 	struct inode_walker_entry *i;
697 	__darray_for_each(w->inodes, i)
698 		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot))
699 			goto found;
700 
701 	return NULL;
702 found:
703 	BUG_ON(k.k->p.snapshot > i->snapshot);
704 
705 	if (k.k->p.snapshot != i->snapshot && !is_whiteout) {
706 		struct inode_walker_entry new = *i;
707 
708 		new.snapshot = k.k->p.snapshot;
709 		new.count = 0;
710 
711 		struct printbuf buf = PRINTBUF;
712 		bch2_bkey_val_to_text(&buf, c, k);
713 
714 		bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
715 			 "unexpected because we should always update the inode when we update a key in that inode\n"
716 			 "%s",
717 			 w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf);
718 		printbuf_exit(&buf);
719 
720 		while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot)
721 			--i;
722 
723 		size_t pos = i - w->inodes.data;
724 		int ret = darray_insert_item(&w->inodes, pos, new);
725 		if (ret)
726 			return ERR_PTR(ret);
727 
728 		i = w->inodes.data + pos;
729 	}
730 
731 	return i;
732 }
733 
734 static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
735 					     struct inode_walker *w,
736 					     struct bkey_s_c k)
737 {
738 	if (w->last_pos.inode != k.k->p.inode) {
739 		int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
740 		if (ret)
741 			return ERR_PTR(ret);
742 	} else if (bkey_cmp(w->last_pos, k.k->p)) {
743 		darray_for_each(w->inodes, i)
744 			i->seen_this_pos = false;
745 	}
746 
747 	w->last_pos = k.k->p;
748 
749 	return lookup_inode_for_snapshot(trans->c, w, k);
750 }
751 
752 static int get_visible_inodes(struct btree_trans *trans,
753 			      struct inode_walker *w,
754 			      struct snapshots_seen *s,
755 			      u64 inum)
756 {
757 	struct bch_fs *c = trans->c;
758 	struct btree_iter iter;
759 	struct bkey_s_c k;
760 	int ret;
761 
762 	w->inodes.nr = 0;
763 
764 	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
765 			   BTREE_ITER_all_snapshots, k, ret) {
766 		if (k.k->p.offset != inum)
767 			break;
768 
769 		if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot))
770 			continue;
771 
772 		if (bkey_is_inode(k.k))
773 			add_inode(c, w, k);
774 
775 		if (k.k->p.snapshot >= s->pos.snapshot)
776 			break;
777 	}
778 	bch2_trans_iter_exit(trans, &iter);
779 
780 	return ret;
781 }
782 
783 static int hash_redo_key(struct btree_trans *trans,
784 			 const struct bch_hash_desc desc,
785 			 struct bch_hash_info *hash_info,
786 			 struct btree_iter *k_iter, struct bkey_s_c k)
787 {
788 	struct bkey_i *delete;
789 	struct bkey_i *tmp;
790 
791 	delete = bch2_trans_kmalloc(trans, sizeof(*delete));
792 	if (IS_ERR(delete))
793 		return PTR_ERR(delete);
794 
795 	tmp = bch2_bkey_make_mut_noupdate(trans, k);
796 	if (IS_ERR(tmp))
797 		return PTR_ERR(tmp);
798 
799 	bkey_init(&delete->k);
800 	delete->k.p = k_iter->pos;
801 	return  bch2_btree_iter_traverse(k_iter) ?:
802 		bch2_trans_update(trans, k_iter, delete, 0) ?:
803 		bch2_hash_set_in_snapshot(trans, desc, hash_info,
804 				       (subvol_inum) { 0, k.k->p.inode },
805 				       k.k->p.snapshot, tmp,
806 				       STR_HASH_must_create|
807 				       BTREE_UPDATE_internal_snapshot_node) ?:
808 		bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
809 }
810 
811 static int hash_check_key(struct btree_trans *trans,
812 			  const struct bch_hash_desc desc,
813 			  struct bch_hash_info *hash_info,
814 			  struct btree_iter *k_iter, struct bkey_s_c hash_k)
815 {
816 	struct bch_fs *c = trans->c;
817 	struct btree_iter iter = { NULL };
818 	struct printbuf buf = PRINTBUF;
819 	struct bkey_s_c k;
820 	u64 hash;
821 	int ret = 0;
822 
823 	if (hash_k.k->type != desc.key_type)
824 		return 0;
825 
826 	hash = desc.hash_bkey(hash_info, hash_k);
827 
828 	if (likely(hash == hash_k.k->p.offset))
829 		return 0;
830 
831 	if (hash_k.k->p.offset < hash)
832 		goto bad_hash;
833 
834 	for_each_btree_key_norestart(trans, iter, desc.btree_id,
835 				     SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
836 				     BTREE_ITER_slots, k, ret) {
837 		if (bkey_eq(k.k->p, hash_k.k->p))
838 			break;
839 
840 		if (fsck_err_on(k.k->type == desc.key_type &&
841 				!desc.cmp_bkey(k, hash_k),
842 				trans, hash_table_key_duplicate,
843 				"duplicate hash table keys:\n%s",
844 				(printbuf_reset(&buf),
845 				 bch2_bkey_val_to_text(&buf, c, hash_k),
846 				 buf.buf))) {
847 			ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
848 			break;
849 		}
850 
851 		if (bkey_deleted(k.k)) {
852 			bch2_trans_iter_exit(trans, &iter);
853 			goto bad_hash;
854 		}
855 	}
856 out:
857 	bch2_trans_iter_exit(trans, &iter);
858 	printbuf_exit(&buf);
859 	return ret;
860 bad_hash:
861 	if (fsck_err(trans, hash_table_key_wrong_offset,
862 		     "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
863 		     bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
864 		     (printbuf_reset(&buf),
865 		      bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
866 		ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
867 		bch_err_fn(c, ret);
868 		if (ret)
869 			return ret;
870 		ret = -BCH_ERR_transaction_restart_nested;
871 	}
872 fsck_err:
873 	goto out;
874 }
875 
876 static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
877 						struct btree_iter *iter,
878 						struct bpos pos)
879 {
880 	return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
881 }
882 
883 static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
884 					       struct btree_iter *iter,
885 					       struct bch_inode_unpacked *inode,
886 					       u32 *snapshot)
887 {
888 	if (inode->bi_subvol) {
889 		u64 inum;
890 		int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
891 		if (ret)
892 			return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) });
893 	}
894 
895 	return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
896 }
897 
898 static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
899 				   struct bkey_s_c_dirent d)
900 {
901 	return  inode->bi_dir		== d.k->p.inode &&
902 		inode->bi_dir_offset	== d.k->p.offset;
903 }
904 
905 static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
906 				   struct bch_inode_unpacked *inode)
907 {
908 	return d.v->d_type == DT_SUBVOL
909 		? le32_to_cpu(d.v->d_child_subvol)	== inode->bi_subvol
910 		: le64_to_cpu(d.v->d_inum)		== inode->bi_inum;
911 }
912 
913 static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
914 {
915 	struct btree_iter iter;
916 	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
917 	int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
918 	bch2_trans_iter_exit(trans, &iter);
919 	return ret;
920 }
921 
922 static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c inode_k,
923 				    struct bch_inode_unpacked *inode,
924 				    u32 inode_snapshot, bool *write_inode)
925 {
926 	struct bch_fs *c = trans->c;
927 	struct printbuf buf = PRINTBUF;
928 
929 	struct btree_iter dirent_iter = {};
930 	struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
931 	int ret = bkey_err(d);
932 	if (ret && !bch2_err_matches(ret, ENOENT))
933 		return ret;
934 
935 	if (fsck_err_on(ret,
936 			trans, inode_points_to_missing_dirent,
937 			"inode points to missing dirent\n%s",
938 			(bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) ||
939 	    fsck_err_on(!ret && !dirent_points_to_inode(d, inode),
940 			trans, inode_points_to_wrong_dirent,
941 			"inode points to dirent that does not point back:\n%s",
942 			(bch2_bkey_val_to_text(&buf, c, inode_k),
943 			 prt_newline(&buf),
944 			 bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
945 		/*
946 		 * We just clear the backpointer fields for now. If we find a
947 		 * dirent that points to this inode in check_dirents(), we'll
948 		 * update it then; then when we get to check_path() if the
949 		 * backpointer is still 0 we'll reattach it.
950 		 */
951 		inode->bi_dir = 0;
952 		inode->bi_dir_offset = 0;
953 		inode->bi_flags &= ~BCH_INODE_backptr_untrusted;
954 		*write_inode = true;
955 	}
956 
957 	ret = 0;
958 fsck_err:
959 	bch2_trans_iter_exit(trans, &dirent_iter);
960 	printbuf_exit(&buf);
961 	bch_err_fn(c, ret);
962 	return ret;
963 }
964 
965 static int check_inode(struct btree_trans *trans,
966 		       struct btree_iter *iter,
967 		       struct bkey_s_c k,
968 		       struct bch_inode_unpacked *prev,
969 		       struct snapshots_seen *s,
970 		       bool full)
971 {
972 	struct bch_fs *c = trans->c;
973 	struct bch_inode_unpacked u;
974 	bool do_update = false;
975 	int ret;
976 
977 	ret = bch2_check_key_has_snapshot(trans, iter, k);
978 	if (ret < 0)
979 		goto err;
980 	if (ret)
981 		return 0;
982 
983 	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
984 	if (ret)
985 		goto err;
986 
987 	if (!bkey_is_inode(k.k))
988 		return 0;
989 
990 	BUG_ON(bch2_inode_unpack(k, &u));
991 
992 	if (!full &&
993 	    !(u.bi_flags & (BCH_INODE_i_size_dirty|
994 			    BCH_INODE_i_sectors_dirty|
995 			    BCH_INODE_unlinked)))
996 		return 0;
997 
998 	if (prev->bi_inum != u.bi_inum)
999 		*prev = u;
1000 
1001 	if (fsck_err_on(prev->bi_hash_seed	!= u.bi_hash_seed ||
1002 			inode_d_type(prev)	!= inode_d_type(&u),
1003 			trans, inode_snapshot_mismatch,
1004 			"inodes in different snapshots don't match")) {
1005 		bch_err(c, "repair not implemented yet");
1006 		return -BCH_ERR_fsck_repair_unimplemented;
1007 	}
1008 
1009 	if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) &&
1010 	    bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) {
1011 		struct bpos new_min_pos;
1012 
1013 		ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos);
1014 		if (ret)
1015 			goto err;
1016 
1017 		u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked;
1018 
1019 		ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
1020 
1021 		bch_err_msg(c, ret, "in fsck updating inode");
1022 		if (ret)
1023 			return ret;
1024 
1025 		if (!bpos_eq(new_min_pos, POS_MIN))
1026 			bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos));
1027 		return 0;
1028 	}
1029 
1030 	if (u.bi_flags & BCH_INODE_unlinked) {
1031 		ret = check_inode_deleted_list(trans, k.k->p);
1032 		if (ret < 0)
1033 			return ret;
1034 
1035 		fsck_err_on(!ret,
1036 			    trans, unlinked_inode_not_on_deleted_list,
1037 			    "inode %llu:%u unlinked, but not on deleted list",
1038 			    u.bi_inum, k.k->p.snapshot);
1039 		ret = 0;
1040 	}
1041 
1042 	if (u.bi_flags & BCH_INODE_unlinked &&
1043 	    (!c->sb.clean ||
1044 	     fsck_err(trans, inode_unlinked_but_clean,
1045 		      "filesystem marked clean, but inode %llu unlinked",
1046 		      u.bi_inum))) {
1047 		ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
1048 		bch_err_msg(c, ret, "in fsck deleting inode");
1049 		return ret;
1050 	}
1051 
1052 	if (u.bi_flags & BCH_INODE_i_size_dirty &&
1053 	    (!c->sb.clean ||
1054 	     fsck_err(trans, inode_i_size_dirty_but_clean,
1055 		      "filesystem marked clean, but inode %llu has i_size dirty",
1056 		      u.bi_inum))) {
1057 		bch_verbose(c, "truncating inode %llu", u.bi_inum);
1058 
1059 		/*
1060 		 * XXX: need to truncate partial blocks too here - or ideally
1061 		 * just switch units to bytes and that issue goes away
1062 		 */
1063 		ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
1064 				SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
1065 				     iter->pos.snapshot),
1066 				POS(u.bi_inum, U64_MAX),
1067 				0, NULL);
1068 		bch_err_msg(c, ret, "in fsck truncating inode");
1069 		if (ret)
1070 			return ret;
1071 
1072 		/*
1073 		 * We truncated without our normal sector accounting hook, just
1074 		 * make sure we recalculate it:
1075 		 */
1076 		u.bi_flags |= BCH_INODE_i_sectors_dirty;
1077 
1078 		u.bi_flags &= ~BCH_INODE_i_size_dirty;
1079 		do_update = true;
1080 	}
1081 
1082 	if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
1083 	    (!c->sb.clean ||
1084 	     fsck_err(trans, inode_i_sectors_dirty_but_clean,
1085 		      "filesystem marked clean, but inode %llu has i_sectors dirty",
1086 		      u.bi_inum))) {
1087 		s64 sectors;
1088 
1089 		bch_verbose(c, "recounting sectors for inode %llu",
1090 			    u.bi_inum);
1091 
1092 		sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
1093 		if (sectors < 0) {
1094 			bch_err_msg(c, sectors, "in fsck recounting inode sectors");
1095 			return sectors;
1096 		}
1097 
1098 		u.bi_sectors = sectors;
1099 		u.bi_flags &= ~BCH_INODE_i_sectors_dirty;
1100 		do_update = true;
1101 	}
1102 
1103 	if (u.bi_flags & BCH_INODE_backptr_untrusted) {
1104 		u.bi_dir = 0;
1105 		u.bi_dir_offset = 0;
1106 		u.bi_flags &= ~BCH_INODE_backptr_untrusted;
1107 		do_update = true;
1108 	}
1109 
1110 	if (u.bi_dir || u.bi_dir_offset) {
1111 		ret = check_inode_dirent_inode(trans, k, &u, k.k->p.snapshot, &do_update);
1112 		if (ret)
1113 			goto err;
1114 	}
1115 
1116 	if (fsck_err_on(u.bi_parent_subvol &&
1117 			(u.bi_subvol == 0 ||
1118 			 u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
1119 			trans, inode_bi_parent_nonzero,
1120 			"inode %llu:%u has subvol %u but nonzero parent subvol %u",
1121 			u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
1122 		u.bi_parent_subvol = 0;
1123 		do_update = true;
1124 	}
1125 
1126 	if (u.bi_subvol) {
1127 		struct bch_subvolume s;
1128 
1129 		ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s);
1130 		if (ret && !bch2_err_matches(ret, ENOENT))
1131 			goto err;
1132 
1133 		if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
1134 			ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum);
1135 			goto do_update;
1136 		}
1137 
1138 		if (fsck_err_on(ret,
1139 				trans, inode_bi_subvol_missing,
1140 				"inode %llu:%u bi_subvol points to missing subvolume %u",
1141 				u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
1142 		    fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
1143 				!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
1144 							   k.k->p.snapshot),
1145 				trans, inode_bi_subvol_wrong,
1146 				"inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
1147 				u.bi_inum, k.k->p.snapshot, u.bi_subvol,
1148 				le64_to_cpu(s.inode),
1149 				le32_to_cpu(s.snapshot))) {
1150 			u.bi_subvol = 0;
1151 			u.bi_parent_subvol = 0;
1152 			do_update = true;
1153 		}
1154 	}
1155 do_update:
1156 	if (do_update) {
1157 		ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
1158 		bch_err_msg(c, ret, "in fsck updating inode");
1159 		if (ret)
1160 			return ret;
1161 	}
1162 err:
1163 fsck_err:
1164 	bch_err_fn(c, ret);
1165 	return ret;
1166 }
1167 
1168 int bch2_check_inodes(struct bch_fs *c)
1169 {
1170 	bool full = c->opts.fsck;
1171 	struct bch_inode_unpacked prev = { 0 };
1172 	struct snapshots_seen s;
1173 
1174 	snapshots_seen_init(&s);
1175 
1176 	int ret = bch2_trans_run(c,
1177 		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
1178 				POS_MIN,
1179 				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
1180 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1181 			check_inode(trans, &iter, k, &prev, &s, full)));
1182 
1183 	snapshots_seen_exit(&s);
1184 	bch_err_fn(c, ret);
1185 	return ret;
1186 }
1187 
1188 static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode)
1189 {
1190 	switch (btree) {
1191 	case BTREE_ID_extents:
1192 		return S_ISREG(mode) || S_ISLNK(mode);
1193 	case BTREE_ID_dirents:
1194 		return S_ISDIR(mode);
1195 	case BTREE_ID_xattrs:
1196 		return true;
1197 	default:
1198 		BUG();
1199 	}
1200 }
1201 
1202 static int check_key_has_inode(struct btree_trans *trans,
1203 			       struct btree_iter *iter,
1204 			       struct inode_walker *inode,
1205 			       struct inode_walker_entry *i,
1206 			       struct bkey_s_c k)
1207 {
1208 	struct bch_fs *c = trans->c;
1209 	struct printbuf buf = PRINTBUF;
1210 	int ret = PTR_ERR_OR_ZERO(i);
1211 	if (ret)
1212 		return ret;
1213 
1214 	if (k.k->type == KEY_TYPE_whiteout)
1215 		goto out;
1216 
1217 	if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
1218 		ret =   reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?:
1219 			bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
1220 		if (ret)
1221 			goto err;
1222 
1223 		inode->last_pos.inode--;
1224 		ret = -BCH_ERR_transaction_restart_nested;
1225 		goto err;
1226 	}
1227 
1228 	if (fsck_err_on(!i,
1229 			trans, key_in_missing_inode,
1230 			"key in missing inode:\n  %s",
1231 			(printbuf_reset(&buf),
1232 			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1233 		goto delete;
1234 
1235 	if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode),
1236 			trans, key_in_wrong_inode_type,
1237 			"key for wrong inode mode %o:\n  %s",
1238 			i->inode.bi_mode,
1239 			(printbuf_reset(&buf),
1240 			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1241 		goto delete;
1242 out:
1243 err:
1244 fsck_err:
1245 	printbuf_exit(&buf);
1246 	bch_err_fn(c, ret);
1247 	return ret;
1248 delete:
1249 	ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
1250 	goto out;
1251 }
1252 
1253 static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
1254 {
1255 	struct bch_fs *c = trans->c;
1256 	int ret = 0;
1257 	s64 count2;
1258 
1259 	darray_for_each(w->inodes, i) {
1260 		if (i->inode.bi_sectors == i->count)
1261 			continue;
1262 
1263 		count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);
1264 
1265 		if (w->recalculate_sums)
1266 			i->count = count2;
1267 
1268 		if (i->count != count2) {
1269 			bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
1270 					    w->last_pos.inode, i->snapshot, i->count, count2);
1271 			return -BCH_ERR_internal_fsck_err;
1272 		}
1273 
1274 		if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
1275 				trans, inode_i_sectors_wrong,
1276 				"inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
1277 				w->last_pos.inode, i->snapshot,
1278 				i->inode.bi_sectors, i->count)) {
1279 			i->inode.bi_sectors = i->count;
1280 			ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
1281 			if (ret)
1282 				break;
1283 		}
1284 	}
1285 fsck_err:
1286 	bch_err_fn(c, ret);
1287 	return ret;
1288 }
1289 
1290 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
1291 {
1292 	u32 restart_count = trans->restart_count;
1293 	return check_i_sectors_notnested(trans, w) ?:
1294 		trans_was_restarted(trans, restart_count);
1295 }
1296 
1297 struct extent_end {
1298 	u32			snapshot;
1299 	u64			offset;
1300 	struct snapshots_seen	seen;
1301 };
1302 
1303 struct extent_ends {
1304 	struct bpos			last_pos;
1305 	DARRAY(struct extent_end)	e;
1306 };
1307 
1308 static void extent_ends_reset(struct extent_ends *extent_ends)
1309 {
1310 	darray_for_each(extent_ends->e, i)
1311 		snapshots_seen_exit(&i->seen);
1312 	extent_ends->e.nr = 0;
1313 }
1314 
1315 static void extent_ends_exit(struct extent_ends *extent_ends)
1316 {
1317 	extent_ends_reset(extent_ends);
1318 	darray_exit(&extent_ends->e);
1319 }
1320 
1321 static void extent_ends_init(struct extent_ends *extent_ends)
1322 {
1323 	memset(extent_ends, 0, sizeof(*extent_ends));
1324 }
1325 
1326 static int extent_ends_at(struct bch_fs *c,
1327 			  struct extent_ends *extent_ends,
1328 			  struct snapshots_seen *seen,
1329 			  struct bkey_s_c k)
1330 {
1331 	struct extent_end *i, n = (struct extent_end) {
1332 		.offset		= k.k->p.offset,
1333 		.snapshot	= k.k->p.snapshot,
1334 		.seen		= *seen,
1335 	};
1336 
1337 	n.seen.ids.data = kmemdup(seen->ids.data,
1338 			      sizeof(seen->ids.data[0]) * seen->ids.size,
1339 			      GFP_KERNEL);
1340 	if (!n.seen.ids.data)
1341 		return -BCH_ERR_ENOMEM_fsck_extent_ends_at;
1342 
1343 	__darray_for_each(extent_ends->e, i) {
1344 		if (i->snapshot == k.k->p.snapshot) {
1345 			snapshots_seen_exit(&i->seen);
1346 			*i = n;
1347 			return 0;
1348 		}
1349 
1350 		if (i->snapshot >= k.k->p.snapshot)
1351 			break;
1352 	}
1353 
1354 	return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
1355 }
1356 
1357 static int overlapping_extents_found(struct btree_trans *trans,
1358 				     enum btree_id btree,
1359 				     struct bpos pos1, struct snapshots_seen *pos1_seen,
1360 				     struct bkey pos2,
1361 				     bool *fixed,
1362 				     struct extent_end *extent_end)
1363 {
1364 	struct bch_fs *c = trans->c;
1365 	struct printbuf buf = PRINTBUF;
1366 	struct btree_iter iter1, iter2 = { NULL };
1367 	struct bkey_s_c k1, k2;
1368 	int ret;
1369 
1370 	BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));
1371 
1372 	bch2_trans_iter_init(trans, &iter1, btree, pos1,
1373 			     BTREE_ITER_all_snapshots|
1374 			     BTREE_ITER_not_extents);
1375 	k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX));
1376 	ret = bkey_err(k1);
1377 	if (ret)
1378 		goto err;
1379 
1380 	prt_str(&buf, "\n  ");
1381 	bch2_bkey_val_to_text(&buf, c, k1);
1382 
1383 	if (!bpos_eq(pos1, k1.k->p)) {
1384 		prt_str(&buf, "\n  wanted\n  ");
1385 		bch2_bpos_to_text(&buf, pos1);
1386 		prt_str(&buf, "\n  ");
1387 		bch2_bkey_to_text(&buf, &pos2);
1388 
1389 		bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
1390 			__func__, buf.buf);
1391 		ret = -BCH_ERR_internal_fsck_err;
1392 		goto err;
1393 	}
1394 
1395 	bch2_trans_copy_iter(&iter2, &iter1);
1396 
1397 	while (1) {
1398 		bch2_btree_iter_advance(&iter2);
1399 
1400 		k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX));
1401 		ret = bkey_err(k2);
1402 		if (ret)
1403 			goto err;
1404 
1405 		if (bpos_ge(k2.k->p, pos2.p))
1406 			break;
1407 	}
1408 
1409 	prt_str(&buf, "\n  ");
1410 	bch2_bkey_val_to_text(&buf, c, k2);
1411 
1412 	if (bpos_gt(k2.k->p, pos2.p) ||
1413 	    pos2.size != k2.k->size) {
1414 		bch_err(c, "%s: error finding seconding overlapping extent when repairing%s",
1415 			__func__, buf.buf);
1416 		ret = -BCH_ERR_internal_fsck_err;
1417 		goto err;
1418 	}
1419 
1420 	prt_printf(&buf, "\n  overwriting %s extent",
1421 		   pos1.snapshot >= pos2.p.snapshot ? "first" : "second");
1422 
1423 	if (fsck_err(trans, extent_overlapping,
1424 		     "overlapping extents%s", buf.buf)) {
1425 		struct btree_iter *old_iter = &iter1;
1426 		struct disk_reservation res = { 0 };
1427 
1428 		if (pos1.snapshot < pos2.p.snapshot) {
1429 			old_iter = &iter2;
1430 			swap(k1, k2);
1431 		}
1432 
1433 		trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);
1434 
1435 		ret =   bch2_trans_update_extent_overwrite(trans, old_iter,
1436 				BTREE_UPDATE_internal_snapshot_node,
1437 				k1, k2) ?:
1438 			bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
1439 		bch2_disk_reservation_put(c, &res);
1440 
1441 		if (ret)
1442 			goto err;
1443 
1444 		*fixed = true;
1445 
1446 		if (pos1.snapshot == pos2.p.snapshot) {
1447 			/*
1448 			 * We overwrote the first extent, and did the overwrite
1449 			 * in the same snapshot:
1450 			 */
1451 			extent_end->offset = bkey_start_offset(&pos2);
1452 		} else if (pos1.snapshot > pos2.p.snapshot) {
1453 			/*
1454 			 * We overwrote the first extent in pos2's snapshot:
1455 			 */
1456 			ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
1457 		} else {
1458 			/*
1459 			 * We overwrote the second extent - restart
1460 			 * check_extent() from the top:
1461 			 */
1462 			ret = -BCH_ERR_transaction_restart_nested;
1463 		}
1464 	}
1465 fsck_err:
1466 err:
1467 	bch2_trans_iter_exit(trans, &iter2);
1468 	bch2_trans_iter_exit(trans, &iter1);
1469 	printbuf_exit(&buf);
1470 	return ret;
1471 }
1472 
1473 static int check_overlapping_extents(struct btree_trans *trans,
1474 			      struct snapshots_seen *seen,
1475 			      struct extent_ends *extent_ends,
1476 			      struct bkey_s_c k,
1477 			      struct btree_iter *iter,
1478 			      bool *fixed)
1479 {
1480 	struct bch_fs *c = trans->c;
1481 	int ret = 0;
1482 
1483 	/* transaction restart, running again */
1484 	if (bpos_eq(extent_ends->last_pos, k.k->p))
1485 		return 0;
1486 
1487 	if (extent_ends->last_pos.inode != k.k->p.inode)
1488 		extent_ends_reset(extent_ends);
1489 
1490 	darray_for_each(extent_ends->e, i) {
1491 		if (i->offset <= bkey_start_offset(k.k))
1492 			continue;
1493 
1494 		if (!ref_visible2(c,
1495 				  k.k->p.snapshot, seen,
1496 				  i->snapshot, &i->seen))
1497 			continue;
1498 
1499 		ret = overlapping_extents_found(trans, iter->btree_id,
1500 						SPOS(iter->pos.inode,
1501 						     i->offset,
1502 						     i->snapshot),
1503 						&i->seen,
1504 						*k.k, fixed, i);
1505 		if (ret)
1506 			goto err;
1507 	}
1508 
1509 	extent_ends->last_pos = k.k->p;
1510 err:
1511 	return ret;
1512 }
1513 
1514 static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
1515 				struct bkey_s_c k)
1516 {
1517 	struct bch_fs *c = trans->c;
1518 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1519 	struct bch_extent_crc_unpacked crc;
1520 	const union bch_extent_entry *i;
1521 	unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;
1522 
1523 	bkey_for_each_crc(k.k, ptrs, crc, i)
1524 		if (crc_is_encoded(crc) &&
1525 		    crc.uncompressed_size > encoded_extent_max_sectors) {
1526 			struct printbuf buf = PRINTBUF;
1527 
1528 			bch2_bkey_val_to_text(&buf, c, k);
1529 			bch_err(c, "overbig encoded extent, please report this:\n  %s", buf.buf);
1530 			printbuf_exit(&buf);
1531 		}
1532 
1533 	return 0;
1534 }
1535 
1536 static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
1537 			struct bkey_s_c k,
1538 			struct inode_walker *inode,
1539 			struct snapshots_seen *s,
1540 			struct extent_ends *extent_ends)
1541 {
1542 	struct bch_fs *c = trans->c;
1543 	struct inode_walker_entry *i;
1544 	struct printbuf buf = PRINTBUF;
1545 	int ret = 0;
1546 
1547 	ret = bch2_check_key_has_snapshot(trans, iter, k);
1548 	if (ret) {
1549 		ret = ret < 0 ? ret : 0;
1550 		goto out;
1551 	}
1552 
1553 	if (inode->last_pos.inode != k.k->p.inode) {
1554 		ret = check_i_sectors(trans, inode);
1555 		if (ret)
1556 			goto err;
1557 	}
1558 
1559 	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
1560 	if (ret)
1561 		goto err;
1562 
1563 	i = walk_inode(trans, inode, k);
1564 	ret = PTR_ERR_OR_ZERO(i);
1565 	if (ret)
1566 		goto err;
1567 
1568 	ret = check_key_has_inode(trans, iter, inode, i, k);
1569 	if (ret)
1570 		goto err;
1571 
1572 	if (k.k->type != KEY_TYPE_whiteout) {
1573 		ret = check_overlapping_extents(trans, s, extent_ends, k, iter,
1574 						&inode->recalculate_sums);
1575 		if (ret)
1576 			goto err;
1577 	}
1578 
1579 	/*
1580 	 * Check inodes in reverse order, from oldest snapshots to newest,
1581 	 * starting from the inode that matches this extent's snapshot. If we
1582 	 * didn't have one, iterate over all inodes:
1583 	 */
1584 	if (!i)
1585 		i = &darray_last(inode->inodes);
1586 
1587 	for (;
1588 	     inode->inodes.data && i >= inode->inodes.data;
1589 	     --i) {
1590 		if (i->snapshot > k.k->p.snapshot ||
1591 		    !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
1592 			continue;
1593 
1594 		if (k.k->type != KEY_TYPE_whiteout) {
1595 			if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) &&
1596 					k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
1597 					!bkey_extent_is_reservation(k),
1598 					trans, extent_past_end_of_inode,
1599 					"extent type past end of inode %llu:%u, i_size %llu\n  %s",
1600 					i->inode.bi_inum, i->snapshot, i->inode.bi_size,
1601 					(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1602 				struct btree_iter iter2;
1603 
1604 				bch2_trans_copy_iter(&iter2, iter);
1605 				bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
1606 				ret =   bch2_btree_iter_traverse(&iter2) ?:
1607 					bch2_btree_delete_at(trans, &iter2,
1608 						BTREE_UPDATE_internal_snapshot_node);
1609 				bch2_trans_iter_exit(trans, &iter2);
1610 				if (ret)
1611 					goto err;
1612 
1613 				iter->k.type = KEY_TYPE_whiteout;
1614 			}
1615 
1616 			if (bkey_extent_is_allocation(k.k))
1617 				i->count += k.k->size;
1618 		}
1619 
1620 		i->seen_this_pos = true;
1621 	}
1622 
1623 	if (k.k->type != KEY_TYPE_whiteout) {
1624 		ret = extent_ends_at(c, extent_ends, s, k);
1625 		if (ret)
1626 			goto err;
1627 	}
1628 out:
1629 err:
1630 fsck_err:
1631 	printbuf_exit(&buf);
1632 	bch_err_fn(c, ret);
1633 	return ret;
1634 }
1635 
1636 /*
1637  * Walk extents: verify that extents have a corresponding S_ISREG inode, and
1638  * that i_size an i_sectors are consistent
1639  */
1640 int bch2_check_extents(struct bch_fs *c)
1641 {
1642 	struct inode_walker w = inode_walker_init();
1643 	struct snapshots_seen s;
1644 	struct extent_ends extent_ends;
1645 	struct disk_reservation res = { 0 };
1646 
1647 	snapshots_seen_init(&s);
1648 	extent_ends_init(&extent_ends);
1649 
1650 	int ret = bch2_trans_run(c,
1651 		for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
1652 				POS(BCACHEFS_ROOT_INO, 0),
1653 				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
1654 				&res, NULL,
1655 				BCH_TRANS_COMMIT_no_enospc, ({
1656 			bch2_disk_reservation_put(c, &res);
1657 			check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
1658 			check_extent_overbig(trans, &iter, k);
1659 		})) ?:
1660 		check_i_sectors_notnested(trans, &w));
1661 
1662 	bch2_disk_reservation_put(c, &res);
1663 	extent_ends_exit(&extent_ends);
1664 	inode_walker_exit(&w);
1665 	snapshots_seen_exit(&s);
1666 
1667 	bch_err_fn(c, ret);
1668 	return ret;
1669 }
1670 
1671 int bch2_check_indirect_extents(struct bch_fs *c)
1672 {
1673 	struct disk_reservation res = { 0 };
1674 
1675 	int ret = bch2_trans_run(c,
1676 		for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
1677 				POS_MIN,
1678 				BTREE_ITER_prefetch, k,
1679 				&res, NULL,
1680 				BCH_TRANS_COMMIT_no_enospc, ({
1681 			bch2_disk_reservation_put(c, &res);
1682 			check_extent_overbig(trans, &iter, k);
1683 		})));
1684 
1685 	bch2_disk_reservation_put(c, &res);
1686 	bch_err_fn(c, ret);
1687 	return ret;
1688 }
1689 
1690 static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w)
1691 {
1692 	struct bch_fs *c = trans->c;
1693 	int ret = 0;
1694 	s64 count2;
1695 
1696 	darray_for_each(w->inodes, i) {
1697 		if (i->inode.bi_nlink == i->count)
1698 			continue;
1699 
1700 		count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot);
1701 		if (count2 < 0)
1702 			return count2;
1703 
1704 		if (i->count != count2) {
1705 			bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
1706 					    w->last_pos.inode, i->snapshot, i->count, count2);
1707 			i->count = count2;
1708 			if (i->inode.bi_nlink == i->count)
1709 				continue;
1710 		}
1711 
1712 		if (fsck_err_on(i->inode.bi_nlink != i->count,
1713 				trans, inode_dir_wrong_nlink,
1714 				"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
1715 				w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
1716 			i->inode.bi_nlink = i->count;
1717 			ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
1718 			if (ret)
1719 				break;
1720 		}
1721 	}
1722 fsck_err:
1723 	bch_err_fn(c, ret);
1724 	return ret;
1725 }
1726 
1727 static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
1728 {
1729 	u32 restart_count = trans->restart_count;
1730 	return check_subdir_count_notnested(trans, w) ?:
1731 		trans_was_restarted(trans, restart_count);
1732 }
1733 
1734 noinline_for_stack
1735 static int check_dirent_inode_dirent(struct btree_trans *trans,
1736 				   struct btree_iter *iter,
1737 				   struct bkey_s_c_dirent d,
1738 				   struct bch_inode_unpacked *target,
1739 				   u32 target_snapshot)
1740 {
1741 	struct bch_fs *c = trans->c;
1742 	struct printbuf buf = PRINTBUF;
1743 	int ret = 0;
1744 
1745 	if (inode_points_to_dirent(target, d))
1746 		return 0;
1747 
1748 	if (bch2_inode_should_have_bp(target) &&
1749 	    !fsck_err(trans, inode_wrong_backpointer,
1750 		      "dirent points to inode that does not point back:\n  %s",
1751 		      (bch2_bkey_val_to_text(&buf, c, d.s_c),
1752 		       prt_printf(&buf, "\n  "),
1753 		       bch2_inode_unpacked_to_text(&buf, target),
1754 		       buf.buf)))
1755 		goto out_noiter;
1756 
1757 	if (!target->bi_dir &&
1758 	    !target->bi_dir_offset) {
1759 		target->bi_dir		= d.k->p.inode;
1760 		target->bi_dir_offset	= d.k->p.offset;
1761 		return __bch2_fsck_write_inode(trans, target, target_snapshot);
1762 	}
1763 
1764 	struct btree_iter bp_iter = { NULL };
1765 	struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
1766 			      SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
1767 	ret = bkey_err(bp_dirent);
1768 	if (ret && !bch2_err_matches(ret, ENOENT))
1769 		goto err;
1770 
1771 	bool backpointer_exists = !ret;
1772 	ret = 0;
1773 
1774 	if (fsck_err_on(!backpointer_exists,
1775 			trans, inode_wrong_backpointer,
1776 			"inode %llu:%u has wrong backpointer:\n"
1777 			"got       %llu:%llu\n"
1778 			"should be %llu:%llu",
1779 			target->bi_inum, target_snapshot,
1780 			target->bi_dir,
1781 			target->bi_dir_offset,
1782 			d.k->p.inode,
1783 			d.k->p.offset)) {
1784 		target->bi_dir		= d.k->p.inode;
1785 		target->bi_dir_offset	= d.k->p.offset;
1786 		ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
1787 		goto out;
1788 	}
1789 
1790 	bch2_bkey_val_to_text(&buf, c, d.s_c);
1791 	prt_newline(&buf);
1792 	if (backpointer_exists)
1793 		bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
1794 
1795 	if (fsck_err_on(backpointer_exists &&
1796 			(S_ISDIR(target->bi_mode) ||
1797 			 target->bi_subvol),
1798 			trans, inode_dir_multiple_links,
1799 			"%s %llu:%u with multiple links\n%s",
1800 			S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
1801 			target->bi_inum, target_snapshot, buf.buf)) {
1802 		ret = __remove_dirent(trans, d.k->p);
1803 		goto out;
1804 	}
1805 
1806 	/*
1807 	 * hardlinked file with nlink 0:
1808 	 * We're just adjusting nlink here so check_nlinks() will pick
1809 	 * it up, it ignores inodes with nlink 0
1810 	 */
1811 	if (fsck_err_on(backpointer_exists && !target->bi_nlink,
1812 			trans, inode_multiple_links_but_nlink_0,
1813 			"inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
1814 			target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
1815 		target->bi_nlink++;
1816 		target->bi_flags &= ~BCH_INODE_unlinked;
1817 		ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
1818 		if (ret)
1819 			goto err;
1820 	}
1821 out:
1822 err:
1823 fsck_err:
1824 	bch2_trans_iter_exit(trans, &bp_iter);
1825 out_noiter:
1826 	printbuf_exit(&buf);
1827 	bch_err_fn(c, ret);
1828 	return ret;
1829 }
1830 
1831 noinline_for_stack
1832 static int check_dirent_target(struct btree_trans *trans,
1833 			       struct btree_iter *iter,
1834 			       struct bkey_s_c_dirent d,
1835 			       struct bch_inode_unpacked *target,
1836 			       u32 target_snapshot)
1837 {
1838 	struct bch_fs *c = trans->c;
1839 	struct bkey_i_dirent *n;
1840 	struct printbuf buf = PRINTBUF;
1841 	int ret = 0;
1842 
1843 	ret = check_dirent_inode_dirent(trans, iter, d, target, target_snapshot);
1844 	if (ret)
1845 		goto err;
1846 
1847 	if (fsck_err_on(d.v->d_type != inode_d_type(target),
1848 			trans, dirent_d_type_wrong,
1849 			"incorrect d_type: got %s, should be %s:\n%s",
1850 			bch2_d_type_str(d.v->d_type),
1851 			bch2_d_type_str(inode_d_type(target)),
1852 			(printbuf_reset(&buf),
1853 			 bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
1854 		n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
1855 		ret = PTR_ERR_OR_ZERO(n);
1856 		if (ret)
1857 			goto err;
1858 
1859 		bkey_reassemble(&n->k_i, d.s_c);
1860 		n->v.d_type = inode_d_type(target);
1861 		if (n->v.d_type == DT_SUBVOL) {
1862 			n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
1863 			n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
1864 		} else {
1865 			n->v.d_inum = cpu_to_le64(target->bi_inum);
1866 		}
1867 
1868 		ret = bch2_trans_update(trans, iter, &n->k_i, 0);
1869 		if (ret)
1870 			goto err;
1871 
1872 		d = dirent_i_to_s_c(n);
1873 	}
1874 err:
1875 fsck_err:
1876 	printbuf_exit(&buf);
1877 	bch_err_fn(c, ret);
1878 	return ret;
1879 }
1880 
1881 /* find a subvolume that's a descendent of @snapshot: */
1882 static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
1883 {
1884 	struct btree_iter iter;
1885 	struct bkey_s_c k;
1886 	int ret;
1887 
1888 	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) {
1889 		if (k.k->type != KEY_TYPE_subvolume)
1890 			continue;
1891 
1892 		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
1893 		if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) {
1894 			bch2_trans_iter_exit(trans, &iter);
1895 			*subvolid = k.k->p.offset;
1896 			goto found;
1897 		}
1898 	}
1899 	if (!ret)
1900 		ret = -ENOENT;
1901 found:
1902 	bch2_trans_iter_exit(trans, &iter);
1903 	return ret;
1904 }
1905 
1906 noinline_for_stack
1907 static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
1908 				  struct bkey_s_c_dirent d)
1909 {
1910 	struct bch_fs *c = trans->c;
1911 	struct btree_iter subvol_iter = {};
1912 	struct bch_inode_unpacked subvol_root;
1913 	u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
1914 	u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
1915 	u32 parent_snapshot;
1916 	u32 new_parent_subvol = 0;
1917 	u64 parent_inum;
1918 	struct printbuf buf = PRINTBUF;
1919 	int ret = 0;
1920 
1921 	ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum);
1922 	if (ret && !bch2_err_matches(ret, ENOENT))
1923 		return ret;
1924 
1925 	if (ret ||
1926 	    (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) {
1927 		int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
1928 		if (ret2 && !bch2_err_matches(ret, ENOENT))
1929 			return ret2;
1930 	}
1931 
1932 	if (ret &&
1933 	    !new_parent_subvol &&
1934 	    (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
1935 		/*
1936 		 * Couldn't find a subvol for dirent's snapshot - but we lost
1937 		 * subvols, so we need to reconstruct:
1938 		 */
1939 		ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0);
1940 		if (ret)
1941 			return ret;
1942 
1943 		parent_snapshot = d.k->p.snapshot;
1944 	}
1945 
1946 	if (fsck_err_on(ret,
1947 			trans, dirent_to_missing_parent_subvol,
1948 			"dirent parent_subvol points to missing subvolume\n%s",
1949 			(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
1950 	    fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
1951 			trans, dirent_not_visible_in_parent_subvol,
1952 			"dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
1953 			parent_snapshot,
1954 			(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
1955 		if (!new_parent_subvol) {
1956 			bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
1957 			return -BCH_ERR_fsck_repair_unimplemented;
1958 		}
1959 
1960 		struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
1961 		ret = PTR_ERR_OR_ZERO(new_dirent);
1962 		if (ret)
1963 			goto err;
1964 
1965 		new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol);
1966 	}
1967 
1968 	struct bkey_s_c_subvolume s =
1969 		bch2_bkey_get_iter_typed(trans, &subvol_iter,
1970 					 BTREE_ID_subvolumes, POS(0, target_subvol),
1971 					 0, subvolume);
1972 	ret = bkey_err(s.s_c);
1973 	if (ret && !bch2_err_matches(ret, ENOENT))
1974 		return ret;
1975 
1976 	if (ret) {
1977 		if (fsck_err(trans, dirent_to_missing_subvol,
1978 			     "dirent points to missing subvolume\n%s",
1979 			     (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
1980 			return __remove_dirent(trans, d.k->p);
1981 		ret = 0;
1982 		goto out;
1983 	}
1984 
1985 	if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol,
1986 			trans, subvol_fs_path_parent_wrong,
1987 			"subvol with wrong fs_path_parent, should be be %u\n%s",
1988 			parent_subvol,
1989 			(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
1990 		struct bkey_i_subvolume *n =
1991 			bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
1992 		ret = PTR_ERR_OR_ZERO(n);
1993 		if (ret)
1994 			goto err;
1995 
1996 		n->v.fs_path_parent = cpu_to_le32(parent_subvol);
1997 	}
1998 
1999 	u64 target_inum = le64_to_cpu(s.v->inode);
2000 	u32 target_snapshot = le32_to_cpu(s.v->snapshot);
2001 
2002 	ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot);
2003 	if (ret && !bch2_err_matches(ret, ENOENT))
2004 		goto err;
2005 
2006 	if (ret) {
2007 		bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
2008 		ret = -BCH_ERR_fsck_repair_unimplemented;
2009 		goto err;
2010 	}
2011 
2012 	if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol,
2013 			trans, inode_bi_parent_wrong,
2014 			"subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
2015 			target_inum,
2016 			subvol_root.bi_parent_subvol, parent_subvol)) {
2017 		subvol_root.bi_parent_subvol = parent_subvol;
2018 		ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot);
2019 		if (ret)
2020 			goto err;
2021 	}
2022 
2023 	ret = check_dirent_target(trans, iter, d, &subvol_root,
2024 				  target_snapshot);
2025 	if (ret)
2026 		goto err;
2027 out:
2028 err:
2029 fsck_err:
2030 	bch2_trans_iter_exit(trans, &subvol_iter);
2031 	printbuf_exit(&buf);
2032 	return ret;
2033 }
2034 
2035 static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
2036 			struct bkey_s_c k,
2037 			struct bch_hash_info *hash_info,
2038 			struct inode_walker *dir,
2039 			struct inode_walker *target,
2040 			struct snapshots_seen *s)
2041 {
2042 	struct bch_fs *c = trans->c;
2043 	struct inode_walker_entry *i;
2044 	struct printbuf buf = PRINTBUF;
2045 	int ret = 0;
2046 
2047 	ret = bch2_check_key_has_snapshot(trans, iter, k);
2048 	if (ret) {
2049 		ret = ret < 0 ? ret : 0;
2050 		goto out;
2051 	}
2052 
2053 	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
2054 	if (ret)
2055 		goto err;
2056 
2057 	if (k.k->type == KEY_TYPE_whiteout)
2058 		goto out;
2059 
2060 	if (dir->last_pos.inode != k.k->p.inode) {
2061 		ret = check_subdir_count(trans, dir);
2062 		if (ret)
2063 			goto err;
2064 	}
2065 
2066 	i = walk_inode(trans, dir, k);
2067 	ret = PTR_ERR_OR_ZERO(i);
2068 	if (ret < 0)
2069 		goto err;
2070 
2071 	ret = check_key_has_inode(trans, iter, dir, i, k);
2072 	if (ret)
2073 		goto err;
2074 
2075 	if (!i)
2076 		goto out;
2077 
2078 	if (dir->first_this_inode)
2079 		*hash_info = bch2_hash_info_init(c, &i->inode);
2080 	dir->first_this_inode = false;
2081 
2082 	ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k);
2083 	if (ret < 0)
2084 		goto err;
2085 	if (ret) {
2086 		/* dirent has been deleted */
2087 		ret = 0;
2088 		goto out;
2089 	}
2090 
2091 	if (k.k->type != KEY_TYPE_dirent)
2092 		goto out;
2093 
2094 	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
2095 
2096 	if (d.v->d_type == DT_SUBVOL) {
2097 		ret = check_dirent_to_subvol(trans, iter, d);
2098 		if (ret)
2099 			goto err;
2100 	} else {
2101 		ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
2102 		if (ret)
2103 			goto err;
2104 
2105 		if (fsck_err_on(!target->inodes.nr,
2106 				trans, dirent_to_missing_inode,
2107 				"dirent points to missing inode:\n%s",
2108 				(printbuf_reset(&buf),
2109 				 bch2_bkey_val_to_text(&buf, c, k),
2110 				 buf.buf))) {
2111 			ret = __remove_dirent(trans, d.k->p);
2112 			if (ret)
2113 				goto err;
2114 		}
2115 
2116 		darray_for_each(target->inodes, i) {
2117 			ret = check_dirent_target(trans, iter, d,
2118 						  &i->inode, i->snapshot);
2119 			if (ret)
2120 				goto err;
2121 		}
2122 
2123 		if (d.v->d_type == DT_DIR)
2124 			for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
2125 				i->count++;
2126 	}
2127 out:
2128 err:
2129 fsck_err:
2130 	printbuf_exit(&buf);
2131 	bch_err_fn(c, ret);
2132 	return ret;
2133 }
2134 
2135 /*
2136  * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
2137  * validate d_type
2138  */
2139 int bch2_check_dirents(struct bch_fs *c)
2140 {
2141 	struct inode_walker dir = inode_walker_init();
2142 	struct inode_walker target = inode_walker_init();
2143 	struct snapshots_seen s;
2144 	struct bch_hash_info hash_info;
2145 
2146 	snapshots_seen_init(&s);
2147 
2148 	int ret = bch2_trans_run(c,
2149 		for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
2150 				POS(BCACHEFS_ROOT_INO, 0),
2151 				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
2152 				k,
2153 				NULL, NULL,
2154 				BCH_TRANS_COMMIT_no_enospc,
2155 			check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
2156 		check_subdir_count_notnested(trans, &dir));
2157 
2158 	snapshots_seen_exit(&s);
2159 	inode_walker_exit(&dir);
2160 	inode_walker_exit(&target);
2161 	bch_err_fn(c, ret);
2162 	return ret;
2163 }
2164 
2165 static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
2166 		       struct bkey_s_c k,
2167 		       struct bch_hash_info *hash_info,
2168 		       struct inode_walker *inode)
2169 {
2170 	struct bch_fs *c = trans->c;
2171 	struct inode_walker_entry *i;
2172 	int ret;
2173 
2174 	ret = bch2_check_key_has_snapshot(trans, iter, k);
2175 	if (ret < 0)
2176 		return ret;
2177 	if (ret)
2178 		return 0;
2179 
2180 	i = walk_inode(trans, inode, k);
2181 	ret = PTR_ERR_OR_ZERO(i);
2182 	if (ret)
2183 		return ret;
2184 
2185 	ret = check_key_has_inode(trans, iter, inode, i, k);
2186 	if (ret)
2187 		return ret;
2188 
2189 	if (!i)
2190 		return 0;
2191 
2192 	if (inode->first_this_inode)
2193 		*hash_info = bch2_hash_info_init(c, &i->inode);
2194 	inode->first_this_inode = false;
2195 
2196 	ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
2197 	bch_err_fn(c, ret);
2198 	return ret;
2199 }
2200 
2201 /*
2202  * Walk xattrs: verify that they all have a corresponding inode
2203  */
2204 int bch2_check_xattrs(struct bch_fs *c)
2205 {
2206 	struct inode_walker inode = inode_walker_init();
2207 	struct bch_hash_info hash_info;
2208 	int ret = 0;
2209 
2210 	ret = bch2_trans_run(c,
2211 		for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
2212 			POS(BCACHEFS_ROOT_INO, 0),
2213 			BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
2214 			k,
2215 			NULL, NULL,
2216 			BCH_TRANS_COMMIT_no_enospc,
2217 		check_xattr(trans, &iter, k, &hash_info, &inode)));
2218 
2219 	inode_walker_exit(&inode);
2220 	bch_err_fn(c, ret);
2221 	return ret;
2222 }
2223 
2224 static int check_root_trans(struct btree_trans *trans)
2225 {
2226 	struct bch_fs *c = trans->c;
2227 	struct bch_inode_unpacked root_inode;
2228 	u32 snapshot;
2229 	u64 inum;
2230 	int ret;
2231 
2232 	ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
2233 	if (ret && !bch2_err_matches(ret, ENOENT))
2234 		return ret;
2235 
2236 	if (mustfix_fsck_err_on(ret, trans, root_subvol_missing,
2237 				"root subvol missing")) {
2238 		struct bkey_i_subvolume *root_subvol =
2239 			bch2_trans_kmalloc(trans, sizeof(*root_subvol));
2240 		ret = PTR_ERR_OR_ZERO(root_subvol);
2241 		if (ret)
2242 			goto err;
2243 
2244 		snapshot	= U32_MAX;
2245 		inum		= BCACHEFS_ROOT_INO;
2246 
2247 		bkey_subvolume_init(&root_subvol->k_i);
2248 		root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL;
2249 		root_subvol->v.flags	= 0;
2250 		root_subvol->v.snapshot	= cpu_to_le32(snapshot);
2251 		root_subvol->v.inode	= cpu_to_le64(inum);
2252 		ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0);
2253 		bch_err_msg(c, ret, "writing root subvol");
2254 		if (ret)
2255 			goto err;
2256 	}
2257 
2258 	ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
2259 	if (ret && !bch2_err_matches(ret, ENOENT))
2260 		return ret;
2261 
2262 	if (mustfix_fsck_err_on(ret,
2263 				trans, root_dir_missing,
2264 				"root directory missing") ||
2265 	    mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
2266 				trans, root_inode_not_dir,
2267 				"root inode not a directory")) {
2268 		bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
2269 				0, NULL);
2270 		root_inode.bi_inum = inum;
2271 
2272 		ret = __bch2_fsck_write_inode(trans, &root_inode, snapshot);
2273 		bch_err_msg(c, ret, "writing root inode");
2274 	}
2275 err:
2276 fsck_err:
2277 	return ret;
2278 }
2279 
2280 /* Get root directory, create if it doesn't exist: */
2281 int bch2_check_root(struct bch_fs *c)
2282 {
2283 	int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2284 		check_root_trans(trans));
2285 	bch_err_fn(c, ret);
2286 	return ret;
2287 }
2288 
2289 typedef DARRAY(u32) darray_u32;
2290 
2291 static bool darray_u32_has(darray_u32 *d, u32 v)
2292 {
2293 	darray_for_each(*d, i)
2294 		if (*i == v)
2295 			return true;
2296 	return false;
2297 }
2298 
2299 /*
2300  * We've checked that inode backpointers point to valid dirents; here, it's
2301  * sufficient to check that the subvolume root has a dirent:
2302  */
2303 static int subvol_has_dirent(struct btree_trans *trans, struct bkey_s_c_subvolume s)
2304 {
2305 	struct bch_inode_unpacked inode;
2306 	int ret = bch2_inode_find_by_inum_trans(trans,
2307 				(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
2308 				&inode);
2309 	if (ret)
2310 		return ret;
2311 
2312 	return inode.bi_dir != 0;
2313 }
2314 
2315 static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
2316 {
2317 	struct bch_fs *c = trans->c;
2318 	struct btree_iter parent_iter = {};
2319 	darray_u32 subvol_path = {};
2320 	struct printbuf buf = PRINTBUF;
2321 	int ret = 0;
2322 
2323 	if (k.k->type != KEY_TYPE_subvolume)
2324 		return 0;
2325 
2326 	while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) {
2327 		ret = darray_push(&subvol_path, k.k->p.offset);
2328 		if (ret)
2329 			goto err;
2330 
2331 		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
2332 
2333 		ret = subvol_has_dirent(trans, s);
2334 		if (ret < 0)
2335 			break;
2336 
2337 		if (fsck_err_on(!ret,
2338 				trans, subvol_unreachable,
2339 				"unreachable subvolume %s",
2340 				(bch2_bkey_val_to_text(&buf, c, s.s_c),
2341 				 buf.buf))) {
2342 			ret = reattach_subvol(trans, s);
2343 			break;
2344 		}
2345 
2346 		u32 parent = le32_to_cpu(s.v->fs_path_parent);
2347 
2348 		if (darray_u32_has(&subvol_path, parent)) {
2349 			if (fsck_err(c, subvol_loop, "subvolume loop"))
2350 				ret = reattach_subvol(trans, s);
2351 			break;
2352 		}
2353 
2354 		bch2_trans_iter_exit(trans, &parent_iter);
2355 		bch2_trans_iter_init(trans, &parent_iter,
2356 				     BTREE_ID_subvolumes, POS(0, parent), 0);
2357 		k = bch2_btree_iter_peek_slot(&parent_iter);
2358 		ret = bkey_err(k);
2359 		if (ret)
2360 			goto err;
2361 
2362 		if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
2363 				trans, subvol_unreachable,
2364 				"unreachable subvolume %s",
2365 				(bch2_bkey_val_to_text(&buf, c, s.s_c),
2366 				 buf.buf))) {
2367 			ret = reattach_subvol(trans, s);
2368 			break;
2369 		}
2370 	}
2371 fsck_err:
2372 err:
2373 	printbuf_exit(&buf);
2374 	darray_exit(&subvol_path);
2375 	bch2_trans_iter_exit(trans, &parent_iter);
2376 	return ret;
2377 }
2378 
2379 int bch2_check_subvolume_structure(struct bch_fs *c)
2380 {
2381 	int ret = bch2_trans_run(c,
2382 		for_each_btree_key_commit(trans, iter,
2383 				BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
2384 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2385 			check_subvol_path(trans, &iter, k)));
2386 	bch_err_fn(c, ret);
2387 	return ret;
2388 }
2389 
2390 struct pathbuf_entry {
2391 	u64	inum;
2392 	u32	snapshot;
2393 };
2394 
2395 typedef DARRAY(struct pathbuf_entry) pathbuf;
2396 
2397 static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
2398 {
2399 	darray_for_each(*p, i)
2400 		if (i->inum	== inum &&
2401 		    i->snapshot	== snapshot)
2402 			return true;
2403 	return false;
2404 }
2405 
2406 /*
2407  * Check that a given inode is reachable from its subvolume root - we already
2408  * verified subvolume connectivity:
2409  *
2410  * XXX: we should also be verifying that inodes are in the right subvolumes
2411  */
2412 static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k)
2413 {
2414 	struct bch_fs *c = trans->c;
2415 	struct btree_iter inode_iter = {};
2416 	struct bch_inode_unpacked inode;
2417 	struct printbuf buf = PRINTBUF;
2418 	u32 snapshot = inode_k.k->p.snapshot;
2419 	int ret = 0;
2420 
2421 	p->nr = 0;
2422 
2423 	BUG_ON(bch2_inode_unpack(inode_k, &inode));
2424 
2425 	while (!inode.bi_subvol) {
2426 		struct btree_iter dirent_iter;
2427 		struct bkey_s_c_dirent d;
2428 		u32 parent_snapshot = snapshot;
2429 
2430 		d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot);
2431 		ret = bkey_err(d.s_c);
2432 		if (ret && !bch2_err_matches(ret, ENOENT))
2433 			break;
2434 
2435 		if (!ret && !dirent_points_to_inode(d, &inode)) {
2436 			bch2_trans_iter_exit(trans, &dirent_iter);
2437 			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
2438 		}
2439 
2440 		if (bch2_err_matches(ret, ENOENT)) {
2441 			ret = 0;
2442 			if (fsck_err(trans, inode_unreachable,
2443 				     "unreachable inode\n%s",
2444 				     (printbuf_reset(&buf),
2445 				      bch2_bkey_val_to_text(&buf, c, inode_k),
2446 				      buf.buf)))
2447 				ret = reattach_inode(trans, &inode, snapshot);
2448 			goto out;
2449 		}
2450 
2451 		bch2_trans_iter_exit(trans, &dirent_iter);
2452 
2453 		if (!S_ISDIR(inode.bi_mode))
2454 			break;
2455 
2456 		ret = darray_push(p, ((struct pathbuf_entry) {
2457 			.inum		= inode.bi_inum,
2458 			.snapshot	= snapshot,
2459 		}));
2460 		if (ret)
2461 			return ret;
2462 
2463 		snapshot = parent_snapshot;
2464 
2465 		bch2_trans_iter_exit(trans, &inode_iter);
2466 		inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
2467 					     SPOS(0, inode.bi_dir, snapshot), 0);
2468 		ret = bkey_err(inode_k) ?:
2469 			!bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
2470 			: bch2_inode_unpack(inode_k, &inode);
2471 		if (ret) {
2472 			/* Should have been caught in dirents pass */
2473 			bch_err_msg(c, ret, "error looking up parent directory");
2474 			break;
2475 		}
2476 
2477 		snapshot = inode_k.k->p.snapshot;
2478 
2479 		if (path_is_dup(p, inode.bi_inum, snapshot)) {
2480 			/* XXX print path */
2481 			bch_err(c, "directory structure loop");
2482 
2483 			darray_for_each(*p, i)
2484 				pr_err("%llu:%u", i->inum, i->snapshot);
2485 			pr_err("%llu:%u", inode.bi_inum, snapshot);
2486 
2487 			if (fsck_err(trans, dir_loop, "directory structure loop")) {
2488 				ret = remove_backpointer(trans, &inode);
2489 				bch_err_msg(c, ret, "removing dirent");
2490 				if (ret)
2491 					break;
2492 
2493 				ret = reattach_inode(trans, &inode, snapshot);
2494 				bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
2495 			}
2496 			break;
2497 		}
2498 	}
2499 out:
2500 fsck_err:
2501 	bch2_trans_iter_exit(trans, &inode_iter);
2502 	printbuf_exit(&buf);
2503 	bch_err_fn(c, ret);
2504 	return ret;
2505 }
2506 
2507 /*
2508  * Check for unreachable inodes, as well as loops in the directory structure:
2509  * After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's
2510  * unreachable:
2511  */
2512 int bch2_check_directory_structure(struct bch_fs *c)
2513 {
2514 	pathbuf path = { 0, };
2515 	int ret;
2516 
2517 	ret = bch2_trans_run(c,
2518 		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
2519 					  BTREE_ITER_intent|
2520 					  BTREE_ITER_prefetch|
2521 					  BTREE_ITER_all_snapshots, k,
2522 					  NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
2523 			if (!bkey_is_inode(k.k))
2524 				continue;
2525 
2526 			if (bch2_inode_flags(k) & BCH_INODE_unlinked)
2527 				continue;
2528 
2529 			check_path(trans, &path, k);
2530 		})));
2531 	darray_exit(&path);
2532 
2533 	bch_err_fn(c, ret);
2534 	return ret;
2535 }
2536 
2537 struct nlink_table {
2538 	size_t		nr;
2539 	size_t		size;
2540 
2541 	struct nlink {
2542 		u64	inum;
2543 		u32	snapshot;
2544 		u32	count;
2545 	}		*d;
2546 };
2547 
2548 static int add_nlink(struct bch_fs *c, struct nlink_table *t,
2549 		     u64 inum, u32 snapshot)
2550 {
2551 	if (t->nr == t->size) {
2552 		size_t new_size = max_t(size_t, 128UL, t->size * 2);
2553 		void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);
2554 
2555 		if (!d) {
2556 			bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
2557 				new_size);
2558 			return -BCH_ERR_ENOMEM_fsck_add_nlink;
2559 		}
2560 
2561 		if (t->d)
2562 			memcpy(d, t->d, t->size * sizeof(t->d[0]));
2563 		kvfree(t->d);
2564 
2565 		t->d = d;
2566 		t->size = new_size;
2567 	}
2568 
2569 
2570 	t->d[t->nr++] = (struct nlink) {
2571 		.inum		= inum,
2572 		.snapshot	= snapshot,
2573 	};
2574 
2575 	return 0;
2576 }
2577 
2578 static int nlink_cmp(const void *_l, const void *_r)
2579 {
2580 	const struct nlink *l = _l;
2581 	const struct nlink *r = _r;
2582 
2583 	return cmp_int(l->inum, r->inum);
2584 }
2585 
2586 static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
2587 		     struct nlink_table *links,
2588 		     u64 range_start, u64 range_end, u64 inum, u32 snapshot)
2589 {
2590 	struct nlink *link, key = {
2591 		.inum = inum, .snapshot = U32_MAX,
2592 	};
2593 
2594 	if (inum < range_start || inum >= range_end)
2595 		return;
2596 
2597 	link = __inline_bsearch(&key, links->d, links->nr,
2598 				sizeof(links->d[0]), nlink_cmp);
2599 	if (!link)
2600 		return;
2601 
2602 	while (link > links->d && link[0].inum == link[-1].inum)
2603 		--link;
2604 
2605 	for (; link < links->d + links->nr && link->inum == inum; link++)
2606 		if (ref_visible(c, s, snapshot, link->snapshot)) {
2607 			link->count++;
2608 			if (link->snapshot >= snapshot)
2609 				break;
2610 		}
2611 }
2612 
2613 noinline_for_stack
2614 static int check_nlinks_find_hardlinks(struct bch_fs *c,
2615 				       struct nlink_table *t,
2616 				       u64 start, u64 *end)
2617 {
2618 	int ret = bch2_trans_run(c,
2619 		for_each_btree_key(trans, iter, BTREE_ID_inodes,
2620 				   POS(0, start),
2621 				   BTREE_ITER_intent|
2622 				   BTREE_ITER_prefetch|
2623 				   BTREE_ITER_all_snapshots, k, ({
2624 			if (!bkey_is_inode(k.k))
2625 				continue;
2626 
2627 			/* Should never fail, checked by bch2_inode_invalid: */
2628 			struct bch_inode_unpacked u;
2629 			BUG_ON(bch2_inode_unpack(k, &u));
2630 
2631 			/*
2632 			 * Backpointer and directory structure checks are sufficient for
2633 			 * directories, since they can't have hardlinks:
2634 			 */
2635 			if (S_ISDIR(u.bi_mode))
2636 				continue;
2637 
2638 			if (!u.bi_nlink)
2639 				continue;
2640 
2641 			ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
2642 			if (ret) {
2643 				*end = k.k->p.offset;
2644 				ret = 0;
2645 				break;
2646 			}
2647 			0;
2648 		})));
2649 
2650 	bch_err_fn(c, ret);
2651 	return ret;
2652 }
2653 
2654 noinline_for_stack
2655 static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
2656 				     u64 range_start, u64 range_end)
2657 {
2658 	struct snapshots_seen s;
2659 
2660 	snapshots_seen_init(&s);
2661 
2662 	int ret = bch2_trans_run(c,
2663 		for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
2664 				   BTREE_ITER_intent|
2665 				   BTREE_ITER_prefetch|
2666 				   BTREE_ITER_all_snapshots, k, ({
2667 			ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
2668 			if (ret)
2669 				break;
2670 
2671 			if (k.k->type == KEY_TYPE_dirent) {
2672 				struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
2673 
2674 				if (d.v->d_type != DT_DIR &&
2675 				    d.v->d_type != DT_SUBVOL)
2676 					inc_link(c, &s, links, range_start, range_end,
2677 						 le64_to_cpu(d.v->d_inum), d.k->p.snapshot);
2678 			}
2679 			0;
2680 		})));
2681 
2682 	snapshots_seen_exit(&s);
2683 
2684 	bch_err_fn(c, ret);
2685 	return ret;
2686 }
2687 
2688 static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
2689 				     struct bkey_s_c k,
2690 				     struct nlink_table *links,
2691 				     size_t *idx, u64 range_end)
2692 {
2693 	struct bch_inode_unpacked u;
2694 	struct nlink *link = &links->d[*idx];
2695 	int ret = 0;
2696 
2697 	if (k.k->p.offset >= range_end)
2698 		return 1;
2699 
2700 	if (!bkey_is_inode(k.k))
2701 		return 0;
2702 
2703 	BUG_ON(bch2_inode_unpack(k, &u));
2704 
2705 	if (S_ISDIR(u.bi_mode))
2706 		return 0;
2707 
2708 	if (!u.bi_nlink)
2709 		return 0;
2710 
2711 	while ((cmp_int(link->inum, k.k->p.offset) ?:
2712 		cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
2713 		BUG_ON(*idx == links->nr);
2714 		link = &links->d[++*idx];
2715 	}
2716 
2717 	if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
2718 			trans, inode_wrong_nlink,
2719 			"inode %llu type %s has wrong i_nlink (%u, should be %u)",
2720 			u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
2721 			bch2_inode_nlink_get(&u), link->count)) {
2722 		bch2_inode_nlink_set(&u, link->count);
2723 		ret = __bch2_fsck_write_inode(trans, &u, k.k->p.snapshot);
2724 	}
2725 fsck_err:
2726 	return ret;
2727 }
2728 
2729 noinline_for_stack
2730 static int check_nlinks_update_hardlinks(struct bch_fs *c,
2731 			       struct nlink_table *links,
2732 			       u64 range_start, u64 range_end)
2733 {
2734 	size_t idx = 0;
2735 
2736 	int ret = bch2_trans_run(c,
2737 		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
2738 				POS(0, range_start),
2739 				BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
2740 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2741 			check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
2742 	if (ret < 0) {
2743 		bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
2744 		return ret;
2745 	}
2746 
2747 	return 0;
2748 }
2749 
2750 int bch2_check_nlinks(struct bch_fs *c)
2751 {
2752 	struct nlink_table links = { 0 };
2753 	u64 this_iter_range_start, next_iter_range_start = 0;
2754 	int ret = 0;
2755 
2756 	do {
2757 		this_iter_range_start = next_iter_range_start;
2758 		next_iter_range_start = U64_MAX;
2759 
2760 		ret = check_nlinks_find_hardlinks(c, &links,
2761 						  this_iter_range_start,
2762 						  &next_iter_range_start);
2763 
2764 		ret = check_nlinks_walk_dirents(c, &links,
2765 					  this_iter_range_start,
2766 					  next_iter_range_start);
2767 		if (ret)
2768 			break;
2769 
2770 		ret = check_nlinks_update_hardlinks(c, &links,
2771 					 this_iter_range_start,
2772 					 next_iter_range_start);
2773 		if (ret)
2774 			break;
2775 
2776 		links.nr = 0;
2777 	} while (next_iter_range_start != U64_MAX);
2778 
2779 	kvfree(links.d);
2780 	bch_err_fn(c, ret);
2781 	return ret;
2782 }
2783 
2784 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
2785 			     struct bkey_s_c k)
2786 {
2787 	struct bkey_s_c_reflink_p p;
2788 	struct bkey_i_reflink_p *u;
2789 
2790 	if (k.k->type != KEY_TYPE_reflink_p)
2791 		return 0;
2792 
2793 	p = bkey_s_c_to_reflink_p(k);
2794 
2795 	if (!p.v->front_pad && !p.v->back_pad)
2796 		return 0;
2797 
2798 	u = bch2_trans_kmalloc(trans, sizeof(*u));
2799 	int ret = PTR_ERR_OR_ZERO(u);
2800 	if (ret)
2801 		return ret;
2802 
2803 	bkey_reassemble(&u->k_i, k);
2804 	u->v.front_pad	= 0;
2805 	u->v.back_pad	= 0;
2806 
2807 	return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun);
2808 }
2809 
2810 int bch2_fix_reflink_p(struct bch_fs *c)
2811 {
2812 	if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
2813 		return 0;
2814 
2815 	int ret = bch2_trans_run(c,
2816 		for_each_btree_key_commit(trans, iter,
2817 				BTREE_ID_extents, POS_MIN,
2818 				BTREE_ITER_intent|BTREE_ITER_prefetch|
2819 				BTREE_ITER_all_snapshots, k,
2820 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2821 			fix_reflink_p_key(trans, &iter, k)));
2822 	bch_err_fn(c, ret);
2823 	return ret;
2824 }
2825