xref: /linux/fs/bcachefs/fsck.c (revision c434e25b62f8efcfbb6bf1f7ce55960206c1137e)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_cache.h"
6 #include "btree_update.h"
7 #include "buckets.h"
8 #include "darray.h"
9 #include "dirent.h"
10 #include "error.h"
11 #include "fs-common.h"
12 #include "fsck.h"
13 #include "inode.h"
14 #include "keylist.h"
15 #include "recovery_passes.h"
16 #include "snapshot.h"
17 #include "super.h"
18 #include "xattr.h"
19 
20 #include <linux/bsearch.h>
21 #include <linux/dcache.h> /* struct qstr */
22 
23 /*
24  * XXX: this is handling transaction restarts without returning
25  * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
26  */
27 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
28 				    u32 snapshot)
29 {
30 	u64 sectors = 0;
31 
32 	int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
33 				SPOS(inum, 0, snapshot),
34 				POS(inum, U64_MAX),
35 				0, k, ({
36 		if (bkey_extent_is_allocation(k.k))
37 			sectors += k.k->size;
38 		0;
39 	}));
40 
41 	return ret ?: sectors;
42 }
43 
44 static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
45 				    u32 snapshot)
46 {
47 	u64 subdirs = 0;
48 
49 	int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_dirents,
50 				    SPOS(inum, 0, snapshot),
51 				    POS(inum, U64_MAX),
52 				    0, k, ({
53 		if (k.k->type == KEY_TYPE_dirent &&
54 		    bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
55 			subdirs++;
56 		0;
57 	}));
58 
59 	return ret ?: subdirs;
60 }
61 
62 static int subvol_lookup(struct btree_trans *trans, u32 subvol,
63 			 u32 *snapshot, u64 *inum)
64 {
65 	struct bch_subvolume s;
66 	int ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
67 
68 	*snapshot = le32_to_cpu(s.snapshot);
69 	*inum = le64_to_cpu(s.inode);
70 	return ret;
71 }
72 
73 static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
74 			      struct bch_inode_unpacked *inode)
75 {
76 	struct btree_iter iter;
77 	struct bkey_s_c k;
78 	int ret;
79 
80 	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr),
81 				     BTREE_ITER_all_snapshots, k, ret) {
82 		if (k.k->p.offset != inode_nr)
83 			break;
84 		if (!bkey_is_inode(k.k))
85 			continue;
86 		ret = bch2_inode_unpack(k, inode);
87 		goto found;
88 	}
89 	ret = -BCH_ERR_ENOENT_inode;
90 found:
91 	bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
92 	bch2_trans_iter_exit(trans, &iter);
93 	return ret;
94 }
95 
96 static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
97 			struct bch_inode_unpacked *inode,
98 			u32 *snapshot)
99 {
100 	struct btree_iter iter;
101 	struct bkey_s_c k;
102 	int ret;
103 
104 	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
105 			       SPOS(0, inode_nr, *snapshot), 0);
106 	ret = bkey_err(k);
107 	if (ret)
108 		goto err;
109 
110 	ret = bkey_is_inode(k.k)
111 		? bch2_inode_unpack(k, inode)
112 		: -BCH_ERR_ENOENT_inode;
113 	if (!ret)
114 		*snapshot = iter.pos.snapshot;
115 err:
116 	bch2_trans_iter_exit(trans, &iter);
117 	return ret;
118 }
119 
120 static int lookup_dirent_in_snapshot(struct btree_trans *trans,
121 			   struct bch_hash_info hash_info,
122 			   subvol_inum dir, struct qstr *name,
123 			   u64 *target, unsigned *type, u32 snapshot)
124 {
125 	struct btree_iter iter;
126 	struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
127 							 &hash_info, dir, name, 0, snapshot);
128 	int ret = bkey_err(k);
129 	if (ret)
130 		return ret;
131 
132 	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
133 	*target = le64_to_cpu(d.v->d_inum);
134 	*type = d.v->d_type;
135 	bch2_trans_iter_exit(trans, &iter);
136 	return 0;
137 }
138 
139 static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
140 {
141 	struct bch_fs *c = trans->c;
142 	struct btree_iter iter;
143 	struct bch_inode_unpacked dir_inode;
144 	struct bch_hash_info dir_hash_info;
145 	int ret;
146 
147 	ret = lookup_first_inode(trans, pos.inode, &dir_inode);
148 	if (ret)
149 		goto err;
150 
151 	dir_hash_info = bch2_hash_info_init(c, &dir_inode);
152 
153 	bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent);
154 
155 	ret =   bch2_btree_iter_traverse(&iter) ?:
156 		bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
157 				    &dir_hash_info, &iter,
158 				    BTREE_UPDATE_internal_snapshot_node);
159 	bch2_trans_iter_exit(trans, &iter);
160 err:
161 	bch_err_fn(c, ret);
162 	return ret;
163 }
164 
165 /* Get lost+found, create if it doesn't exist: */
166 static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
167 			    struct bch_inode_unpacked *lostfound,
168 			    u64 reattaching_inum)
169 {
170 	struct bch_fs *c = trans->c;
171 	struct qstr lostfound_str = QSTR("lost+found");
172 	u64 inum = 0;
173 	unsigned d_type = 0;
174 	int ret;
175 
176 	struct bch_snapshot_tree st;
177 	ret = bch2_snapshot_tree_lookup(trans,
178 			bch2_snapshot_tree(c, snapshot), &st);
179 	if (ret)
180 		return ret;
181 
182 	subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) };
183 
184 	struct bch_subvolume subvol;
185 	ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol),
186 				 false, 0, &subvol);
187 	bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u",
188 		    le32_to_cpu(st.master_subvol), snapshot);
189 	if (ret)
190 		return ret;
191 
192 	if (!subvol.inode) {
193 		struct btree_iter iter;
194 		struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter,
195 				BTREE_ID_subvolumes, POS(0, le32_to_cpu(st.master_subvol)),
196 				0, subvolume);
197 		ret = PTR_ERR_OR_ZERO(subvol);
198 		if (ret)
199 			return ret;
200 
201 		subvol->v.inode = cpu_to_le64(reattaching_inum);
202 		bch2_trans_iter_exit(trans, &iter);
203 	}
204 
205 	root_inum.inum = le64_to_cpu(subvol.inode);
206 
207 	struct bch_inode_unpacked root_inode;
208 	struct bch_hash_info root_hash_info;
209 	u32 root_inode_snapshot = snapshot;
210 	ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot);
211 	bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
212 		    root_inum.inum, le32_to_cpu(st.master_subvol));
213 	if (ret)
214 		return ret;
215 
216 	root_hash_info = bch2_hash_info_init(c, &root_inode);
217 
218 	ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
219 			      &lostfound_str, &inum, &d_type, snapshot);
220 	if (bch2_err_matches(ret, ENOENT))
221 		goto create_lostfound;
222 
223 	bch_err_fn(c, ret);
224 	if (ret)
225 		return ret;
226 
227 	if (d_type != DT_DIR) {
228 		bch_err(c, "error looking up lost+found: not a directory");
229 		return -BCH_ERR_ENOENT_not_directory;
230 	}
231 
232 	/*
233 	 * The bch2_check_dirents pass has already run, dangling dirents
234 	 * shouldn't exist here:
235 	 */
236 	ret = lookup_inode(trans, inum, lostfound, &snapshot);
237 	bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
238 		    inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
239 	return ret;
240 
241 create_lostfound:
242 	/*
243 	 * XXX: we could have a nicer log message here  if we had a nice way to
244 	 * walk backpointers to print a path
245 	 */
246 	bch_notice(c, "creating lost+found in snapshot %u", le32_to_cpu(st.root_snapshot));
247 
248 	u64 now = bch2_current_time(c);
249 	struct btree_iter lostfound_iter = { NULL };
250 	u64 cpu = raw_smp_processor_id();
251 
252 	bch2_inode_init_early(c, lostfound);
253 	bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
254 	lostfound->bi_dir = root_inode.bi_inum;
255 
256 	root_inode.bi_nlink++;
257 
258 	ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
259 	if (ret)
260 		goto err;
261 
262 	bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot);
263 	ret = bch2_btree_iter_traverse(&lostfound_iter);
264 	if (ret)
265 		goto err;
266 
267 	ret =   bch2_dirent_create_snapshot(trans,
268 				0, root_inode.bi_inum, snapshot, &root_hash_info,
269 				mode_to_type(lostfound->bi_mode),
270 				&lostfound_str,
271 				lostfound->bi_inum,
272 				&lostfound->bi_dir_offset,
273 				STR_HASH_must_create) ?:
274 		bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
275 				       BTREE_UPDATE_internal_snapshot_node);
276 err:
277 	bch_err_msg(c, ret, "creating lost+found");
278 	bch2_trans_iter_exit(trans, &lostfound_iter);
279 	return ret;
280 }
281 
282 static int reattach_inode(struct btree_trans *trans,
283 			  struct bch_inode_unpacked *inode,
284 			  u32 inode_snapshot)
285 {
286 	struct bch_hash_info dir_hash;
287 	struct bch_inode_unpacked lostfound;
288 	char name_buf[20];
289 	struct qstr name;
290 	u64 dir_offset = 0;
291 	u32 dirent_snapshot = inode_snapshot;
292 	int ret;
293 
294 	if (inode->bi_subvol) {
295 		inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;
296 
297 		u64 root_inum;
298 		ret = subvol_lookup(trans, inode->bi_parent_subvol,
299 				    &dirent_snapshot, &root_inum);
300 		if (ret)
301 			return ret;
302 
303 		snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
304 	} else {
305 		snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
306 	}
307 
308 	ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
309 	if (ret)
310 		return ret;
311 
312 	if (S_ISDIR(inode->bi_mode)) {
313 		lostfound.bi_nlink++;
314 
315 		ret = __bch2_fsck_write_inode(trans, &lostfound, U32_MAX);
316 		if (ret)
317 			return ret;
318 	}
319 
320 	dir_hash = bch2_hash_info_init(trans->c, &lostfound);
321 
322 	name = (struct qstr) QSTR(name_buf);
323 
324 	ret = bch2_dirent_create_snapshot(trans,
325 				inode->bi_parent_subvol, lostfound.bi_inum,
326 				dirent_snapshot,
327 				&dir_hash,
328 				inode_d_type(inode),
329 				&name,
330 				inode->bi_subvol ?: inode->bi_inum,
331 				&dir_offset,
332 				STR_HASH_must_create);
333 	if (ret)
334 		return ret;
335 
336 	inode->bi_dir		= lostfound.bi_inum;
337 	inode->bi_dir_offset	= dir_offset;
338 
339 	return __bch2_fsck_write_inode(trans, inode, inode_snapshot);
340 }
341 
342 static int remove_backpointer(struct btree_trans *trans,
343 			      struct bch_inode_unpacked *inode)
344 {
345 	struct btree_iter iter;
346 	struct bkey_s_c_dirent d;
347 	int ret;
348 
349 	d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents,
350 				     POS(inode->bi_dir, inode->bi_dir_offset), 0,
351 				     dirent);
352 	ret =   bkey_err(d) ?:
353 		__remove_dirent(trans, d.k->p);
354 	bch2_trans_iter_exit(trans, &iter);
355 	return ret;
356 }
357 
358 static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s)
359 {
360 	struct bch_fs *c = trans->c;
361 
362 	struct bch_inode_unpacked inode;
363 	int ret = bch2_inode_find_by_inum_trans(trans,
364 				(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
365 				&inode);
366 	if (ret)
367 		return ret;
368 
369 	ret = remove_backpointer(trans, &inode);
370 	bch_err_msg(c, ret, "removing dirent");
371 	if (ret)
372 		return ret;
373 
374 	ret = reattach_inode(trans, &inode, le32_to_cpu(s.v->snapshot));
375 	bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
376 	return ret;
377 }
378 
379 static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
380 {
381 	struct bch_fs *c = trans->c;
382 
383 	if (!bch2_snapshot_is_leaf(c, snapshotid)) {
384 		bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
385 		return -BCH_ERR_fsck_repair_unimplemented;
386 	}
387 
388 	/*
389 	 * If inum isn't set, that means we're being called from check_dirents,
390 	 * not check_inodes - the root of this subvolume doesn't exist or we
391 	 * would have found it there:
392 	 */
393 	if (!inum) {
394 		struct btree_iter inode_iter = {};
395 		struct bch_inode_unpacked new_inode;
396 		u64 cpu = raw_smp_processor_id();
397 
398 		bch2_inode_init_early(c, &new_inode);
399 		bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);
400 
401 		new_inode.bi_subvol = subvolid;
402 
403 		int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
404 			  bch2_btree_iter_traverse(&inode_iter) ?:
405 			  bch2_inode_write(trans, &inode_iter, &new_inode);
406 		bch2_trans_iter_exit(trans, &inode_iter);
407 		if (ret)
408 			return ret;
409 
410 		inum = new_inode.bi_inum;
411 	}
412 
413 	bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);
414 
415 	struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
416 	int ret = PTR_ERR_OR_ZERO(new_subvol);
417 	if (ret)
418 		return ret;
419 
420 	bkey_subvolume_init(&new_subvol->k_i);
421 	new_subvol->k.p.offset	= subvolid;
422 	new_subvol->v.snapshot	= cpu_to_le32(snapshotid);
423 	new_subvol->v.inode	= cpu_to_le64(inum);
424 	ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
425 	if (ret)
426 		return ret;
427 
428 	struct btree_iter iter;
429 	struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
430 			BTREE_ID_snapshots, POS(0, snapshotid),
431 			0, snapshot);
432 	ret = PTR_ERR_OR_ZERO(s);
433 	bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
434 	if (ret)
435 		return ret;
436 
437 	u32 snapshot_tree = le32_to_cpu(s->v.tree);
438 
439 	s->v.subvol = cpu_to_le32(subvolid);
440 	SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
441 	bch2_trans_iter_exit(trans, &iter);
442 
443 	struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
444 			BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
445 			0, snapshot_tree);
446 	ret = PTR_ERR_OR_ZERO(st);
447 	bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
448 	if (ret)
449 		return ret;
450 
451 	if (!st->v.master_subvol)
452 		st->v.master_subvol = cpu_to_le32(subvolid);
453 
454 	bch2_trans_iter_exit(trans, &iter);
455 	return 0;
456 }
457 
458 static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum)
459 {
460 	struct bch_fs *c = trans->c;
461 	unsigned i_mode = S_IFREG;
462 	u64 i_size = 0;
463 
464 	switch (btree) {
465 	case BTREE_ID_extents: {
466 		struct btree_iter iter = {};
467 
468 		bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
469 		struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter);
470 		bch2_trans_iter_exit(trans, &iter);
471 		int ret = bkey_err(k);
472 		if (ret)
473 			return ret;
474 
475 		i_size = k.k->p.offset << 9;
476 		break;
477 	}
478 	case BTREE_ID_dirents:
479 		i_mode = S_IFDIR;
480 		break;
481 	case BTREE_ID_xattrs:
482 		break;
483 	default:
484 		BUG();
485 	}
486 
487 	struct bch_inode_unpacked new_inode;
488 	bch2_inode_init_early(c, &new_inode);
489 	bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL);
490 	new_inode.bi_size = i_size;
491 	new_inode.bi_inum = inum;
492 
493 	return __bch2_fsck_write_inode(trans, &new_inode, snapshot);
494 }
495 
496 struct snapshots_seen {
497 	struct bpos			pos;
498 	snapshot_id_list		ids;
499 };
500 
501 static inline void snapshots_seen_exit(struct snapshots_seen *s)
502 {
503 	darray_exit(&s->ids);
504 }
505 
506 static inline void snapshots_seen_init(struct snapshots_seen *s)
507 {
508 	memset(s, 0, sizeof(*s));
509 }
510 
511 static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
512 {
513 	u32 *i;
514 	__darray_for_each(s->ids, i) {
515 		if (*i == id)
516 			return 0;
517 		if (*i > id)
518 			break;
519 	}
520 
521 	int ret = darray_insert_item(&s->ids, i - s->ids.data, id);
522 	if (ret)
523 		bch_err(c, "error reallocating snapshots_seen table (size %zu)",
524 			s->ids.size);
525 	return ret;
526 }
527 
528 static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
529 				 enum btree_id btree_id, struct bpos pos)
530 {
531 	if (!bkey_eq(s->pos, pos))
532 		s->ids.nr = 0;
533 	s->pos = pos;
534 
535 	return snapshot_list_add_nodup(c, &s->ids, pos.snapshot);
536 }
537 
538 /**
539  * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
540  * and @ancestor hasn't been overwritten in @seen
541  *
542  * @c:		filesystem handle
543  * @seen:	list of snapshot ids already seen at current position
544  * @id:		descendent snapshot id
545  * @ancestor:	ancestor snapshot id
546  *
547  * Returns:	whether key in @ancestor snapshot is visible in @id snapshot
548  */
549 static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
550 				    u32 id, u32 ancestor)
551 {
552 	ssize_t i;
553 
554 	EBUG_ON(id > ancestor);
555 
556 	/* @ancestor should be the snapshot most recently added to @seen */
557 	EBUG_ON(ancestor != seen->pos.snapshot);
558 	EBUG_ON(ancestor != darray_last(seen->ids));
559 
560 	if (id == ancestor)
561 		return true;
562 
563 	if (!bch2_snapshot_is_ancestor(c, id, ancestor))
564 		return false;
565 
566 	/*
567 	 * We know that @id is a descendant of @ancestor, we're checking if
568 	 * we've seen a key that overwrote @ancestor - i.e. also a descendent of
569 	 * @ascestor and with @id as a descendent.
570 	 *
571 	 * But we already know that we're scanning IDs between @id and @ancestor
572 	 * numerically, since snapshot ID lists are kept sorted, so if we find
573 	 * an id that's an ancestor of @id we're done:
574 	 */
575 
576 	for (i = seen->ids.nr - 2;
577 	     i >= 0 && seen->ids.data[i] >= id;
578 	     --i)
579 		if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]))
580 			return false;
581 
582 	return true;
583 }
584 
585 /**
586  * ref_visible - given a key with snapshot id @src that points to a key with
587  * snapshot id @dst, test whether there is some snapshot in which @dst is
588  * visible.
589  *
590  * @c:		filesystem handle
591  * @s:		list of snapshot IDs already seen at @src
592  * @src:	snapshot ID of src key
593  * @dst:	snapshot ID of dst key
594  * Returns:	true if there is some snapshot in which @dst is visible
595  *
596  * Assumes we're visiting @src keys in natural key order
597  */
598 static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
599 			u32 src, u32 dst)
600 {
601 	return dst <= src
602 		? key_visible_in_snapshot(c, s, dst, src)
603 		: bch2_snapshot_is_ancestor(c, src, dst);
604 }
605 
606 static int ref_visible2(struct bch_fs *c,
607 			u32 src, struct snapshots_seen *src_seen,
608 			u32 dst, struct snapshots_seen *dst_seen)
609 {
610 	if (dst > src) {
611 		swap(dst, src);
612 		swap(dst_seen, src_seen);
613 	}
614 	return key_visible_in_snapshot(c, src_seen, dst, src);
615 }
616 
617 #define for_each_visible_inode(_c, _s, _w, _snapshot, _i)				\
618 	for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr &&	\
619 	     (_i)->snapshot <= (_snapshot); _i++)					\
620 		if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
621 
622 struct inode_walker_entry {
623 	struct bch_inode_unpacked inode;
624 	u32			snapshot;
625 	bool			seen_this_pos;
626 	u64			count;
627 };
628 
629 struct inode_walker {
630 	bool				first_this_inode;
631 	bool				recalculate_sums;
632 	struct bpos			last_pos;
633 
634 	DARRAY(struct inode_walker_entry) inodes;
635 };
636 
637 static void inode_walker_exit(struct inode_walker *w)
638 {
639 	darray_exit(&w->inodes);
640 }
641 
642 static struct inode_walker inode_walker_init(void)
643 {
644 	return (struct inode_walker) { 0, };
645 }
646 
647 static int add_inode(struct bch_fs *c, struct inode_walker *w,
648 		     struct bkey_s_c inode)
649 {
650 	struct bch_inode_unpacked u;
651 
652 	BUG_ON(bch2_inode_unpack(inode, &u));
653 
654 	return darray_push(&w->inodes, ((struct inode_walker_entry) {
655 		.inode		= u,
656 		.snapshot	= inode.k->p.snapshot,
657 	}));
658 }
659 
660 static int get_inodes_all_snapshots(struct btree_trans *trans,
661 				    struct inode_walker *w, u64 inum)
662 {
663 	struct bch_fs *c = trans->c;
664 	struct btree_iter iter;
665 	struct bkey_s_c k;
666 	int ret;
667 
668 	w->recalculate_sums = false;
669 	w->inodes.nr = 0;
670 
671 	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
672 				     BTREE_ITER_all_snapshots, k, ret) {
673 		if (k.k->p.offset != inum)
674 			break;
675 
676 		if (bkey_is_inode(k.k))
677 			add_inode(c, w, k);
678 	}
679 	bch2_trans_iter_exit(trans, &iter);
680 
681 	if (ret)
682 		return ret;
683 
684 	w->first_this_inode = true;
685 	return 0;
686 }
687 
688 static struct inode_walker_entry *
689 lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k)
690 {
691 	bool is_whiteout = k.k->type == KEY_TYPE_whiteout;
692 
693 	struct inode_walker_entry *i;
694 	__darray_for_each(w->inodes, i)
695 		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot))
696 			goto found;
697 
698 	return NULL;
699 found:
700 	BUG_ON(k.k->p.snapshot > i->snapshot);
701 
702 	if (k.k->p.snapshot != i->snapshot && !is_whiteout) {
703 		struct inode_walker_entry new = *i;
704 
705 		new.snapshot = k.k->p.snapshot;
706 		new.count = 0;
707 
708 		struct printbuf buf = PRINTBUF;
709 		bch2_bkey_val_to_text(&buf, c, k);
710 
711 		bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
712 			 "unexpected because we should always update the inode when we update a key in that inode\n"
713 			 "%s",
714 			 w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf);
715 		printbuf_exit(&buf);
716 
717 		while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot)
718 			--i;
719 
720 		size_t pos = i - w->inodes.data;
721 		int ret = darray_insert_item(&w->inodes, pos, new);
722 		if (ret)
723 			return ERR_PTR(ret);
724 
725 		i = w->inodes.data + pos;
726 	}
727 
728 	return i;
729 }
730 
731 static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
732 					     struct inode_walker *w,
733 					     struct bkey_s_c k)
734 {
735 	if (w->last_pos.inode != k.k->p.inode) {
736 		int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
737 		if (ret)
738 			return ERR_PTR(ret);
739 	} else if (bkey_cmp(w->last_pos, k.k->p)) {
740 		darray_for_each(w->inodes, i)
741 			i->seen_this_pos = false;
742 	}
743 
744 	w->last_pos = k.k->p;
745 
746 	return lookup_inode_for_snapshot(trans->c, w, k);
747 }
748 
749 static int get_visible_inodes(struct btree_trans *trans,
750 			      struct inode_walker *w,
751 			      struct snapshots_seen *s,
752 			      u64 inum)
753 {
754 	struct bch_fs *c = trans->c;
755 	struct btree_iter iter;
756 	struct bkey_s_c k;
757 	int ret;
758 
759 	w->inodes.nr = 0;
760 
761 	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
762 			   BTREE_ITER_all_snapshots, k, ret) {
763 		if (k.k->p.offset != inum)
764 			break;
765 
766 		if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot))
767 			continue;
768 
769 		if (bkey_is_inode(k.k))
770 			add_inode(c, w, k);
771 
772 		if (k.k->p.snapshot >= s->pos.snapshot)
773 			break;
774 	}
775 	bch2_trans_iter_exit(trans, &iter);
776 
777 	return ret;
778 }
779 
780 static int hash_redo_key(struct btree_trans *trans,
781 			 const struct bch_hash_desc desc,
782 			 struct bch_hash_info *hash_info,
783 			 struct btree_iter *k_iter, struct bkey_s_c k)
784 {
785 	struct bkey_i *delete;
786 	struct bkey_i *tmp;
787 
788 	delete = bch2_trans_kmalloc(trans, sizeof(*delete));
789 	if (IS_ERR(delete))
790 		return PTR_ERR(delete);
791 
792 	tmp = bch2_bkey_make_mut_noupdate(trans, k);
793 	if (IS_ERR(tmp))
794 		return PTR_ERR(tmp);
795 
796 	bkey_init(&delete->k);
797 	delete->k.p = k_iter->pos;
798 	return  bch2_btree_iter_traverse(k_iter) ?:
799 		bch2_trans_update(trans, k_iter, delete, 0) ?:
800 		bch2_hash_set_in_snapshot(trans, desc, hash_info,
801 				       (subvol_inum) { 0, k.k->p.inode },
802 				       k.k->p.snapshot, tmp,
803 				       STR_HASH_must_create|
804 				       BTREE_UPDATE_internal_snapshot_node) ?:
805 		bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
806 }
807 
808 static int hash_check_key(struct btree_trans *trans,
809 			  const struct bch_hash_desc desc,
810 			  struct bch_hash_info *hash_info,
811 			  struct btree_iter *k_iter, struct bkey_s_c hash_k)
812 {
813 	struct bch_fs *c = trans->c;
814 	struct btree_iter iter = { NULL };
815 	struct printbuf buf = PRINTBUF;
816 	struct bkey_s_c k;
817 	u64 hash;
818 	int ret = 0;
819 
820 	if (hash_k.k->type != desc.key_type)
821 		return 0;
822 
823 	hash = desc.hash_bkey(hash_info, hash_k);
824 
825 	if (likely(hash == hash_k.k->p.offset))
826 		return 0;
827 
828 	if (hash_k.k->p.offset < hash)
829 		goto bad_hash;
830 
831 	for_each_btree_key_norestart(trans, iter, desc.btree_id,
832 				     SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
833 				     BTREE_ITER_slots, k, ret) {
834 		if (bkey_eq(k.k->p, hash_k.k->p))
835 			break;
836 
837 		if (fsck_err_on(k.k->type == desc.key_type &&
838 				!desc.cmp_bkey(k, hash_k),
839 				trans, hash_table_key_duplicate,
840 				"duplicate hash table keys:\n%s",
841 				(printbuf_reset(&buf),
842 				 bch2_bkey_val_to_text(&buf, c, hash_k),
843 				 buf.buf))) {
844 			ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
845 			break;
846 		}
847 
848 		if (bkey_deleted(k.k)) {
849 			bch2_trans_iter_exit(trans, &iter);
850 			goto bad_hash;
851 		}
852 	}
853 out:
854 	bch2_trans_iter_exit(trans, &iter);
855 	printbuf_exit(&buf);
856 	return ret;
857 bad_hash:
858 	if (fsck_err(trans, hash_table_key_wrong_offset,
859 		     "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
860 		     bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
861 		     (printbuf_reset(&buf),
862 		      bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
863 		ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
864 		bch_err_fn(c, ret);
865 		if (ret)
866 			return ret;
867 		ret = -BCH_ERR_transaction_restart_nested;
868 	}
869 fsck_err:
870 	goto out;
871 }
872 
873 static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
874 						struct btree_iter *iter,
875 						struct bpos pos)
876 {
877 	return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
878 }
879 
880 static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
881 					       struct btree_iter *iter,
882 					       struct bch_inode_unpacked *inode,
883 					       u32 *snapshot)
884 {
885 	if (inode->bi_subvol) {
886 		u64 inum;
887 		int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
888 		if (ret)
889 			return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) });
890 	}
891 
892 	return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
893 }
894 
895 static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
896 				   struct bkey_s_c_dirent d)
897 {
898 	return  inode->bi_dir		== d.k->p.inode &&
899 		inode->bi_dir_offset	== d.k->p.offset;
900 }
901 
902 static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
903 				   struct bch_inode_unpacked *inode)
904 {
905 	return d.v->d_type == DT_SUBVOL
906 		? le32_to_cpu(d.v->d_child_subvol)	== inode->bi_subvol
907 		: le64_to_cpu(d.v->d_inum)		== inode->bi_inum;
908 }
909 
910 static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
911 {
912 	struct btree_iter iter;
913 	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
914 	int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
915 	bch2_trans_iter_exit(trans, &iter);
916 	return ret;
917 }
918 
919 static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c inode_k,
920 				    struct bch_inode_unpacked *inode,
921 				    u32 inode_snapshot, bool *write_inode)
922 {
923 	struct bch_fs *c = trans->c;
924 	struct printbuf buf = PRINTBUF;
925 
926 	struct btree_iter dirent_iter = {};
927 	struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
928 	int ret = bkey_err(d);
929 	if (ret && !bch2_err_matches(ret, ENOENT))
930 		return ret;
931 
932 	if (fsck_err_on(ret,
933 			trans, inode_points_to_missing_dirent,
934 			"inode points to missing dirent\n%s",
935 			(bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) ||
936 	    fsck_err_on(!ret && !dirent_points_to_inode(d, inode),
937 			trans, inode_points_to_wrong_dirent,
938 			"inode points to dirent that does not point back:\n%s",
939 			(bch2_bkey_val_to_text(&buf, c, inode_k),
940 			 prt_newline(&buf),
941 			 bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
942 		/*
943 		 * We just clear the backpointer fields for now. If we find a
944 		 * dirent that points to this inode in check_dirents(), we'll
945 		 * update it then; then when we get to check_path() if the
946 		 * backpointer is still 0 we'll reattach it.
947 		 */
948 		inode->bi_dir = 0;
949 		inode->bi_dir_offset = 0;
950 		inode->bi_flags &= ~BCH_INODE_backptr_untrusted;
951 		*write_inode = true;
952 	}
953 
954 	ret = 0;
955 fsck_err:
956 	bch2_trans_iter_exit(trans, &dirent_iter);
957 	printbuf_exit(&buf);
958 	bch_err_fn(c, ret);
959 	return ret;
960 }
961 
962 static int check_inode(struct btree_trans *trans,
963 		       struct btree_iter *iter,
964 		       struct bkey_s_c k,
965 		       struct bch_inode_unpacked *prev,
966 		       struct snapshots_seen *s,
967 		       bool full)
968 {
969 	struct bch_fs *c = trans->c;
970 	struct bch_inode_unpacked u;
971 	bool do_update = false;
972 	int ret;
973 
974 	ret = bch2_check_key_has_snapshot(trans, iter, k);
975 	if (ret < 0)
976 		goto err;
977 	if (ret)
978 		return 0;
979 
980 	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
981 	if (ret)
982 		goto err;
983 
984 	if (!bkey_is_inode(k.k))
985 		return 0;
986 
987 	BUG_ON(bch2_inode_unpack(k, &u));
988 
989 	if (!full &&
990 	    !(u.bi_flags & (BCH_INODE_i_size_dirty|
991 			    BCH_INODE_i_sectors_dirty|
992 			    BCH_INODE_unlinked)))
993 		return 0;
994 
995 	if (prev->bi_inum != u.bi_inum)
996 		*prev = u;
997 
998 	if (fsck_err_on(prev->bi_hash_seed	!= u.bi_hash_seed ||
999 			inode_d_type(prev)	!= inode_d_type(&u),
1000 			trans, inode_snapshot_mismatch,
1001 			"inodes in different snapshots don't match")) {
1002 		bch_err(c, "repair not implemented yet");
1003 		return -BCH_ERR_fsck_repair_unimplemented;
1004 	}
1005 
1006 	if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) &&
1007 	    bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) {
1008 		struct bpos new_min_pos;
1009 
1010 		ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos);
1011 		if (ret)
1012 			goto err;
1013 
1014 		u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked;
1015 
1016 		ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
1017 
1018 		bch_err_msg(c, ret, "in fsck updating inode");
1019 		if (ret)
1020 			return ret;
1021 
1022 		if (!bpos_eq(new_min_pos, POS_MIN))
1023 			bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos));
1024 		return 0;
1025 	}
1026 
1027 	if (u.bi_flags & BCH_INODE_unlinked) {
1028 		ret = check_inode_deleted_list(trans, k.k->p);
1029 		if (ret < 0)
1030 			return ret;
1031 
1032 		fsck_err_on(!ret,
1033 			    trans, unlinked_inode_not_on_deleted_list,
1034 			    "inode %llu:%u unlinked, but not on deleted list",
1035 			    u.bi_inum, k.k->p.snapshot);
1036 		ret = 0;
1037 	}
1038 
1039 	if (u.bi_flags & BCH_INODE_unlinked &&
1040 	    (!c->sb.clean ||
1041 	     fsck_err(trans, inode_unlinked_but_clean,
1042 		      "filesystem marked clean, but inode %llu unlinked",
1043 		      u.bi_inum))) {
1044 		ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
1045 		bch_err_msg(c, ret, "in fsck deleting inode");
1046 		return ret;
1047 	}
1048 
1049 	if (u.bi_flags & BCH_INODE_i_size_dirty &&
1050 	    (!c->sb.clean ||
1051 	     fsck_err(trans, inode_i_size_dirty_but_clean,
1052 		      "filesystem marked clean, but inode %llu has i_size dirty",
1053 		      u.bi_inum))) {
1054 		bch_verbose(c, "truncating inode %llu", u.bi_inum);
1055 
1056 		/*
1057 		 * XXX: need to truncate partial blocks too here - or ideally
1058 		 * just switch units to bytes and that issue goes away
1059 		 */
1060 		ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
1061 				SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
1062 				     iter->pos.snapshot),
1063 				POS(u.bi_inum, U64_MAX),
1064 				0, NULL);
1065 		bch_err_msg(c, ret, "in fsck truncating inode");
1066 		if (ret)
1067 			return ret;
1068 
1069 		/*
1070 		 * We truncated without our normal sector accounting hook, just
1071 		 * make sure we recalculate it:
1072 		 */
1073 		u.bi_flags |= BCH_INODE_i_sectors_dirty;
1074 
1075 		u.bi_flags &= ~BCH_INODE_i_size_dirty;
1076 		do_update = true;
1077 	}
1078 
1079 	if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
1080 	    (!c->sb.clean ||
1081 	     fsck_err(trans, inode_i_sectors_dirty_but_clean,
1082 		      "filesystem marked clean, but inode %llu has i_sectors dirty",
1083 		      u.bi_inum))) {
1084 		s64 sectors;
1085 
1086 		bch_verbose(c, "recounting sectors for inode %llu",
1087 			    u.bi_inum);
1088 
1089 		sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
1090 		if (sectors < 0) {
1091 			bch_err_msg(c, sectors, "in fsck recounting inode sectors");
1092 			return sectors;
1093 		}
1094 
1095 		u.bi_sectors = sectors;
1096 		u.bi_flags &= ~BCH_INODE_i_sectors_dirty;
1097 		do_update = true;
1098 	}
1099 
1100 	if (u.bi_flags & BCH_INODE_backptr_untrusted) {
1101 		u.bi_dir = 0;
1102 		u.bi_dir_offset = 0;
1103 		u.bi_flags &= ~BCH_INODE_backptr_untrusted;
1104 		do_update = true;
1105 	}
1106 
1107 	if (u.bi_dir || u.bi_dir_offset) {
1108 		ret = check_inode_dirent_inode(trans, k, &u, k.k->p.snapshot, &do_update);
1109 		if (ret)
1110 			goto err;
1111 	}
1112 
1113 	if (fsck_err_on(u.bi_parent_subvol &&
1114 			(u.bi_subvol == 0 ||
1115 			 u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
1116 			trans, inode_bi_parent_nonzero,
1117 			"inode %llu:%u has subvol %u but nonzero parent subvol %u",
1118 			u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
1119 		u.bi_parent_subvol = 0;
1120 		do_update = true;
1121 	}
1122 
1123 	if (u.bi_subvol) {
1124 		struct bch_subvolume s;
1125 
1126 		ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s);
1127 		if (ret && !bch2_err_matches(ret, ENOENT))
1128 			goto err;
1129 
1130 		if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
1131 			ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum);
1132 			goto do_update;
1133 		}
1134 
1135 		if (fsck_err_on(ret,
1136 				trans, inode_bi_subvol_missing,
1137 				"inode %llu:%u bi_subvol points to missing subvolume %u",
1138 				u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
1139 		    fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
1140 				!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
1141 							   k.k->p.snapshot),
1142 				trans, inode_bi_subvol_wrong,
1143 				"inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
1144 				u.bi_inum, k.k->p.snapshot, u.bi_subvol,
1145 				le64_to_cpu(s.inode),
1146 				le32_to_cpu(s.snapshot))) {
1147 			u.bi_subvol = 0;
1148 			u.bi_parent_subvol = 0;
1149 			do_update = true;
1150 		}
1151 	}
1152 do_update:
1153 	if (do_update) {
1154 		ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
1155 		bch_err_msg(c, ret, "in fsck updating inode");
1156 		if (ret)
1157 			return ret;
1158 	}
1159 err:
1160 fsck_err:
1161 	bch_err_fn(c, ret);
1162 	return ret;
1163 }
1164 
1165 int bch2_check_inodes(struct bch_fs *c)
1166 {
1167 	bool full = c->opts.fsck;
1168 	struct bch_inode_unpacked prev = { 0 };
1169 	struct snapshots_seen s;
1170 
1171 	snapshots_seen_init(&s);
1172 
1173 	int ret = bch2_trans_run(c,
1174 		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
1175 				POS_MIN,
1176 				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
1177 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1178 			check_inode(trans, &iter, k, &prev, &s, full)));
1179 
1180 	snapshots_seen_exit(&s);
1181 	bch_err_fn(c, ret);
1182 	return ret;
1183 }
1184 
1185 static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode)
1186 {
1187 	switch (btree) {
1188 	case BTREE_ID_extents:
1189 		return S_ISREG(mode) || S_ISLNK(mode);
1190 	case BTREE_ID_dirents:
1191 		return S_ISDIR(mode);
1192 	case BTREE_ID_xattrs:
1193 		return true;
1194 	default:
1195 		BUG();
1196 	}
1197 }
1198 
1199 static int check_key_has_inode(struct btree_trans *trans,
1200 			       struct btree_iter *iter,
1201 			       struct inode_walker *inode,
1202 			       struct inode_walker_entry *i,
1203 			       struct bkey_s_c k)
1204 {
1205 	struct bch_fs *c = trans->c;
1206 	struct printbuf buf = PRINTBUF;
1207 	int ret = PTR_ERR_OR_ZERO(i);
1208 	if (ret)
1209 		return ret;
1210 
1211 	if (k.k->type == KEY_TYPE_whiteout)
1212 		goto out;
1213 
1214 	if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
1215 		ret =   reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?:
1216 			bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
1217 		if (ret)
1218 			goto err;
1219 
1220 		inode->last_pos.inode--;
1221 		ret = -BCH_ERR_transaction_restart_nested;
1222 		goto err;
1223 	}
1224 
1225 	if (fsck_err_on(!i,
1226 			trans, key_in_missing_inode,
1227 			"key in missing inode:\n  %s",
1228 			(printbuf_reset(&buf),
1229 			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1230 		goto delete;
1231 
1232 	if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode),
1233 			trans, key_in_wrong_inode_type,
1234 			"key for wrong inode mode %o:\n  %s",
1235 			i->inode.bi_mode,
1236 			(printbuf_reset(&buf),
1237 			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1238 		goto delete;
1239 out:
1240 err:
1241 fsck_err:
1242 	printbuf_exit(&buf);
1243 	bch_err_fn(c, ret);
1244 	return ret;
1245 delete:
1246 	ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
1247 	goto out;
1248 }
1249 
1250 static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
1251 {
1252 	struct bch_fs *c = trans->c;
1253 	int ret = 0;
1254 	s64 count2;
1255 
1256 	darray_for_each(w->inodes, i) {
1257 		if (i->inode.bi_sectors == i->count)
1258 			continue;
1259 
1260 		count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);
1261 
1262 		if (w->recalculate_sums)
1263 			i->count = count2;
1264 
1265 		if (i->count != count2) {
1266 			bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
1267 					    w->last_pos.inode, i->snapshot, i->count, count2);
1268 			return -BCH_ERR_internal_fsck_err;
1269 		}
1270 
1271 		if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
1272 				trans, inode_i_sectors_wrong,
1273 				"inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
1274 				w->last_pos.inode, i->snapshot,
1275 				i->inode.bi_sectors, i->count)) {
1276 			i->inode.bi_sectors = i->count;
1277 			ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
1278 			if (ret)
1279 				break;
1280 		}
1281 	}
1282 fsck_err:
1283 	bch_err_fn(c, ret);
1284 	return ret;
1285 }
1286 
1287 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
1288 {
1289 	u32 restart_count = trans->restart_count;
1290 	return check_i_sectors_notnested(trans, w) ?:
1291 		trans_was_restarted(trans, restart_count);
1292 }
1293 
1294 struct extent_end {
1295 	u32			snapshot;
1296 	u64			offset;
1297 	struct snapshots_seen	seen;
1298 };
1299 
1300 struct extent_ends {
1301 	struct bpos			last_pos;
1302 	DARRAY(struct extent_end)	e;
1303 };
1304 
1305 static void extent_ends_reset(struct extent_ends *extent_ends)
1306 {
1307 	darray_for_each(extent_ends->e, i)
1308 		snapshots_seen_exit(&i->seen);
1309 	extent_ends->e.nr = 0;
1310 }
1311 
1312 static void extent_ends_exit(struct extent_ends *extent_ends)
1313 {
1314 	extent_ends_reset(extent_ends);
1315 	darray_exit(&extent_ends->e);
1316 }
1317 
1318 static void extent_ends_init(struct extent_ends *extent_ends)
1319 {
1320 	memset(extent_ends, 0, sizeof(*extent_ends));
1321 }
1322 
1323 static int extent_ends_at(struct bch_fs *c,
1324 			  struct extent_ends *extent_ends,
1325 			  struct snapshots_seen *seen,
1326 			  struct bkey_s_c k)
1327 {
1328 	struct extent_end *i, n = (struct extent_end) {
1329 		.offset		= k.k->p.offset,
1330 		.snapshot	= k.k->p.snapshot,
1331 		.seen		= *seen,
1332 	};
1333 
1334 	n.seen.ids.data = kmemdup(seen->ids.data,
1335 			      sizeof(seen->ids.data[0]) * seen->ids.size,
1336 			      GFP_KERNEL);
1337 	if (!n.seen.ids.data)
1338 		return -BCH_ERR_ENOMEM_fsck_extent_ends_at;
1339 
1340 	__darray_for_each(extent_ends->e, i) {
1341 		if (i->snapshot == k.k->p.snapshot) {
1342 			snapshots_seen_exit(&i->seen);
1343 			*i = n;
1344 			return 0;
1345 		}
1346 
1347 		if (i->snapshot >= k.k->p.snapshot)
1348 			break;
1349 	}
1350 
1351 	return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
1352 }
1353 
1354 static int overlapping_extents_found(struct btree_trans *trans,
1355 				     enum btree_id btree,
1356 				     struct bpos pos1, struct snapshots_seen *pos1_seen,
1357 				     struct bkey pos2,
1358 				     bool *fixed,
1359 				     struct extent_end *extent_end)
1360 {
1361 	struct bch_fs *c = trans->c;
1362 	struct printbuf buf = PRINTBUF;
1363 	struct btree_iter iter1, iter2 = { NULL };
1364 	struct bkey_s_c k1, k2;
1365 	int ret;
1366 
1367 	BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));
1368 
1369 	bch2_trans_iter_init(trans, &iter1, btree, pos1,
1370 			     BTREE_ITER_all_snapshots|
1371 			     BTREE_ITER_not_extents);
1372 	k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX));
1373 	ret = bkey_err(k1);
1374 	if (ret)
1375 		goto err;
1376 
1377 	prt_str(&buf, "\n  ");
1378 	bch2_bkey_val_to_text(&buf, c, k1);
1379 
1380 	if (!bpos_eq(pos1, k1.k->p)) {
1381 		prt_str(&buf, "\n  wanted\n  ");
1382 		bch2_bpos_to_text(&buf, pos1);
1383 		prt_str(&buf, "\n  ");
1384 		bch2_bkey_to_text(&buf, &pos2);
1385 
1386 		bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
1387 			__func__, buf.buf);
1388 		ret = -BCH_ERR_internal_fsck_err;
1389 		goto err;
1390 	}
1391 
1392 	bch2_trans_copy_iter(&iter2, &iter1);
1393 
1394 	while (1) {
1395 		bch2_btree_iter_advance(&iter2);
1396 
1397 		k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX));
1398 		ret = bkey_err(k2);
1399 		if (ret)
1400 			goto err;
1401 
1402 		if (bpos_ge(k2.k->p, pos2.p))
1403 			break;
1404 	}
1405 
1406 	prt_str(&buf, "\n  ");
1407 	bch2_bkey_val_to_text(&buf, c, k2);
1408 
1409 	if (bpos_gt(k2.k->p, pos2.p) ||
1410 	    pos2.size != k2.k->size) {
1411 		bch_err(c, "%s: error finding seconding overlapping extent when repairing%s",
1412 			__func__, buf.buf);
1413 		ret = -BCH_ERR_internal_fsck_err;
1414 		goto err;
1415 	}
1416 
1417 	prt_printf(&buf, "\n  overwriting %s extent",
1418 		   pos1.snapshot >= pos2.p.snapshot ? "first" : "second");
1419 
1420 	if (fsck_err(trans, extent_overlapping,
1421 		     "overlapping extents%s", buf.buf)) {
1422 		struct btree_iter *old_iter = &iter1;
1423 		struct disk_reservation res = { 0 };
1424 
1425 		if (pos1.snapshot < pos2.p.snapshot) {
1426 			old_iter = &iter2;
1427 			swap(k1, k2);
1428 		}
1429 
1430 		trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);
1431 
1432 		ret =   bch2_trans_update_extent_overwrite(trans, old_iter,
1433 				BTREE_UPDATE_internal_snapshot_node,
1434 				k1, k2) ?:
1435 			bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
1436 		bch2_disk_reservation_put(c, &res);
1437 
1438 		if (ret)
1439 			goto err;
1440 
1441 		*fixed = true;
1442 
1443 		if (pos1.snapshot == pos2.p.snapshot) {
1444 			/*
1445 			 * We overwrote the first extent, and did the overwrite
1446 			 * in the same snapshot:
1447 			 */
1448 			extent_end->offset = bkey_start_offset(&pos2);
1449 		} else if (pos1.snapshot > pos2.p.snapshot) {
1450 			/*
1451 			 * We overwrote the first extent in pos2's snapshot:
1452 			 */
1453 			ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
1454 		} else {
1455 			/*
1456 			 * We overwrote the second extent - restart
1457 			 * check_extent() from the top:
1458 			 */
1459 			ret = -BCH_ERR_transaction_restart_nested;
1460 		}
1461 	}
1462 fsck_err:
1463 err:
1464 	bch2_trans_iter_exit(trans, &iter2);
1465 	bch2_trans_iter_exit(trans, &iter1);
1466 	printbuf_exit(&buf);
1467 	return ret;
1468 }
1469 
1470 static int check_overlapping_extents(struct btree_trans *trans,
1471 			      struct snapshots_seen *seen,
1472 			      struct extent_ends *extent_ends,
1473 			      struct bkey_s_c k,
1474 			      struct btree_iter *iter,
1475 			      bool *fixed)
1476 {
1477 	struct bch_fs *c = trans->c;
1478 	int ret = 0;
1479 
1480 	/* transaction restart, running again */
1481 	if (bpos_eq(extent_ends->last_pos, k.k->p))
1482 		return 0;
1483 
1484 	if (extent_ends->last_pos.inode != k.k->p.inode)
1485 		extent_ends_reset(extent_ends);
1486 
1487 	darray_for_each(extent_ends->e, i) {
1488 		if (i->offset <= bkey_start_offset(k.k))
1489 			continue;
1490 
1491 		if (!ref_visible2(c,
1492 				  k.k->p.snapshot, seen,
1493 				  i->snapshot, &i->seen))
1494 			continue;
1495 
1496 		ret = overlapping_extents_found(trans, iter->btree_id,
1497 						SPOS(iter->pos.inode,
1498 						     i->offset,
1499 						     i->snapshot),
1500 						&i->seen,
1501 						*k.k, fixed, i);
1502 		if (ret)
1503 			goto err;
1504 	}
1505 
1506 	extent_ends->last_pos = k.k->p;
1507 err:
1508 	return ret;
1509 }
1510 
1511 static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
1512 				struct bkey_s_c k)
1513 {
1514 	struct bch_fs *c = trans->c;
1515 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1516 	struct bch_extent_crc_unpacked crc;
1517 	const union bch_extent_entry *i;
1518 	unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;
1519 
1520 	bkey_for_each_crc(k.k, ptrs, crc, i)
1521 		if (crc_is_encoded(crc) &&
1522 		    crc.uncompressed_size > encoded_extent_max_sectors) {
1523 			struct printbuf buf = PRINTBUF;
1524 
1525 			bch2_bkey_val_to_text(&buf, c, k);
1526 			bch_err(c, "overbig encoded extent, please report this:\n  %s", buf.buf);
1527 			printbuf_exit(&buf);
1528 		}
1529 
1530 	return 0;
1531 }
1532 
1533 static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
1534 			struct bkey_s_c k,
1535 			struct inode_walker *inode,
1536 			struct snapshots_seen *s,
1537 			struct extent_ends *extent_ends)
1538 {
1539 	struct bch_fs *c = trans->c;
1540 	struct inode_walker_entry *i;
1541 	struct printbuf buf = PRINTBUF;
1542 	int ret = 0;
1543 
1544 	ret = bch2_check_key_has_snapshot(trans, iter, k);
1545 	if (ret) {
1546 		ret = ret < 0 ? ret : 0;
1547 		goto out;
1548 	}
1549 
1550 	if (inode->last_pos.inode != k.k->p.inode) {
1551 		ret = check_i_sectors(trans, inode);
1552 		if (ret)
1553 			goto err;
1554 	}
1555 
1556 	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
1557 	if (ret)
1558 		goto err;
1559 
1560 	i = walk_inode(trans, inode, k);
1561 	ret = PTR_ERR_OR_ZERO(i);
1562 	if (ret)
1563 		goto err;
1564 
1565 	ret = check_key_has_inode(trans, iter, inode, i, k);
1566 	if (ret)
1567 		goto err;
1568 
1569 	if (k.k->type != KEY_TYPE_whiteout) {
1570 		ret = check_overlapping_extents(trans, s, extent_ends, k, iter,
1571 						&inode->recalculate_sums);
1572 		if (ret)
1573 			goto err;
1574 	}
1575 
1576 	/*
1577 	 * Check inodes in reverse order, from oldest snapshots to newest,
1578 	 * starting from the inode that matches this extent's snapshot. If we
1579 	 * didn't have one, iterate over all inodes:
1580 	 */
1581 	if (!i)
1582 		i = &darray_last(inode->inodes);
1583 
1584 	for (;
1585 	     inode->inodes.data && i >= inode->inodes.data;
1586 	     --i) {
1587 		if (i->snapshot > k.k->p.snapshot ||
1588 		    !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
1589 			continue;
1590 
1591 		if (k.k->type != KEY_TYPE_whiteout) {
1592 			if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) &&
1593 					k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
1594 					!bkey_extent_is_reservation(k),
1595 					trans, extent_past_end_of_inode,
1596 					"extent type past end of inode %llu:%u, i_size %llu\n  %s",
1597 					i->inode.bi_inum, i->snapshot, i->inode.bi_size,
1598 					(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1599 				struct btree_iter iter2;
1600 
1601 				bch2_trans_copy_iter(&iter2, iter);
1602 				bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
1603 				ret =   bch2_btree_iter_traverse(&iter2) ?:
1604 					bch2_btree_delete_at(trans, &iter2,
1605 						BTREE_UPDATE_internal_snapshot_node);
1606 				bch2_trans_iter_exit(trans, &iter2);
1607 				if (ret)
1608 					goto err;
1609 
1610 				iter->k.type = KEY_TYPE_whiteout;
1611 			}
1612 
1613 			if (bkey_extent_is_allocation(k.k))
1614 				i->count += k.k->size;
1615 		}
1616 
1617 		i->seen_this_pos = true;
1618 	}
1619 
1620 	if (k.k->type != KEY_TYPE_whiteout) {
1621 		ret = extent_ends_at(c, extent_ends, s, k);
1622 		if (ret)
1623 			goto err;
1624 	}
1625 out:
1626 err:
1627 fsck_err:
1628 	printbuf_exit(&buf);
1629 	bch_err_fn(c, ret);
1630 	return ret;
1631 }
1632 
1633 /*
1634  * Walk extents: verify that extents have a corresponding S_ISREG inode, and
1635  * that i_size an i_sectors are consistent
1636  */
1637 int bch2_check_extents(struct bch_fs *c)
1638 {
1639 	struct inode_walker w = inode_walker_init();
1640 	struct snapshots_seen s;
1641 	struct extent_ends extent_ends;
1642 	struct disk_reservation res = { 0 };
1643 
1644 	snapshots_seen_init(&s);
1645 	extent_ends_init(&extent_ends);
1646 
1647 	int ret = bch2_trans_run(c,
1648 		for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
1649 				POS(BCACHEFS_ROOT_INO, 0),
1650 				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
1651 				&res, NULL,
1652 				BCH_TRANS_COMMIT_no_enospc, ({
1653 			bch2_disk_reservation_put(c, &res);
1654 			check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
1655 			check_extent_overbig(trans, &iter, k);
1656 		})) ?:
1657 		check_i_sectors_notnested(trans, &w));
1658 
1659 	bch2_disk_reservation_put(c, &res);
1660 	extent_ends_exit(&extent_ends);
1661 	inode_walker_exit(&w);
1662 	snapshots_seen_exit(&s);
1663 
1664 	bch_err_fn(c, ret);
1665 	return ret;
1666 }
1667 
1668 int bch2_check_indirect_extents(struct bch_fs *c)
1669 {
1670 	struct disk_reservation res = { 0 };
1671 
1672 	int ret = bch2_trans_run(c,
1673 		for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
1674 				POS_MIN,
1675 				BTREE_ITER_prefetch, k,
1676 				&res, NULL,
1677 				BCH_TRANS_COMMIT_no_enospc, ({
1678 			bch2_disk_reservation_put(c, &res);
1679 			check_extent_overbig(trans, &iter, k);
1680 		})));
1681 
1682 	bch2_disk_reservation_put(c, &res);
1683 	bch_err_fn(c, ret);
1684 	return ret;
1685 }
1686 
1687 static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w)
1688 {
1689 	struct bch_fs *c = trans->c;
1690 	int ret = 0;
1691 	s64 count2;
1692 
1693 	darray_for_each(w->inodes, i) {
1694 		if (i->inode.bi_nlink == i->count)
1695 			continue;
1696 
1697 		count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot);
1698 		if (count2 < 0)
1699 			return count2;
1700 
1701 		if (i->count != count2) {
1702 			bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
1703 					    w->last_pos.inode, i->snapshot, i->count, count2);
1704 			i->count = count2;
1705 			if (i->inode.bi_nlink == i->count)
1706 				continue;
1707 		}
1708 
1709 		if (fsck_err_on(i->inode.bi_nlink != i->count,
1710 				trans, inode_dir_wrong_nlink,
1711 				"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
1712 				w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
1713 			i->inode.bi_nlink = i->count;
1714 			ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
1715 			if (ret)
1716 				break;
1717 		}
1718 	}
1719 fsck_err:
1720 	bch_err_fn(c, ret);
1721 	return ret;
1722 }
1723 
1724 static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
1725 {
1726 	u32 restart_count = trans->restart_count;
1727 	return check_subdir_count_notnested(trans, w) ?:
1728 		trans_was_restarted(trans, restart_count);
1729 }
1730 
1731 noinline_for_stack
1732 static int check_dirent_inode_dirent(struct btree_trans *trans,
1733 				   struct btree_iter *iter,
1734 				   struct bkey_s_c_dirent d,
1735 				   struct bch_inode_unpacked *target,
1736 				   u32 target_snapshot)
1737 {
1738 	struct bch_fs *c = trans->c;
1739 	struct printbuf buf = PRINTBUF;
1740 	int ret = 0;
1741 
1742 	if (inode_points_to_dirent(target, d))
1743 		return 0;
1744 
1745 	if (bch2_inode_should_have_bp(target) &&
1746 	    !fsck_err(trans, inode_wrong_backpointer,
1747 		      "dirent points to inode that does not point back:\n  %s",
1748 		      (bch2_bkey_val_to_text(&buf, c, d.s_c),
1749 		       prt_printf(&buf, "\n  "),
1750 		       bch2_inode_unpacked_to_text(&buf, target),
1751 		       buf.buf)))
1752 		goto out_noiter;
1753 
1754 	if (!target->bi_dir &&
1755 	    !target->bi_dir_offset) {
1756 		target->bi_dir		= d.k->p.inode;
1757 		target->bi_dir_offset	= d.k->p.offset;
1758 		return __bch2_fsck_write_inode(trans, target, target_snapshot);
1759 	}
1760 
1761 	struct btree_iter bp_iter = { NULL };
1762 	struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
1763 			      SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
1764 	ret = bkey_err(bp_dirent);
1765 	if (ret && !bch2_err_matches(ret, ENOENT))
1766 		goto err;
1767 
1768 	bool backpointer_exists = !ret;
1769 	ret = 0;
1770 
1771 	if (fsck_err_on(!backpointer_exists,
1772 			trans, inode_wrong_backpointer,
1773 			"inode %llu:%u has wrong backpointer:\n"
1774 			"got       %llu:%llu\n"
1775 			"should be %llu:%llu",
1776 			target->bi_inum, target_snapshot,
1777 			target->bi_dir,
1778 			target->bi_dir_offset,
1779 			d.k->p.inode,
1780 			d.k->p.offset)) {
1781 		target->bi_dir		= d.k->p.inode;
1782 		target->bi_dir_offset	= d.k->p.offset;
1783 		ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
1784 		goto out;
1785 	}
1786 
1787 	bch2_bkey_val_to_text(&buf, c, d.s_c);
1788 	prt_newline(&buf);
1789 	if (backpointer_exists)
1790 		bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
1791 
1792 	if (fsck_err_on(backpointer_exists &&
1793 			(S_ISDIR(target->bi_mode) ||
1794 			 target->bi_subvol),
1795 			trans, inode_dir_multiple_links,
1796 			"%s %llu:%u with multiple links\n%s",
1797 			S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
1798 			target->bi_inum, target_snapshot, buf.buf)) {
1799 		ret = __remove_dirent(trans, d.k->p);
1800 		goto out;
1801 	}
1802 
1803 	/*
1804 	 * hardlinked file with nlink 0:
1805 	 * We're just adjusting nlink here so check_nlinks() will pick
1806 	 * it up, it ignores inodes with nlink 0
1807 	 */
1808 	if (fsck_err_on(backpointer_exists && !target->bi_nlink,
1809 			trans, inode_multiple_links_but_nlink_0,
1810 			"inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
1811 			target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
1812 		target->bi_nlink++;
1813 		target->bi_flags &= ~BCH_INODE_unlinked;
1814 		ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
1815 		if (ret)
1816 			goto err;
1817 	}
1818 out:
1819 err:
1820 fsck_err:
1821 	bch2_trans_iter_exit(trans, &bp_iter);
1822 out_noiter:
1823 	printbuf_exit(&buf);
1824 	bch_err_fn(c, ret);
1825 	return ret;
1826 }
1827 
1828 noinline_for_stack
1829 static int check_dirent_target(struct btree_trans *trans,
1830 			       struct btree_iter *iter,
1831 			       struct bkey_s_c_dirent d,
1832 			       struct bch_inode_unpacked *target,
1833 			       u32 target_snapshot)
1834 {
1835 	struct bch_fs *c = trans->c;
1836 	struct bkey_i_dirent *n;
1837 	struct printbuf buf = PRINTBUF;
1838 	int ret = 0;
1839 
1840 	ret = check_dirent_inode_dirent(trans, iter, d, target, target_snapshot);
1841 	if (ret)
1842 		goto err;
1843 
1844 	if (fsck_err_on(d.v->d_type != inode_d_type(target),
1845 			trans, dirent_d_type_wrong,
1846 			"incorrect d_type: got %s, should be %s:\n%s",
1847 			bch2_d_type_str(d.v->d_type),
1848 			bch2_d_type_str(inode_d_type(target)),
1849 			(printbuf_reset(&buf),
1850 			 bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
1851 		n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
1852 		ret = PTR_ERR_OR_ZERO(n);
1853 		if (ret)
1854 			goto err;
1855 
1856 		bkey_reassemble(&n->k_i, d.s_c);
1857 		n->v.d_type = inode_d_type(target);
1858 		if (n->v.d_type == DT_SUBVOL) {
1859 			n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
1860 			n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
1861 		} else {
1862 			n->v.d_inum = cpu_to_le64(target->bi_inum);
1863 		}
1864 
1865 		ret = bch2_trans_update(trans, iter, &n->k_i, 0);
1866 		if (ret)
1867 			goto err;
1868 
1869 		d = dirent_i_to_s_c(n);
1870 	}
1871 err:
1872 fsck_err:
1873 	printbuf_exit(&buf);
1874 	bch_err_fn(c, ret);
1875 	return ret;
1876 }
1877 
1878 /* find a subvolume that's a descendent of @snapshot: */
1879 static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
1880 {
1881 	struct btree_iter iter;
1882 	struct bkey_s_c k;
1883 	int ret;
1884 
1885 	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) {
1886 		if (k.k->type != KEY_TYPE_subvolume)
1887 			continue;
1888 
1889 		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
1890 		if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) {
1891 			bch2_trans_iter_exit(trans, &iter);
1892 			*subvolid = k.k->p.offset;
1893 			goto found;
1894 		}
1895 	}
1896 	if (!ret)
1897 		ret = -ENOENT;
1898 found:
1899 	bch2_trans_iter_exit(trans, &iter);
1900 	return ret;
1901 }
1902 
1903 noinline_for_stack
1904 static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
1905 				  struct bkey_s_c_dirent d)
1906 {
1907 	struct bch_fs *c = trans->c;
1908 	struct btree_iter subvol_iter = {};
1909 	struct bch_inode_unpacked subvol_root;
1910 	u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
1911 	u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
1912 	u32 parent_snapshot;
1913 	u32 new_parent_subvol = 0;
1914 	u64 parent_inum;
1915 	struct printbuf buf = PRINTBUF;
1916 	int ret = 0;
1917 
1918 	ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum);
1919 	if (ret && !bch2_err_matches(ret, ENOENT))
1920 		return ret;
1921 
1922 	if (ret ||
1923 	    (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) {
1924 		int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
1925 		if (ret2 && !bch2_err_matches(ret, ENOENT))
1926 			return ret2;
1927 	}
1928 
1929 	if (ret &&
1930 	    !new_parent_subvol &&
1931 	    (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
1932 		/*
1933 		 * Couldn't find a subvol for dirent's snapshot - but we lost
1934 		 * subvols, so we need to reconstruct:
1935 		 */
1936 		ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0);
1937 		if (ret)
1938 			return ret;
1939 
1940 		parent_snapshot = d.k->p.snapshot;
1941 	}
1942 
1943 	if (fsck_err_on(ret,
1944 			trans, dirent_to_missing_parent_subvol,
1945 			"dirent parent_subvol points to missing subvolume\n%s",
1946 			(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
1947 	    fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
1948 			trans, dirent_not_visible_in_parent_subvol,
1949 			"dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
1950 			parent_snapshot,
1951 			(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
1952 		if (!new_parent_subvol) {
1953 			bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
1954 			return -BCH_ERR_fsck_repair_unimplemented;
1955 		}
1956 
1957 		struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
1958 		ret = PTR_ERR_OR_ZERO(new_dirent);
1959 		if (ret)
1960 			goto err;
1961 
1962 		new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol);
1963 	}
1964 
1965 	struct bkey_s_c_subvolume s =
1966 		bch2_bkey_get_iter_typed(trans, &subvol_iter,
1967 					 BTREE_ID_subvolumes, POS(0, target_subvol),
1968 					 0, subvolume);
1969 	ret = bkey_err(s.s_c);
1970 	if (ret && !bch2_err_matches(ret, ENOENT))
1971 		return ret;
1972 
1973 	if (ret) {
1974 		if (fsck_err(trans, dirent_to_missing_subvol,
1975 			     "dirent points to missing subvolume\n%s",
1976 			     (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
1977 			return __remove_dirent(trans, d.k->p);
1978 		ret = 0;
1979 		goto out;
1980 	}
1981 
1982 	if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol,
1983 			trans, subvol_fs_path_parent_wrong,
1984 			"subvol with wrong fs_path_parent, should be be %u\n%s",
1985 			parent_subvol,
1986 			(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
1987 		struct bkey_i_subvolume *n =
1988 			bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
1989 		ret = PTR_ERR_OR_ZERO(n);
1990 		if (ret)
1991 			goto err;
1992 
1993 		n->v.fs_path_parent = cpu_to_le32(parent_subvol);
1994 	}
1995 
1996 	u64 target_inum = le64_to_cpu(s.v->inode);
1997 	u32 target_snapshot = le32_to_cpu(s.v->snapshot);
1998 
1999 	ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot);
2000 	if (ret && !bch2_err_matches(ret, ENOENT))
2001 		goto err;
2002 
2003 	if (ret) {
2004 		bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
2005 		ret = -BCH_ERR_fsck_repair_unimplemented;
2006 		ret = 0;
2007 		goto err;
2008 	}
2009 
2010 	if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol,
2011 			trans, inode_bi_parent_wrong,
2012 			"subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
2013 			target_inum,
2014 			subvol_root.bi_parent_subvol, parent_subvol)) {
2015 		subvol_root.bi_parent_subvol = parent_subvol;
2016 		ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot);
2017 		if (ret)
2018 			goto err;
2019 	}
2020 
2021 	ret = check_dirent_target(trans, iter, d, &subvol_root,
2022 				  target_snapshot);
2023 	if (ret)
2024 		goto err;
2025 out:
2026 err:
2027 fsck_err:
2028 	bch2_trans_iter_exit(trans, &subvol_iter);
2029 	printbuf_exit(&buf);
2030 	return ret;
2031 }
2032 
2033 static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
2034 			struct bkey_s_c k,
2035 			struct bch_hash_info *hash_info,
2036 			struct inode_walker *dir,
2037 			struct inode_walker *target,
2038 			struct snapshots_seen *s)
2039 {
2040 	struct bch_fs *c = trans->c;
2041 	struct inode_walker_entry *i;
2042 	struct printbuf buf = PRINTBUF;
2043 	int ret = 0;
2044 
2045 	ret = bch2_check_key_has_snapshot(trans, iter, k);
2046 	if (ret) {
2047 		ret = ret < 0 ? ret : 0;
2048 		goto out;
2049 	}
2050 
2051 	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
2052 	if (ret)
2053 		goto err;
2054 
2055 	if (k.k->type == KEY_TYPE_whiteout)
2056 		goto out;
2057 
2058 	if (dir->last_pos.inode != k.k->p.inode) {
2059 		ret = check_subdir_count(trans, dir);
2060 		if (ret)
2061 			goto err;
2062 	}
2063 
2064 	i = walk_inode(trans, dir, k);
2065 	ret = PTR_ERR_OR_ZERO(i);
2066 	if (ret < 0)
2067 		goto err;
2068 
2069 	ret = check_key_has_inode(trans, iter, dir, i, k);
2070 	if (ret)
2071 		goto err;
2072 
2073 	if (!i)
2074 		goto out;
2075 
2076 	if (dir->first_this_inode)
2077 		*hash_info = bch2_hash_info_init(c, &i->inode);
2078 	dir->first_this_inode = false;
2079 
2080 	ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k);
2081 	if (ret < 0)
2082 		goto err;
2083 	if (ret) {
2084 		/* dirent has been deleted */
2085 		ret = 0;
2086 		goto out;
2087 	}
2088 
2089 	if (k.k->type != KEY_TYPE_dirent)
2090 		goto out;
2091 
2092 	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
2093 
2094 	if (d.v->d_type == DT_SUBVOL) {
2095 		ret = check_dirent_to_subvol(trans, iter, d);
2096 		if (ret)
2097 			goto err;
2098 	} else {
2099 		ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
2100 		if (ret)
2101 			goto err;
2102 
2103 		if (fsck_err_on(!target->inodes.nr,
2104 				trans, dirent_to_missing_inode,
2105 				"dirent points to missing inode:\n%s",
2106 				(printbuf_reset(&buf),
2107 				 bch2_bkey_val_to_text(&buf, c, k),
2108 				 buf.buf))) {
2109 			ret = __remove_dirent(trans, d.k->p);
2110 			if (ret)
2111 				goto err;
2112 		}
2113 
2114 		darray_for_each(target->inodes, i) {
2115 			ret = check_dirent_target(trans, iter, d,
2116 						  &i->inode, i->snapshot);
2117 			if (ret)
2118 				goto err;
2119 		}
2120 
2121 		if (d.v->d_type == DT_DIR)
2122 			for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
2123 				i->count++;
2124 	}
2125 out:
2126 err:
2127 fsck_err:
2128 	printbuf_exit(&buf);
2129 	bch_err_fn(c, ret);
2130 	return ret;
2131 }
2132 
2133 /*
2134  * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
2135  * validate d_type
2136  */
2137 int bch2_check_dirents(struct bch_fs *c)
2138 {
2139 	struct inode_walker dir = inode_walker_init();
2140 	struct inode_walker target = inode_walker_init();
2141 	struct snapshots_seen s;
2142 	struct bch_hash_info hash_info;
2143 
2144 	snapshots_seen_init(&s);
2145 
2146 	int ret = bch2_trans_run(c,
2147 		for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
2148 				POS(BCACHEFS_ROOT_INO, 0),
2149 				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
2150 				k,
2151 				NULL, NULL,
2152 				BCH_TRANS_COMMIT_no_enospc,
2153 			check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
2154 		check_subdir_count_notnested(trans, &dir));
2155 
2156 	snapshots_seen_exit(&s);
2157 	inode_walker_exit(&dir);
2158 	inode_walker_exit(&target);
2159 	bch_err_fn(c, ret);
2160 	return ret;
2161 }
2162 
2163 static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
2164 		       struct bkey_s_c k,
2165 		       struct bch_hash_info *hash_info,
2166 		       struct inode_walker *inode)
2167 {
2168 	struct bch_fs *c = trans->c;
2169 	struct inode_walker_entry *i;
2170 	int ret;
2171 
2172 	ret = bch2_check_key_has_snapshot(trans, iter, k);
2173 	if (ret < 0)
2174 		return ret;
2175 	if (ret)
2176 		return 0;
2177 
2178 	i = walk_inode(trans, inode, k);
2179 	ret = PTR_ERR_OR_ZERO(i);
2180 	if (ret)
2181 		return ret;
2182 
2183 	ret = check_key_has_inode(trans, iter, inode, i, k);
2184 	if (ret)
2185 		return ret;
2186 
2187 	if (!i)
2188 		return 0;
2189 
2190 	if (inode->first_this_inode)
2191 		*hash_info = bch2_hash_info_init(c, &i->inode);
2192 	inode->first_this_inode = false;
2193 
2194 	ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
2195 	bch_err_fn(c, ret);
2196 	return ret;
2197 }
2198 
2199 /*
2200  * Walk xattrs: verify that they all have a corresponding inode
2201  */
2202 int bch2_check_xattrs(struct bch_fs *c)
2203 {
2204 	struct inode_walker inode = inode_walker_init();
2205 	struct bch_hash_info hash_info;
2206 	int ret = 0;
2207 
2208 	ret = bch2_trans_run(c,
2209 		for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
2210 			POS(BCACHEFS_ROOT_INO, 0),
2211 			BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
2212 			k,
2213 			NULL, NULL,
2214 			BCH_TRANS_COMMIT_no_enospc,
2215 		check_xattr(trans, &iter, k, &hash_info, &inode)));
2216 	bch_err_fn(c, ret);
2217 	return ret;
2218 }
2219 
2220 static int check_root_trans(struct btree_trans *trans)
2221 {
2222 	struct bch_fs *c = trans->c;
2223 	struct bch_inode_unpacked root_inode;
2224 	u32 snapshot;
2225 	u64 inum;
2226 	int ret;
2227 
2228 	ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
2229 	if (ret && !bch2_err_matches(ret, ENOENT))
2230 		return ret;
2231 
2232 	if (mustfix_fsck_err_on(ret, trans, root_subvol_missing,
2233 				"root subvol missing")) {
2234 		struct bkey_i_subvolume *root_subvol =
2235 			bch2_trans_kmalloc(trans, sizeof(*root_subvol));
2236 		ret = PTR_ERR_OR_ZERO(root_subvol);
2237 		if (ret)
2238 			goto err;
2239 
2240 		snapshot	= U32_MAX;
2241 		inum		= BCACHEFS_ROOT_INO;
2242 
2243 		bkey_subvolume_init(&root_subvol->k_i);
2244 		root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL;
2245 		root_subvol->v.flags	= 0;
2246 		root_subvol->v.snapshot	= cpu_to_le32(snapshot);
2247 		root_subvol->v.inode	= cpu_to_le64(inum);
2248 		ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0);
2249 		bch_err_msg(c, ret, "writing root subvol");
2250 		if (ret)
2251 			goto err;
2252 	}
2253 
2254 	ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
2255 	if (ret && !bch2_err_matches(ret, ENOENT))
2256 		return ret;
2257 
2258 	if (mustfix_fsck_err_on(ret,
2259 				trans, root_dir_missing,
2260 				"root directory missing") ||
2261 	    mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
2262 				trans, root_inode_not_dir,
2263 				"root inode not a directory")) {
2264 		bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
2265 				0, NULL);
2266 		root_inode.bi_inum = inum;
2267 
2268 		ret = __bch2_fsck_write_inode(trans, &root_inode, snapshot);
2269 		bch_err_msg(c, ret, "writing root inode");
2270 	}
2271 err:
2272 fsck_err:
2273 	return ret;
2274 }
2275 
2276 /* Get root directory, create if it doesn't exist: */
2277 int bch2_check_root(struct bch_fs *c)
2278 {
2279 	int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2280 		check_root_trans(trans));
2281 	bch_err_fn(c, ret);
2282 	return ret;
2283 }
2284 
2285 typedef DARRAY(u32) darray_u32;
2286 
2287 static bool darray_u32_has(darray_u32 *d, u32 v)
2288 {
2289 	darray_for_each(*d, i)
2290 		if (*i == v)
2291 			return true;
2292 	return false;
2293 }
2294 
2295 /*
2296  * We've checked that inode backpointers point to valid dirents; here, it's
2297  * sufficient to check that the subvolume root has a dirent:
2298  */
2299 static int subvol_has_dirent(struct btree_trans *trans, struct bkey_s_c_subvolume s)
2300 {
2301 	struct bch_inode_unpacked inode;
2302 	int ret = bch2_inode_find_by_inum_trans(trans,
2303 				(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
2304 				&inode);
2305 	if (ret)
2306 		return ret;
2307 
2308 	return inode.bi_dir != 0;
2309 }
2310 
2311 static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
2312 {
2313 	struct bch_fs *c = trans->c;
2314 	struct btree_iter parent_iter = {};
2315 	darray_u32 subvol_path = {};
2316 	struct printbuf buf = PRINTBUF;
2317 	int ret = 0;
2318 
2319 	if (k.k->type != KEY_TYPE_subvolume)
2320 		return 0;
2321 
2322 	while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) {
2323 		ret = darray_push(&subvol_path, k.k->p.offset);
2324 		if (ret)
2325 			goto err;
2326 
2327 		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
2328 
2329 		ret = subvol_has_dirent(trans, s);
2330 		if (ret < 0)
2331 			break;
2332 
2333 		if (fsck_err_on(!ret,
2334 				trans, subvol_unreachable,
2335 				"unreachable subvolume %s",
2336 				(bch2_bkey_val_to_text(&buf, c, s.s_c),
2337 				 buf.buf))) {
2338 			ret = reattach_subvol(trans, s);
2339 			break;
2340 		}
2341 
2342 		u32 parent = le32_to_cpu(s.v->fs_path_parent);
2343 
2344 		if (darray_u32_has(&subvol_path, parent)) {
2345 			if (fsck_err(c, subvol_loop, "subvolume loop"))
2346 				ret = reattach_subvol(trans, s);
2347 			break;
2348 		}
2349 
2350 		bch2_trans_iter_exit(trans, &parent_iter);
2351 		bch2_trans_iter_init(trans, &parent_iter,
2352 				     BTREE_ID_subvolumes, POS(0, parent), 0);
2353 		k = bch2_btree_iter_peek_slot(&parent_iter);
2354 		ret = bkey_err(k);
2355 		if (ret)
2356 			goto err;
2357 
2358 		if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
2359 				trans, subvol_unreachable,
2360 				"unreachable subvolume %s",
2361 				(bch2_bkey_val_to_text(&buf, c, s.s_c),
2362 				 buf.buf))) {
2363 			ret = reattach_subvol(trans, s);
2364 			break;
2365 		}
2366 	}
2367 fsck_err:
2368 err:
2369 	printbuf_exit(&buf);
2370 	darray_exit(&subvol_path);
2371 	bch2_trans_iter_exit(trans, &parent_iter);
2372 	return ret;
2373 }
2374 
2375 int bch2_check_subvolume_structure(struct bch_fs *c)
2376 {
2377 	int ret = bch2_trans_run(c,
2378 		for_each_btree_key_commit(trans, iter,
2379 				BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
2380 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2381 			check_subvol_path(trans, &iter, k)));
2382 	bch_err_fn(c, ret);
2383 	return ret;
2384 }
2385 
2386 struct pathbuf_entry {
2387 	u64	inum;
2388 	u32	snapshot;
2389 };
2390 
2391 typedef DARRAY(struct pathbuf_entry) pathbuf;
2392 
2393 static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
2394 {
2395 	darray_for_each(*p, i)
2396 		if (i->inum	== inum &&
2397 		    i->snapshot	== snapshot)
2398 			return true;
2399 	return false;
2400 }
2401 
2402 /*
2403  * Check that a given inode is reachable from its subvolume root - we already
2404  * verified subvolume connectivity:
2405  *
2406  * XXX: we should also be verifying that inodes are in the right subvolumes
2407  */
2408 static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k)
2409 {
2410 	struct bch_fs *c = trans->c;
2411 	struct btree_iter inode_iter = {};
2412 	struct bch_inode_unpacked inode;
2413 	struct printbuf buf = PRINTBUF;
2414 	u32 snapshot = inode_k.k->p.snapshot;
2415 	int ret = 0;
2416 
2417 	p->nr = 0;
2418 
2419 	BUG_ON(bch2_inode_unpack(inode_k, &inode));
2420 
2421 	while (!inode.bi_subvol) {
2422 		struct btree_iter dirent_iter;
2423 		struct bkey_s_c_dirent d;
2424 		u32 parent_snapshot = snapshot;
2425 
2426 		d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot);
2427 		ret = bkey_err(d.s_c);
2428 		if (ret && !bch2_err_matches(ret, ENOENT))
2429 			break;
2430 
2431 		if (!ret && !dirent_points_to_inode(d, &inode)) {
2432 			bch2_trans_iter_exit(trans, &dirent_iter);
2433 			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
2434 		}
2435 
2436 		if (bch2_err_matches(ret, ENOENT)) {
2437 			ret = 0;
2438 			if (fsck_err(trans, inode_unreachable,
2439 				     "unreachable inode\n%s",
2440 				     (printbuf_reset(&buf),
2441 				      bch2_bkey_val_to_text(&buf, c, inode_k),
2442 				      buf.buf)))
2443 				ret = reattach_inode(trans, &inode, snapshot);
2444 			goto out;
2445 		}
2446 
2447 		bch2_trans_iter_exit(trans, &dirent_iter);
2448 
2449 		if (!S_ISDIR(inode.bi_mode))
2450 			break;
2451 
2452 		ret = darray_push(p, ((struct pathbuf_entry) {
2453 			.inum		= inode.bi_inum,
2454 			.snapshot	= snapshot,
2455 		}));
2456 		if (ret)
2457 			return ret;
2458 
2459 		snapshot = parent_snapshot;
2460 
2461 		bch2_trans_iter_exit(trans, &inode_iter);
2462 		inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
2463 					     SPOS(0, inode.bi_dir, snapshot), 0);
2464 		ret = bkey_err(inode_k) ?:
2465 			!bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
2466 			: bch2_inode_unpack(inode_k, &inode);
2467 		if (ret) {
2468 			/* Should have been caught in dirents pass */
2469 			if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
2470 				bch_err(c, "error looking up parent directory: %i", ret);
2471 			break;
2472 		}
2473 
2474 		snapshot = inode_k.k->p.snapshot;
2475 
2476 		if (path_is_dup(p, inode.bi_inum, snapshot)) {
2477 			/* XXX print path */
2478 			bch_err(c, "directory structure loop");
2479 
2480 			darray_for_each(*p, i)
2481 				pr_err("%llu:%u", i->inum, i->snapshot);
2482 			pr_err("%llu:%u", inode.bi_inum, snapshot);
2483 
2484 			if (fsck_err(trans, dir_loop, "directory structure loop")) {
2485 				ret = remove_backpointer(trans, &inode);
2486 				bch_err_msg(c, ret, "removing dirent");
2487 				if (ret)
2488 					break;
2489 
2490 				ret = reattach_inode(trans, &inode, snapshot);
2491 				bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
2492 			}
2493 			break;
2494 		}
2495 	}
2496 out:
2497 fsck_err:
2498 	bch2_trans_iter_exit(trans, &inode_iter);
2499 	printbuf_exit(&buf);
2500 	bch_err_fn(c, ret);
2501 	return ret;
2502 }
2503 
2504 /*
2505  * Check for unreachable inodes, as well as loops in the directory structure:
2506  * After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's
2507  * unreachable:
2508  */
2509 int bch2_check_directory_structure(struct bch_fs *c)
2510 {
2511 	pathbuf path = { 0, };
2512 	int ret;
2513 
2514 	ret = bch2_trans_run(c,
2515 		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
2516 					  BTREE_ITER_intent|
2517 					  BTREE_ITER_prefetch|
2518 					  BTREE_ITER_all_snapshots, k,
2519 					  NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
2520 			if (!bkey_is_inode(k.k))
2521 				continue;
2522 
2523 			if (bch2_inode_flags(k) & BCH_INODE_unlinked)
2524 				continue;
2525 
2526 			check_path(trans, &path, k);
2527 		})));
2528 	darray_exit(&path);
2529 
2530 	bch_err_fn(c, ret);
2531 	return ret;
2532 }
2533 
2534 struct nlink_table {
2535 	size_t		nr;
2536 	size_t		size;
2537 
2538 	struct nlink {
2539 		u64	inum;
2540 		u32	snapshot;
2541 		u32	count;
2542 	}		*d;
2543 };
2544 
2545 static int add_nlink(struct bch_fs *c, struct nlink_table *t,
2546 		     u64 inum, u32 snapshot)
2547 {
2548 	if (t->nr == t->size) {
2549 		size_t new_size = max_t(size_t, 128UL, t->size * 2);
2550 		void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);
2551 
2552 		if (!d) {
2553 			bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
2554 				new_size);
2555 			return -BCH_ERR_ENOMEM_fsck_add_nlink;
2556 		}
2557 
2558 		if (t->d)
2559 			memcpy(d, t->d, t->size * sizeof(t->d[0]));
2560 		kvfree(t->d);
2561 
2562 		t->d = d;
2563 		t->size = new_size;
2564 	}
2565 
2566 
2567 	t->d[t->nr++] = (struct nlink) {
2568 		.inum		= inum,
2569 		.snapshot	= snapshot,
2570 	};
2571 
2572 	return 0;
2573 }
2574 
2575 static int nlink_cmp(const void *_l, const void *_r)
2576 {
2577 	const struct nlink *l = _l;
2578 	const struct nlink *r = _r;
2579 
2580 	return cmp_int(l->inum, r->inum);
2581 }
2582 
2583 static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
2584 		     struct nlink_table *links,
2585 		     u64 range_start, u64 range_end, u64 inum, u32 snapshot)
2586 {
2587 	struct nlink *link, key = {
2588 		.inum = inum, .snapshot = U32_MAX,
2589 	};
2590 
2591 	if (inum < range_start || inum >= range_end)
2592 		return;
2593 
2594 	link = __inline_bsearch(&key, links->d, links->nr,
2595 				sizeof(links->d[0]), nlink_cmp);
2596 	if (!link)
2597 		return;
2598 
2599 	while (link > links->d && link[0].inum == link[-1].inum)
2600 		--link;
2601 
2602 	for (; link < links->d + links->nr && link->inum == inum; link++)
2603 		if (ref_visible(c, s, snapshot, link->snapshot)) {
2604 			link->count++;
2605 			if (link->snapshot >= snapshot)
2606 				break;
2607 		}
2608 }
2609 
2610 noinline_for_stack
2611 static int check_nlinks_find_hardlinks(struct bch_fs *c,
2612 				       struct nlink_table *t,
2613 				       u64 start, u64 *end)
2614 {
2615 	int ret = bch2_trans_run(c,
2616 		for_each_btree_key(trans, iter, BTREE_ID_inodes,
2617 				   POS(0, start),
2618 				   BTREE_ITER_intent|
2619 				   BTREE_ITER_prefetch|
2620 				   BTREE_ITER_all_snapshots, k, ({
2621 			if (!bkey_is_inode(k.k))
2622 				continue;
2623 
2624 			/* Should never fail, checked by bch2_inode_invalid: */
2625 			struct bch_inode_unpacked u;
2626 			BUG_ON(bch2_inode_unpack(k, &u));
2627 
2628 			/*
2629 			 * Backpointer and directory structure checks are sufficient for
2630 			 * directories, since they can't have hardlinks:
2631 			 */
2632 			if (S_ISDIR(u.bi_mode))
2633 				continue;
2634 
2635 			if (!u.bi_nlink)
2636 				continue;
2637 
2638 			ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
2639 			if (ret) {
2640 				*end = k.k->p.offset;
2641 				ret = 0;
2642 				break;
2643 			}
2644 			0;
2645 		})));
2646 
2647 	bch_err_fn(c, ret);
2648 	return ret;
2649 }
2650 
2651 noinline_for_stack
2652 static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
2653 				     u64 range_start, u64 range_end)
2654 {
2655 	struct snapshots_seen s;
2656 
2657 	snapshots_seen_init(&s);
2658 
2659 	int ret = bch2_trans_run(c,
2660 		for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
2661 				   BTREE_ITER_intent|
2662 				   BTREE_ITER_prefetch|
2663 				   BTREE_ITER_all_snapshots, k, ({
2664 			ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
2665 			if (ret)
2666 				break;
2667 
2668 			if (k.k->type == KEY_TYPE_dirent) {
2669 				struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
2670 
2671 				if (d.v->d_type != DT_DIR &&
2672 				    d.v->d_type != DT_SUBVOL)
2673 					inc_link(c, &s, links, range_start, range_end,
2674 						 le64_to_cpu(d.v->d_inum), d.k->p.snapshot);
2675 			}
2676 			0;
2677 		})));
2678 
2679 	snapshots_seen_exit(&s);
2680 
2681 	bch_err_fn(c, ret);
2682 	return ret;
2683 }
2684 
2685 static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
2686 				     struct bkey_s_c k,
2687 				     struct nlink_table *links,
2688 				     size_t *idx, u64 range_end)
2689 {
2690 	struct bch_inode_unpacked u;
2691 	struct nlink *link = &links->d[*idx];
2692 	int ret = 0;
2693 
2694 	if (k.k->p.offset >= range_end)
2695 		return 1;
2696 
2697 	if (!bkey_is_inode(k.k))
2698 		return 0;
2699 
2700 	BUG_ON(bch2_inode_unpack(k, &u));
2701 
2702 	if (S_ISDIR(u.bi_mode))
2703 		return 0;
2704 
2705 	if (!u.bi_nlink)
2706 		return 0;
2707 
2708 	while ((cmp_int(link->inum, k.k->p.offset) ?:
2709 		cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
2710 		BUG_ON(*idx == links->nr);
2711 		link = &links->d[++*idx];
2712 	}
2713 
2714 	if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
2715 			trans, inode_wrong_nlink,
2716 			"inode %llu type %s has wrong i_nlink (%u, should be %u)",
2717 			u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
2718 			bch2_inode_nlink_get(&u), link->count)) {
2719 		bch2_inode_nlink_set(&u, link->count);
2720 		ret = __bch2_fsck_write_inode(trans, &u, k.k->p.snapshot);
2721 	}
2722 fsck_err:
2723 	return ret;
2724 }
2725 
2726 noinline_for_stack
2727 static int check_nlinks_update_hardlinks(struct bch_fs *c,
2728 			       struct nlink_table *links,
2729 			       u64 range_start, u64 range_end)
2730 {
2731 	size_t idx = 0;
2732 
2733 	int ret = bch2_trans_run(c,
2734 		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
2735 				POS(0, range_start),
2736 				BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
2737 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2738 			check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
2739 	if (ret < 0) {
2740 		bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
2741 		return ret;
2742 	}
2743 
2744 	return 0;
2745 }
2746 
2747 int bch2_check_nlinks(struct bch_fs *c)
2748 {
2749 	struct nlink_table links = { 0 };
2750 	u64 this_iter_range_start, next_iter_range_start = 0;
2751 	int ret = 0;
2752 
2753 	do {
2754 		this_iter_range_start = next_iter_range_start;
2755 		next_iter_range_start = U64_MAX;
2756 
2757 		ret = check_nlinks_find_hardlinks(c, &links,
2758 						  this_iter_range_start,
2759 						  &next_iter_range_start);
2760 
2761 		ret = check_nlinks_walk_dirents(c, &links,
2762 					  this_iter_range_start,
2763 					  next_iter_range_start);
2764 		if (ret)
2765 			break;
2766 
2767 		ret = check_nlinks_update_hardlinks(c, &links,
2768 					 this_iter_range_start,
2769 					 next_iter_range_start);
2770 		if (ret)
2771 			break;
2772 
2773 		links.nr = 0;
2774 	} while (next_iter_range_start != U64_MAX);
2775 
2776 	kvfree(links.d);
2777 	bch_err_fn(c, ret);
2778 	return ret;
2779 }
2780 
2781 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
2782 			     struct bkey_s_c k)
2783 {
2784 	struct bkey_s_c_reflink_p p;
2785 	struct bkey_i_reflink_p *u;
2786 
2787 	if (k.k->type != KEY_TYPE_reflink_p)
2788 		return 0;
2789 
2790 	p = bkey_s_c_to_reflink_p(k);
2791 
2792 	if (!p.v->front_pad && !p.v->back_pad)
2793 		return 0;
2794 
2795 	u = bch2_trans_kmalloc(trans, sizeof(*u));
2796 	int ret = PTR_ERR_OR_ZERO(u);
2797 	if (ret)
2798 		return ret;
2799 
2800 	bkey_reassemble(&u->k_i, k);
2801 	u->v.front_pad	= 0;
2802 	u->v.back_pad	= 0;
2803 
2804 	return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun);
2805 }
2806 
2807 int bch2_fix_reflink_p(struct bch_fs *c)
2808 {
2809 	if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
2810 		return 0;
2811 
2812 	int ret = bch2_trans_run(c,
2813 		for_each_btree_key_commit(trans, iter,
2814 				BTREE_ID_extents, POS_MIN,
2815 				BTREE_ITER_intent|BTREE_ITER_prefetch|
2816 				BTREE_ITER_all_snapshots, k,
2817 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2818 			fix_reflink_p_key(trans, &iter, k)));
2819 	bch_err_fn(c, ret);
2820 	return ret;
2821 }
2822