xref: /linux/fs/bcachefs/fsck.c (revision c13d526d9dc1aa0c4962b017c881c28c1e23ca26)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_cache.h"
6 #include "btree_update.h"
7 #include "buckets.h"
8 #include "darray.h"
9 #include "dirent.h"
10 #include "error.h"
11 #include "fs-common.h"
12 #include "fsck.h"
13 #include "inode.h"
14 #include "keylist.h"
15 #include "recovery_passes.h"
16 #include "snapshot.h"
17 #include "super.h"
18 #include "xattr.h"
19 
20 #include <linux/bsearch.h>
21 #include <linux/dcache.h> /* struct qstr */
22 
23 /*
24  * XXX: this handles transaction restarts without returning
25  * -BCH_ERR_transaction_restart_nested; this is not how we do things anymore:
26  */
27 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
28 				    u32 snapshot)
29 {
30 	u64 sectors = 0;
31 
32 	int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
33 				SPOS(inum, 0, snapshot),
34 				POS(inum, U64_MAX),
35 				0, k, ({
36 		if (bkey_extent_is_allocation(k.k))
37 			sectors += k.k->size;
38 		0;
39 	}));
40 
41 	return ret ?: sectors;
42 }
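/*
 * A note on the return convention here and in bch2_count_subdirs() below:
 * the loop body evaluates to 0 for every key, so the for_each macro yields
 * either a negative error or 0, and `ret ?: sectors` then returns the
 * accumulated count on success. Callers treat any negative s64 as an error -
 * see e.g. the i_sectors recount in check_inode().
 */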
43 
44 static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
45 				    u32 snapshot)
46 {
47 	u64 subdirs = 0;
48 
49 	int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_dirents,
50 				    SPOS(inum, 0, snapshot),
51 				    POS(inum, U64_MAX),
52 				    0, k, ({
53 		if (k.k->type == KEY_TYPE_dirent &&
54 		    bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
55 			subdirs++;
56 		0;
57 	}));
58 
59 	return ret ?: subdirs;
60 }
61 
62 static int subvol_lookup(struct btree_trans *trans, u32 subvol,
63 			 u32 *snapshot, u64 *inum)
64 {
65 	struct bch_subvolume s;
66 	int ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
67 
68 	*snapshot = le32_to_cpu(s.snapshot);
69 	*inum = le64_to_cpu(s.inode);
70 	return ret;
71 }
72 
73 static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
74 			      struct bch_inode_unpacked *inode)
75 {
76 	struct btree_iter iter;
77 	struct bkey_s_c k;
78 	int ret;
79 
80 	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr),
81 				     BTREE_ITER_all_snapshots, k, ret) {
82 		if (k.k->p.offset != inode_nr)
83 			break;
84 		if (!bkey_is_inode(k.k))
85 			continue;
86 		ret = bch2_inode_unpack(k, inode);
87 		goto found;
88 	}
89 	ret = -BCH_ERR_ENOENT_inode;
90 found:
91 	bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
92 	bch2_trans_iter_exit(trans, &iter);
93 	return ret;
94 }
95 
96 static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
97 			struct bch_inode_unpacked *inode,
98 			u32 *snapshot)
99 {
100 	struct btree_iter iter;
101 	struct bkey_s_c k;
102 	int ret;
103 
104 	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
105 			       SPOS(0, inode_nr, *snapshot), 0);
106 	ret = bkey_err(k);
107 	if (ret)
108 		goto err;
109 
110 	ret = bkey_is_inode(k.k)
111 		? bch2_inode_unpack(k, inode)
112 		: -BCH_ERR_ENOENT_inode;
113 	if (!ret)
114 		*snapshot = iter.pos.snapshot;
115 err:
116 	bch2_trans_iter_exit(trans, &iter);
117 	return ret;
118 }
119 
120 static int lookup_dirent_in_snapshot(struct btree_trans *trans,
121 			   struct bch_hash_info hash_info,
122 			   subvol_inum dir, struct qstr *name,
123 			   u64 *target, unsigned *type, u32 snapshot)
124 {
125 	struct btree_iter iter;
126 	struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
127 							 &hash_info, dir, name, 0, snapshot);
128 	int ret = bkey_err(k);
129 	if (ret)
130 		return ret;
131 
132 	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
133 	*target = le64_to_cpu(d.v->d_inum);
134 	*type = d.v->d_type;
135 	bch2_trans_iter_exit(trans, &iter);
136 	return 0;
137 }
138 
139 static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
140 {
141 	struct bch_fs *c = trans->c;
142 	struct btree_iter iter;
143 	struct bch_inode_unpacked dir_inode;
144 	struct bch_hash_info dir_hash_info;
145 	int ret;
146 
147 	ret = lookup_first_inode(trans, pos.inode, &dir_inode);
148 	if (ret)
149 		goto err;
150 
151 	dir_hash_info = bch2_hash_info_init(c, &dir_inode);
152 
153 	bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent);
154 
155 	ret =   bch2_btree_iter_traverse(&iter) ?:
156 		bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
157 				    &dir_hash_info, &iter,
158 				    BTREE_UPDATE_internal_snapshot_node);
159 	bch2_trans_iter_exit(trans, &iter);
160 err:
161 	bch_err_fn(c, ret);
162 	return ret;
163 }
164 
165 /* Get lost+found, create if it doesn't exist: */
166 static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
167 			    struct bch_inode_unpacked *lostfound,
168 			    u64 reattaching_inum)
169 {
170 	struct bch_fs *c = trans->c;
171 	struct qstr lostfound_str = QSTR("lost+found");
172 	u64 inum = 0;
173 	unsigned d_type = 0;
174 	int ret;
175 
176 	struct bch_snapshot_tree st;
177 	ret = bch2_snapshot_tree_lookup(trans,
178 			bch2_snapshot_tree(c, snapshot), &st);
179 	if (ret)
180 		return ret;
181 
182 	subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) };
183 
184 	struct bch_subvolume subvol;
185 	ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol),
186 				 false, 0, &subvol);
187 	bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u",
188 		    le32_to_cpu(st.master_subvol), snapshot);
189 	if (ret)
190 		return ret;
191 
192 	if (!subvol.inode) {
193 		struct btree_iter iter;
194 		struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter,
195 				BTREE_ID_subvolumes, POS(0, le32_to_cpu(st.master_subvol)),
196 				0, subvolume);
197 		ret = PTR_ERR_OR_ZERO(subvol);
198 		if (ret)
199 			return ret;
200 
201 		subvol->v.inode = cpu_to_le64(reattaching_inum);
202 		bch2_trans_iter_exit(trans, &iter);
203 	}
204 
205 	root_inum.inum = le64_to_cpu(subvol.inode);
206 
207 	struct bch_inode_unpacked root_inode;
208 	struct bch_hash_info root_hash_info;
209 	u32 root_inode_snapshot = snapshot;
210 	ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot);
211 	bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
212 		    root_inum.inum, le32_to_cpu(st.master_subvol));
213 	if (ret)
214 		return ret;
215 
216 	root_hash_info = bch2_hash_info_init(c, &root_inode);
217 
218 	ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
219 			      &lostfound_str, &inum, &d_type, snapshot);
220 	if (bch2_err_matches(ret, ENOENT))
221 		goto create_lostfound;
222 
223 	bch_err_fn(c, ret);
224 	if (ret)
225 		return ret;
226 
227 	if (d_type != DT_DIR) {
228 		bch_err(c, "error looking up lost+found: not a directory");
229 		return -BCH_ERR_ENOENT_not_directory;
230 	}
231 
232 	/*
233 	 * The bch2_check_dirents pass has already run; dangling dirents
234 	 * shouldn't exist here:
235 	 */
236 	ret = lookup_inode(trans, inum, lostfound, &snapshot);
237 	bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
238 		    inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
239 	return ret;
240 
241 create_lostfound:
242 	/*
243 	 * XXX: we could have a nicer log message here if we had a nice way to
244 	 * walk backpointers to print a path
245 	 */
246 	bch_notice(c, "creating lost+found in snapshot %u", le32_to_cpu(st.root_snapshot));
247 
248 	u64 now = bch2_current_time(c);
249 	struct btree_iter lostfound_iter = { NULL };
250 	u64 cpu = raw_smp_processor_id();
251 
252 	bch2_inode_init_early(c, lostfound);
253 	bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
254 	lostfound->bi_dir = root_inode.bi_inum;
255 
256 	root_inode.bi_nlink++;
257 
258 	ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
259 	if (ret)
260 		goto err;
261 
262 	bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot);
263 	ret = bch2_btree_iter_traverse(&lostfound_iter);
264 	if (ret)
265 		goto err;
266 
267 	ret =   bch2_dirent_create_snapshot(trans,
268 				0, root_inode.bi_inum, snapshot, &root_hash_info,
269 				mode_to_type(lostfound->bi_mode),
270 				&lostfound_str,
271 				lostfound->bi_inum,
272 				&lostfound->bi_dir_offset,
273 				STR_HASH_must_create) ?:
274 		bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
275 				       BTREE_UPDATE_internal_snapshot_node);
276 err:
277 	bch_err_msg(c, ret, "creating lost+found");
278 	bch2_trans_iter_exit(trans, &lostfound_iter);
279 	return ret;
280 }
281 
282 static int reattach_inode(struct btree_trans *trans,
283 			  struct bch_inode_unpacked *inode,
284 			  u32 inode_snapshot)
285 {
286 	struct bch_hash_info dir_hash;
287 	struct bch_inode_unpacked lostfound;
288 	char name_buf[20];
289 	struct qstr name;
290 	u64 dir_offset = 0;
291 	u32 dirent_snapshot = inode_snapshot;
292 	int ret;
293 
294 	if (inode->bi_subvol) {
295 		inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;
296 
297 		u64 root_inum;
298 		ret = subvol_lookup(trans, inode->bi_parent_subvol,
299 				    &dirent_snapshot, &root_inum);
300 		if (ret)
301 			return ret;
302 
303 		snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
304 	} else {
305 		snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
306 	}
307 
308 	ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
309 	if (ret)
310 		return ret;
311 
312 	if (S_ISDIR(inode->bi_mode)) {
313 		lostfound.bi_nlink++;
314 
315 		ret = __bch2_fsck_write_inode(trans, &lostfound, U32_MAX);
316 		if (ret)
317 			return ret;
318 	}
319 
320 	dir_hash = bch2_hash_info_init(trans->c, &lostfound);
321 
322 	name = (struct qstr) QSTR(name_buf);
323 
324 	ret = bch2_dirent_create_snapshot(trans,
325 				inode->bi_parent_subvol, lostfound.bi_inum,
326 				dirent_snapshot,
327 				&dir_hash,
328 				inode_d_type(inode),
329 				&name,
330 				inode->bi_subvol ?: inode->bi_inum,
331 				&dir_offset,
332 				STR_HASH_must_create);
333 	if (ret)
334 		return ret;
335 
336 	inode->bi_dir		= lostfound.bi_inum;
337 	inode->bi_dir_offset	= dir_offset;
338 
339 	return __bch2_fsck_write_inode(trans, inode, inode_snapshot);
340 }
341 
342 static int remove_backpointer(struct btree_trans *trans,
343 			      struct bch_inode_unpacked *inode)
344 {
345 	struct btree_iter iter;
346 	struct bkey_s_c_dirent d;
347 	int ret;
348 
349 	d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents,
350 				     POS(inode->bi_dir, inode->bi_dir_offset), 0,
351 				     dirent);
352 	ret =   bkey_err(d) ?:
353 		__remove_dirent(trans, d.k->p);
354 	bch2_trans_iter_exit(trans, &iter);
355 	return ret;
356 }
357 
358 static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s)
359 {
360 	struct bch_fs *c = trans->c;
361 
362 	struct bch_inode_unpacked inode;
363 	int ret = bch2_inode_find_by_inum_trans(trans,
364 				(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
365 				&inode);
366 	if (ret)
367 		return ret;
368 
369 	ret = remove_backpointer(trans, &inode);
370 	bch_err_msg(c, ret, "removing dirent");
371 	if (ret)
372 		return ret;
373 
374 	ret = reattach_inode(trans, &inode, le32_to_cpu(s.v->snapshot));
375 	bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
376 	return ret;
377 }
378 
379 static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
380 {
381 	struct bch_fs *c = trans->c;
382 
383 	if (!bch2_snapshot_is_leaf(c, snapshotid)) {
384 		bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
385 		return -BCH_ERR_fsck_repair_unimplemented;
386 	}
387 
388 	/*
389 	 * If inum isn't set, that means we're being called from check_dirents,
390 	 * not check_inodes - the root of this subvolume doesn't exist or we
391 	 * would have found it there:
392 	 */
393 	if (!inum) {
394 		struct btree_iter inode_iter = {};
395 		struct bch_inode_unpacked new_inode;
396 		u64 cpu = raw_smp_processor_id();
397 
398 		bch2_inode_init_early(c, &new_inode);
399 		bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);
400 
401 		new_inode.bi_subvol = subvolid;
402 
403 		int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
404 			  bch2_btree_iter_traverse(&inode_iter) ?:
405 			  bch2_inode_write(trans, &inode_iter, &new_inode);
406 		bch2_trans_iter_exit(trans, &inode_iter);
407 		if (ret)
408 			return ret;
409 
410 		inum = new_inode.bi_inum;
411 	}
412 
413 	bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);
414 
415 	struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
416 	int ret = PTR_ERR_OR_ZERO(new_subvol);
417 	if (ret)
418 		return ret;
419 
420 	bkey_subvolume_init(&new_subvol->k_i);
421 	new_subvol->k.p.offset	= subvolid;
422 	new_subvol->v.snapshot	= cpu_to_le32(snapshotid);
423 	new_subvol->v.inode	= cpu_to_le64(inum);
424 	ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
425 	if (ret)
426 		return ret;
427 
428 	struct btree_iter iter;
429 	struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
430 			BTREE_ID_snapshots, POS(0, snapshotid),
431 			0, snapshot);
432 	ret = PTR_ERR_OR_ZERO(s);
433 	bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
434 	if (ret)
435 		return ret;
436 
437 	u32 snapshot_tree = le32_to_cpu(s->v.tree);
438 
439 	s->v.subvol = cpu_to_le32(subvolid);
440 	SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
441 	bch2_trans_iter_exit(trans, &iter);
442 
443 	struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
444 			BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
445 			0, snapshot_tree);
446 	ret = PTR_ERR_OR_ZERO(st);
447 	bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
448 	if (ret)
449 		return ret;
450 
451 	if (!st->v.master_subvol)
452 		st->v.master_subvol = cpu_to_le32(subvolid);
453 
454 	bch2_trans_iter_exit(trans, &iter);
455 	return 0;
456 }
457 
458 static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum)
459 {
460 	struct bch_fs *c = trans->c;
461 	unsigned i_mode = S_IFREG;
462 	u64 i_size = 0;
463 
464 	switch (btree) {
465 	case BTREE_ID_extents: {
466 		struct btree_iter iter = {};
467 
468 		bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
469 		struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter);
470 		bch2_trans_iter_exit(trans, &iter);
471 		int ret = bkey_err(k);
472 		if (ret)
473 			return ret;
474 
475 		i_size = k.k->p.offset << 9;
476 		break;
477 	}
478 	case BTREE_ID_dirents:
479 		i_mode = S_IFDIR;
480 		break;
481 	case BTREE_ID_xattrs:
482 		break;
483 	default:
484 		BUG();
485 	}
486 
487 	struct bch_inode_unpacked new_inode;
488 	bch2_inode_init_early(c, &new_inode);
489 	bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL);
490 	new_inode.bi_size = i_size;
491 	new_inode.bi_inum = inum;
492 
493 	return __bch2_fsck_write_inode(trans, &new_inode, snapshot);
494 }
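/*
 * In the extents case above, i_size is reconstructed from the last extent:
 * peeking backwards from SPOS(inum, U64_MAX, snapshot) returns the extent
 * with the highest end position for this inode, and k.k->p.offset is that
 * end position in 512-byte sectors, hence the << 9 conversion to bytes.
 */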
495 
496 struct snapshots_seen {
497 	struct bpos			pos;
498 	snapshot_id_list		ids;
499 };
500 
501 static inline void snapshots_seen_exit(struct snapshots_seen *s)
502 {
503 	darray_exit(&s->ids);
504 }
505 
506 static inline void snapshots_seen_init(struct snapshots_seen *s)
507 {
508 	memset(s, 0, sizeof(*s));
509 }
510 
511 static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
512 {
513 	u32 *i;
514 	__darray_for_each(s->ids, i) {
515 		if (*i == id)
516 			return 0;
517 		if (*i > id)
518 			break;
519 	}
520 
521 	int ret = darray_insert_item(&s->ids, i - s->ids.data, id);
522 	if (ret)
523 		bch_err(c, "error reallocating snapshots_seen table (size %zu)",
524 			s->ids.size);
525 	return ret;
526 }
527 
528 static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
529 				 enum btree_id btree_id, struct bpos pos)
530 {
531 	if (!bkey_eq(s->pos, pos))
532 		s->ids.nr = 0;
533 	s->pos = pos;
534 
535 	return snapshot_list_add_nodup(c, &s->ids, pos.snapshot);
536 }
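/*
 * snapshots_seen tracks which snapshot IDs have already been visited at the
 * current key position: the list is reset whenever the position changes and
 * otherwise extended with pos.snapshot. Since keys at the same position are
 * iterated in ascending snapshot order, the list stays sorted - see the
 * assertions in key_visible_in_snapshot() below.
 */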
537 
538 /**
539  * key_visible_in_snapshot - returns true if @id is a descendant of @ancestor,
540  * and @ancestor hasn't been overwritten in @seen
541  *
542  * @c:		filesystem handle
543  * @seen:	list of snapshot ids already seen at current position
544  * @id:		descendant snapshot id
545  * @ancestor:	ancestor snapshot id
546  *
547  * Returns:	whether key in @ancestor snapshot is visible in @id snapshot
548  */
549 static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
550 				    u32 id, u32 ancestor)
551 {
552 	ssize_t i;
553 
554 	EBUG_ON(id > ancestor);
555 
556 	/* @ancestor should be the snapshot most recently added to @seen */
557 	EBUG_ON(ancestor != seen->pos.snapshot);
558 	EBUG_ON(ancestor != darray_last(seen->ids));
559 
560 	if (id == ancestor)
561 		return true;
562 
563 	if (!bch2_snapshot_is_ancestor(c, id, ancestor))
564 		return false;
565 
566 	/*
567 	 * We know that @id is a descendant of @ancestor; we're checking whether
568 	 * we've seen a key that overwrote @ancestor - i.e. also a descendant of
569 	 * @ancestor and with @id as a descendant.
570 	 *
571 	 * But we already know that we're scanning IDs between @id and @ancestor
572 	 * numerically, since snapshot ID lists are kept sorted, so if we find
573 	 * an id that's an ancestor of @id we're done:
574 	 */
575 
576 	for (i = seen->ids.nr - 2;
577 	     i >= 0 && seen->ids.data[i] >= id;
578 	     --i)
579 		if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]))
580 			return false;
581 
582 	return true;
583 }
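/*
 * Example: say snapshot 4 is an ancestor of 2, and 2 of 1, and keys at this
 * position were seen in snapshots 2 and then 4, so seen->ids = { 2, 4 } and
 * @ancestor == 4. Asking whether the key in snapshot 4 is visible in
 * snapshot 1: the backwards scan starts at seen->ids.data[nr - 2] == 2, and
 * since 2 is an ancestor of 1 the snapshot-4 key has been overwritten as far
 * as snapshot 1 is concerned, so we return false. Had only the snapshot-4
 * key been seen, the scan would find nothing and we'd return true.
 */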
584 
585 /**
586  * ref_visible - given a key with snapshot id @src that points to a key with
587  * snapshot id @dst, test whether there is some snapshot in which @dst is
588  * visible.
589  *
590  * @c:		filesystem handle
591  * @s:		list of snapshot IDs already seen at @src
592  * @src:	snapshot ID of src key
593  * @dst:	snapshot ID of dst key
594  * Returns:	true if there is some snapshot in which @dst is visible
595  *
596  * Assumes we're visiting @src keys in natural key order
597  */
598 static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
599 			u32 src, u32 dst)
600 {
601 	return dst <= src
602 		? key_visible_in_snapshot(c, s, dst, src)
603 		: bch2_snapshot_is_ancestor(c, src, dst);
604 }
605 
606 static int ref_visible2(struct bch_fs *c,
607 			u32 src, struct snapshots_seen *src_seen,
608 			u32 dst, struct snapshots_seen *dst_seen)
609 {
610 	if (dst > src) {
611 		swap(dst, src);
612 		swap(dst_seen, src_seen);
613 	}
614 	return key_visible_in_snapshot(c, src_seen, dst, src);
615 }
616 
617 #define for_each_visible_inode(_c, _s, _w, _snapshot, _i)				\
618 	for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr &&	\
619 	     (_i)->snapshot <= (_snapshot); _i++)					\
620 		if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
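/*
 * for_each_visible_inode() walks an inode_walker's entries (sorted by
 * snapshot, ascending) up to and including @_snapshot, yielding only those
 * entries in whose snapshot a key at @_snapshot is visible, per the
 * snapshots_seen list @_s collected at the current position.
 */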
621 
622 struct inode_walker_entry {
623 	struct bch_inode_unpacked inode;
624 	u32			snapshot;
625 	bool			seen_this_pos;
626 	u64			count;
627 };
628 
629 struct inode_walker {
630 	bool				first_this_inode;
631 	bool				recalculate_sums;
632 	struct bpos			last_pos;
633 
634 	DARRAY(struct inode_walker_entry) inodes;
635 };
636 
637 static void inode_walker_exit(struct inode_walker *w)
638 {
639 	darray_exit(&w->inodes);
640 }
641 
642 static struct inode_walker inode_walker_init(void)
643 {
644 	return (struct inode_walker) { 0, };
645 }
646 
647 static int add_inode(struct bch_fs *c, struct inode_walker *w,
648 		     struct bkey_s_c inode)
649 {
650 	struct bch_inode_unpacked u;
651 
652 	BUG_ON(bch2_inode_unpack(inode, &u));
653 
654 	return darray_push(&w->inodes, ((struct inode_walker_entry) {
655 		.inode		= u,
656 		.snapshot	= inode.k->p.snapshot,
657 	}));
658 }
659 
660 static int get_inodes_all_snapshots(struct btree_trans *trans,
661 				    struct inode_walker *w, u64 inum)
662 {
663 	struct bch_fs *c = trans->c;
664 	struct btree_iter iter;
665 	struct bkey_s_c k;
666 	int ret;
667 
668 	w->recalculate_sums = false;
669 	w->inodes.nr = 0;
670 
671 	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
672 				     BTREE_ITER_all_snapshots, k, ret) {
673 		if (k.k->p.offset != inum)
674 			break;
675 
676 		if (bkey_is_inode(k.k))
677 			add_inode(c, w, k);
678 	}
679 	bch2_trans_iter_exit(trans, &iter);
680 
681 	if (ret)
682 		return ret;
683 
684 	w->first_this_inode = true;
685 	return 0;
686 }
687 
688 static struct inode_walker_entry *
689 lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k)
690 {
691 	bool is_whiteout = k.k->type == KEY_TYPE_whiteout;
692 
693 	struct inode_walker_entry *i;
694 	__darray_for_each(w->inodes, i)
695 		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot))
696 			goto found;
697 
698 	return NULL;
699 found:
700 	BUG_ON(k.k->p.snapshot > i->snapshot);
701 
702 	if (k.k->p.snapshot != i->snapshot && !is_whiteout) {
703 		struct inode_walker_entry new = *i;
704 
705 		new.snapshot = k.k->p.snapshot;
706 		new.count = 0;
707 
708 		struct printbuf buf = PRINTBUF;
709 		bch2_bkey_val_to_text(&buf, c, k);
710 
711 		bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
712 			 "unexpected because we should always update the inode when we update a key in that inode\n"
713 			 "%s",
714 			 w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf);
715 		printbuf_exit(&buf);
716 
717 		while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot)
718 			--i;
719 
720 		size_t pos = i - w->inodes.data;
721 		int ret = darray_insert_item(&w->inodes, pos, new);
722 		if (ret)
723 			return ERR_PTR(ret);
724 
725 		i = w->inodes.data + pos;
726 	}
727 
728 	return i;
729 }
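/*
 * Example of the insertion above: say w->inodes has entries for snapshots
 * 1 and 4, and we see a non-whiteout key at snapshot 2, a descendant of 4.
 * We log the message and insert a copy of the snapshot-4 entry, with its
 * snapshot set to 2 and its count reset, at the position that keeps the
 * array sorted - giving { 1, 2, 4 } - so that per-snapshot i_sectors and
 * subdirectory counts are accumulated against the right inode version.
 */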
730 
731 static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
732 					     struct inode_walker *w,
733 					     struct bkey_s_c k)
734 {
735 	if (w->last_pos.inode != k.k->p.inode) {
736 		int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
737 		if (ret)
738 			return ERR_PTR(ret);
739 	} else if (bkey_cmp(w->last_pos, k.k->p)) {
740 		darray_for_each(w->inodes, i)
741 			i->seen_this_pos = false;
742 	}
743 
744 	w->last_pos = k.k->p;
745 
746 	return lookup_inode_for_snapshot(trans->c, w, k);
747 }
748 
749 static int get_visible_inodes(struct btree_trans *trans,
750 			      struct inode_walker *w,
751 			      struct snapshots_seen *s,
752 			      u64 inum)
753 {
754 	struct bch_fs *c = trans->c;
755 	struct btree_iter iter;
756 	struct bkey_s_c k;
757 	int ret;
758 
759 	w->inodes.nr = 0;
760 
761 	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
762 			   BTREE_ITER_all_snapshots, k, ret) {
763 		if (k.k->p.offset != inum)
764 			break;
765 
766 		if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot))
767 			continue;
768 
769 		if (bkey_is_inode(k.k))
770 			add_inode(c, w, k);
771 
772 		if (k.k->p.snapshot >= s->pos.snapshot)
773 			break;
774 	}
775 	bch2_trans_iter_exit(trans, &iter);
776 
777 	return ret;
778 }
779 
780 static int hash_redo_key(struct btree_trans *trans,
781 			 const struct bch_hash_desc desc,
782 			 struct bch_hash_info *hash_info,
783 			 struct btree_iter *k_iter, struct bkey_s_c k)
784 {
785 	struct bkey_i *delete;
786 	struct bkey_i *tmp;
787 
788 	delete = bch2_trans_kmalloc(trans, sizeof(*delete));
789 	if (IS_ERR(delete))
790 		return PTR_ERR(delete);
791 
792 	tmp = bch2_bkey_make_mut_noupdate(trans, k);
793 	if (IS_ERR(tmp))
794 		return PTR_ERR(tmp);
795 
796 	bkey_init(&delete->k);
797 	delete->k.p = k_iter->pos;
798 	return  bch2_btree_iter_traverse(k_iter) ?:
799 		bch2_trans_update(trans, k_iter, delete, 0) ?:
800 		bch2_hash_set_in_snapshot(trans, desc, hash_info,
801 				       (subvol_inum) { 0, k.k->p.inode },
802 				       k.k->p.snapshot, tmp,
803 				       STR_HASH_must_create|
804 				       BTREE_UPDATE_internal_snapshot_node) ?:
805 		bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
806 }
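/*
 * hash_redo_key() repairs a key stored at the wrong slot of a hashed btree
 * (dirents/xattrs): it deletes the key at its current position and
 * re-inserts a copy through the normal hash insertion path, which places it
 * at (or, on collision, after) the offset its hash says it should live at.
 * The commit happens here, which is why hash_check_key() below returns
 * -BCH_ERR_transaction_restart_nested after a successful redo.
 */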
807 
808 static int hash_check_key(struct btree_trans *trans,
809 			  const struct bch_hash_desc desc,
810 			  struct bch_hash_info *hash_info,
811 			  struct btree_iter *k_iter, struct bkey_s_c hash_k)
812 {
813 	struct bch_fs *c = trans->c;
814 	struct btree_iter iter = { NULL };
815 	struct printbuf buf = PRINTBUF;
816 	struct bkey_s_c k;
817 	u64 hash;
818 	int ret = 0;
819 
820 	if (hash_k.k->type != desc.key_type)
821 		return 0;
822 
823 	hash = desc.hash_bkey(hash_info, hash_k);
824 
825 	if (likely(hash == hash_k.k->p.offset))
826 		return 0;
827 
828 	if (hash_k.k->p.offset < hash)
829 		goto bad_hash;
830 
831 	for_each_btree_key_norestart(trans, iter, desc.btree_id,
832 				     SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
833 				     BTREE_ITER_slots, k, ret) {
834 		if (bkey_eq(k.k->p, hash_k.k->p))
835 			break;
836 
837 		if (fsck_err_on(k.k->type == desc.key_type &&
838 				!desc.cmp_bkey(k, hash_k), c,
839 				hash_table_key_duplicate,
840 				"duplicate hash table keys:\n%s",
841 				(printbuf_reset(&buf),
842 				 bch2_bkey_val_to_text(&buf, c, hash_k),
843 				 buf.buf))) {
844 			ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
845 			break;
846 		}
847 
848 		if (bkey_deleted(k.k)) {
849 			bch2_trans_iter_exit(trans, &iter);
850 			goto bad_hash;
851 		}
852 	}
853 out:
854 	bch2_trans_iter_exit(trans, &iter);
855 	printbuf_exit(&buf);
856 	return ret;
857 bad_hash:
858 	if (fsck_err(c, hash_table_key_wrong_offset,
859 		     "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
860 		     bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
861 		     (printbuf_reset(&buf),
862 		      bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
863 		ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
864 		bch_err_fn(c, ret);
865 		if (ret)
866 			return ret;
867 		ret = -BCH_ERR_transaction_restart_nested;
868 	}
869 fsck_err:
870 	goto out;
871 }
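/*
 * Example of the offset check above: keys in a hashed btree live at
 * offset >= their hash, probing forward over collisions. If a dirent hashes
 * to 100 but sits at offset 90 it can never be found by lookup, so we go
 * straight to bad_hash. If it sits at 110, we walk the slots starting at
 * 100; hitting an empty slot first means lookups would stop there too, so
 * the key is again unreachable and gets rehashed.
 */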
872 
873 static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
874 						struct btree_iter *iter,
875 						struct bpos pos)
876 {
877 	return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
878 }
879 
880 static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
881 					       struct btree_iter *iter,
882 					       struct bch_inode_unpacked *inode,
883 					       u32 *snapshot)
884 {
885 	if (inode->bi_subvol) {
886 		u64 inum;
887 		int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
888 		if (ret)
889 			return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) });
890 	}
891 
892 	return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
893 }
894 
895 static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
896 				   struct bkey_s_c_dirent d)
897 {
898 	return  inode->bi_dir		== d.k->p.inode &&
899 		inode->bi_dir_offset	== d.k->p.offset;
900 }
901 
902 static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
903 				   struct bch_inode_unpacked *inode)
904 {
905 	return d.v->d_type == DT_SUBVOL
906 		? le32_to_cpu(d.v->d_child_subvol)	== inode->bi_subvol
907 		: le64_to_cpu(d.v->d_inum)		== inode->bi_inum;
908 }
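/*
 * These two helpers express the bidirectional link fsck maintains between
 * dirents and inodes: the inode's bi_dir/bi_dir_offset backpointer must name
 * the dirent's position, and the dirent's target (d_inum, or d_child_subvol
 * for DT_SUBVOL) must name the inode or its subvolume.
 * check_inode_dirent_inode() and check_dirent_inode_dirent() repair the two
 * directions respectively.
 */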
909 
910 static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
911 {
912 	struct btree_iter iter;
913 	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
914 	int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
915 	bch2_trans_iter_exit(trans, &iter);
916 	return ret;
917 }
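/*
 * Returns 1 if @p is present on the deleted-inodes list (the key there is
 * KEY_TYPE_set), 0 if it is absent, or a negative error - which is why
 * check_inode() below only treats ret < 0 as an error and flags !ret as an
 * unlinked inode missing from the list.
 */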
918 
919 static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c inode_k,
920 				    struct bch_inode_unpacked *inode,
921 				    u32 inode_snapshot, bool *write_inode)
922 {
923 	struct bch_fs *c = trans->c;
924 	struct printbuf buf = PRINTBUF;
925 
926 	struct btree_iter dirent_iter = {};
927 	struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
928 	int ret = bkey_err(d);
929 	if (ret && !bch2_err_matches(ret, ENOENT))
930 		return ret;
931 
932 	if (fsck_err_on(ret,
933 			c, inode_points_to_missing_dirent,
934 			"inode points to missing dirent\n%s",
935 			(bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) ||
936 	    fsck_err_on(!ret && !dirent_points_to_inode(d, inode),
937 			c, inode_points_to_wrong_dirent,
938 			"inode points to dirent that does not point back:\n%s",
939 			(bch2_bkey_val_to_text(&buf, c, inode_k),
940 			 prt_newline(&buf),
941 			 bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
942 		/*
943 		 * We just clear the backpointer fields for now. If check_dirents()
944 		 * finds a dirent that points to this inode, it will update them;
945 		 * then, when we get to check_path(), if the backpointer is still 0
946 		 * we'll reattach the inode.
947 		 */
948 		inode->bi_dir = 0;
949 		inode->bi_dir_offset = 0;
950 		inode->bi_flags &= ~BCH_INODE_backptr_untrusted;
951 		*write_inode = true;
952 	}
953 
954 	ret = 0;
955 fsck_err:
956 	bch2_trans_iter_exit(trans, &dirent_iter);
957 	printbuf_exit(&buf);
958 	bch_err_fn(c, ret);
959 	return ret;
960 }
961 
962 static int check_inode(struct btree_trans *trans,
963 		       struct btree_iter *iter,
964 		       struct bkey_s_c k,
965 		       struct bch_inode_unpacked *prev,
966 		       struct snapshots_seen *s,
967 		       bool full)
968 {
969 	struct bch_fs *c = trans->c;
970 	struct bch_inode_unpacked u;
971 	bool do_update = false;
972 	int ret;
973 
974 	ret = bch2_check_key_has_snapshot(trans, iter, k);
975 	if (ret < 0)
976 		goto err;
977 	if (ret)
978 		return 0;
979 
980 	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
981 	if (ret)
982 		goto err;
983 
984 	if (!bkey_is_inode(k.k))
985 		return 0;
986 
987 	BUG_ON(bch2_inode_unpack(k, &u));
988 
989 	if (!full &&
990 	    !(u.bi_flags & (BCH_INODE_i_size_dirty|
991 			    BCH_INODE_i_sectors_dirty|
992 			    BCH_INODE_unlinked)))
993 		return 0;
994 
995 	if (prev->bi_inum != u.bi_inum)
996 		*prev = u;
997 
998 	if (fsck_err_on(prev->bi_hash_seed	!= u.bi_hash_seed ||
999 			inode_d_type(prev)	!= inode_d_type(&u),
1000 			c, inode_snapshot_mismatch,
1001 			"inodes in different snapshots don't match")) {
1002 		bch_err(c, "repair not implemented yet");
1003 		return -BCH_ERR_fsck_repair_unimplemented;
1004 	}
1005 
1006 	if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) &&
1007 	    bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) {
1008 		struct bpos new_min_pos;
1009 
1010 		ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos);
1011 		if (ret)
1012 			goto err;
1013 
1014 		u.bi_flags &= ~(BCH_INODE_i_size_dirty|BCH_INODE_unlinked);
1015 
1016 		ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
1017 
1018 		bch_err_msg(c, ret, "in fsck updating inode");
1019 		if (ret)
1020 			return ret;
1021 
1022 		if (!bpos_eq(new_min_pos, POS_MIN))
1023 			bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos));
1024 		return 0;
1025 	}
1026 
1027 	if (u.bi_flags & BCH_INODE_unlinked) {
1028 		ret = check_inode_deleted_list(trans, k.k->p);
1029 		if (ret < 0)
1030 			return ret;
1031 
1032 		fsck_err_on(!ret, c, unlinked_inode_not_on_deleted_list,
1033 			    "inode %llu:%u unlinked, but not on deleted list",
1034 			    u.bi_inum, k.k->p.snapshot);
1035 		ret = 0;
1036 	}
1037 
1038 	if (u.bi_flags & BCH_INODE_unlinked &&
1039 	    (!c->sb.clean ||
1040 	     fsck_err(c, inode_unlinked_but_clean,
1041 		      "filesystem marked clean, but inode %llu unlinked",
1042 		      u.bi_inum))) {
1043 		ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
1044 		bch_err_msg(c, ret, "in fsck deleting inode");
1045 		return ret;
1046 	}
1047 
1048 	if (u.bi_flags & BCH_INODE_i_size_dirty &&
1049 	    (!c->sb.clean ||
1050 	     fsck_err(c, inode_i_size_dirty_but_clean,
1051 		      "filesystem marked clean, but inode %llu has i_size dirty",
1052 		      u.bi_inum))) {
1053 		bch_verbose(c, "truncating inode %llu", u.bi_inum);
1054 
1055 		/*
1056 		 * XXX: need to truncate partial blocks too here - or ideally
1057 		 * just switch units to bytes and that issue goes away
1058 		 */
1059 		ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
1060 				SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
1061 				     iter->pos.snapshot),
1062 				POS(u.bi_inum, U64_MAX),
1063 				0, NULL);
1064 		bch_err_msg(c, ret, "in fsck truncating inode");
1065 		if (ret)
1066 			return ret;
1067 
1068 		/*
1069 		 * We truncated without our normal sector accounting hook; just
1070 		 * make sure we recalculate it:
1071 		 */
1072 		u.bi_flags |= BCH_INODE_i_sectors_dirty;
1073 
1074 		u.bi_flags &= ~BCH_INODE_i_size_dirty;
1075 		do_update = true;
1076 	}
1077 
1078 	if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
1079 	    (!c->sb.clean ||
1080 	     fsck_err(c, inode_i_sectors_dirty_but_clean,
1081 		      "filesystem marked clean, but inode %llu has i_sectors dirty",
1082 		      u.bi_inum))) {
1083 		s64 sectors;
1084 
1085 		bch_verbose(c, "recounting sectors for inode %llu",
1086 			    u.bi_inum);
1087 
1088 		sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
1089 		if (sectors < 0) {
1090 			bch_err_msg(c, sectors, "in fsck recounting inode sectors");
1091 			return sectors;
1092 		}
1093 
1094 		u.bi_sectors = sectors;
1095 		u.bi_flags &= ~BCH_INODE_i_sectors_dirty;
1096 		do_update = true;
1097 	}
1098 
1099 	if (u.bi_flags & BCH_INODE_backptr_untrusted) {
1100 		u.bi_dir = 0;
1101 		u.bi_dir_offset = 0;
1102 		u.bi_flags &= ~BCH_INODE_backptr_untrusted;
1103 		do_update = true;
1104 	}
1105 
1106 	if (u.bi_dir || u.bi_dir_offset) {
1107 		ret = check_inode_dirent_inode(trans, k, &u, k.k->p.snapshot, &do_update);
1108 		if (ret)
1109 			goto err;
1110 	}
1111 
1112 	if (fsck_err_on(u.bi_parent_subvol &&
1113 			(u.bi_subvol == 0 ||
1114 			 u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
1115 			c, inode_bi_parent_nonzero,
1116 			"inode %llu:%u has subvol %u but nonzero parent subvol %u",
1117 			u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
1118 		u.bi_parent_subvol = 0;
1119 		do_update = true;
1120 	}
1121 
1122 	if (u.bi_subvol) {
1123 		struct bch_subvolume s;
1124 
1125 		ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s);
1126 		if (ret && !bch2_err_matches(ret, ENOENT))
1127 			goto err;
1128 
1129 		if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
1130 			ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum);
1131 			goto do_update;
1132 		}
1133 
1134 		if (fsck_err_on(ret,
1135 				c, inode_bi_subvol_missing,
1136 				"inode %llu:%u bi_subvol points to missing subvolume %u",
1137 				u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
1138 		    fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
1139 				!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
1140 							   k.k->p.snapshot),
1141 				c, inode_bi_subvol_wrong,
1142 				"inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
1143 				u.bi_inum, k.k->p.snapshot, u.bi_subvol,
1144 				le64_to_cpu(s.inode),
1145 				le32_to_cpu(s.snapshot))) {
1146 			u.bi_subvol = 0;
1147 			u.bi_parent_subvol = 0;
1148 			do_update = true;
1149 		}
1150 	}
1151 do_update:
1152 	if (do_update) {
1153 		ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
1154 		bch_err_msg(c, ret, "in fsck updating inode");
1155 		if (ret)
1156 			return ret;
1157 	}
1158 err:
1159 fsck_err:
1160 	bch_err_fn(c, ret);
1161 	return ret;
1162 }
1163 
1164 int bch2_check_inodes(struct bch_fs *c)
1165 {
1166 	bool full = c->opts.fsck;
1167 	struct bch_inode_unpacked prev = { 0 };
1168 	struct snapshots_seen s;
1169 
1170 	snapshots_seen_init(&s);
1171 
1172 	int ret = bch2_trans_run(c,
1173 		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
1174 				POS_MIN,
1175 				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
1176 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1177 			check_inode(trans, &iter, k, &prev, &s, full)));
1178 
1179 	snapshots_seen_exit(&s);
1180 	bch_err_fn(c, ret);
1181 	return ret;
1182 }
1183 
1184 static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode)
1185 {
1186 	switch (btree) {
1187 	case BTREE_ID_extents:
1188 		return S_ISREG(mode) || S_ISLNK(mode);
1189 	case BTREE_ID_dirents:
1190 		return S_ISDIR(mode);
1191 	case BTREE_ID_xattrs:
1192 		return true;
1193 	default:
1194 		BUG();
1195 	}
1196 }
1197 
1198 static int check_key_has_inode(struct btree_trans *trans,
1199 			       struct btree_iter *iter,
1200 			       struct inode_walker *inode,
1201 			       struct inode_walker_entry *i,
1202 			       struct bkey_s_c k)
1203 {
1204 	struct bch_fs *c = trans->c;
1205 	struct printbuf buf = PRINTBUF;
1206 	int ret = PTR_ERR_OR_ZERO(i);
1207 	if (ret)
1208 		return ret;
1209 
1210 	if (k.k->type == KEY_TYPE_whiteout)
1211 		goto out;
1212 
1213 	if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
1214 		ret =   reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?:
1215 			bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
1216 		if (ret)
1217 			goto err;
1218 
1219 		inode->last_pos.inode--;
1220 		ret = -BCH_ERR_transaction_restart_nested;
1221 		goto err;
1222 	}
1223 
1224 	if (fsck_err_on(!i, c, key_in_missing_inode,
1225 			"key in missing inode:\n  %s",
1226 			(printbuf_reset(&buf),
1227 			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1228 		goto delete;
1229 
1230 	if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode),
1231 			c, key_in_wrong_inode_type,
1232 			"key for wrong inode mode %o:\n  %s",
1233 			i->inode.bi_mode,
1234 			(printbuf_reset(&buf),
1235 			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1236 		goto delete;
1237 out:
1238 err:
1239 fsck_err:
1240 	printbuf_exit(&buf);
1241 	bch_err_fn(c, ret);
1242 	return ret;
1243 delete:
1244 	ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
1245 	goto out;
1246 }
1247 
1248 static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
1249 {
1250 	struct bch_fs *c = trans->c;
1251 	int ret = 0;
1252 	s64 count2;
1253 
1254 	darray_for_each(w->inodes, i) {
1255 		if (i->inode.bi_sectors == i->count)
1256 			continue;
1257 
1258 		count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);
1259 
1260 		if (w->recalculate_sums)
1261 			i->count = count2;
1262 
1263 		if (i->count != count2) {
1264 			bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
1265 					    w->last_pos.inode, i->snapshot, i->count, count2);
1266 			return -BCH_ERR_internal_fsck_err;
1267 		}
1268 
1269 		if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
1270 				c, inode_i_sectors_wrong,
1271 				"inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
1272 				w->last_pos.inode, i->snapshot,
1273 				i->inode.bi_sectors, i->count)) {
1274 			i->inode.bi_sectors = i->count;
1275 			ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
1276 			if (ret)
1277 				break;
1278 		}
1279 	}
1280 fsck_err:
1281 	bch_err_fn(c, ret);
1282 	return ret;
1283 }
1284 
1285 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
1286 {
1287 	u32 restart_count = trans->restart_count;
1288 	return check_i_sectors_notnested(trans, w) ?:
1289 		trans_was_restarted(trans, restart_count);
1290 }
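/*
 * check_i_sectors() wraps the _notnested version with the usual pattern for
 * helpers that may restart the btree transaction: if the restart count
 * changed while flushing the accumulated counts, trans_was_restarted()
 * signals a restart so check_extent() bails out and the pass re-runs with
 * fresh iterators instead of continuing with stale ones.
 */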
1291 
1292 struct extent_end {
1293 	u32			snapshot;
1294 	u64			offset;
1295 	struct snapshots_seen	seen;
1296 };
1297 
1298 struct extent_ends {
1299 	struct bpos			last_pos;
1300 	DARRAY(struct extent_end)	e;
1301 };
1302 
1303 static void extent_ends_reset(struct extent_ends *extent_ends)
1304 {
1305 	darray_for_each(extent_ends->e, i)
1306 		snapshots_seen_exit(&i->seen);
1307 	extent_ends->e.nr = 0;
1308 }
1309 
1310 static void extent_ends_exit(struct extent_ends *extent_ends)
1311 {
1312 	extent_ends_reset(extent_ends);
1313 	darray_exit(&extent_ends->e);
1314 }
1315 
1316 static void extent_ends_init(struct extent_ends *extent_ends)
1317 {
1318 	memset(extent_ends, 0, sizeof(*extent_ends));
1319 }
1320 
1321 static int extent_ends_at(struct bch_fs *c,
1322 			  struct extent_ends *extent_ends,
1323 			  struct snapshots_seen *seen,
1324 			  struct bkey_s_c k)
1325 {
1326 	struct extent_end *i, n = (struct extent_end) {
1327 		.offset		= k.k->p.offset,
1328 		.snapshot	= k.k->p.snapshot,
1329 		.seen		= *seen,
1330 	};
1331 
1332 	n.seen.ids.data = kmemdup(seen->ids.data,
1333 			      sizeof(seen->ids.data[0]) * seen->ids.size,
1334 			      GFP_KERNEL);
1335 	if (!n.seen.ids.data)
1336 		return -BCH_ERR_ENOMEM_fsck_extent_ends_at;
1337 
1338 	__darray_for_each(extent_ends->e, i) {
1339 		if (i->snapshot == k.k->p.snapshot) {
1340 			snapshots_seen_exit(&i->seen);
1341 			*i = n;
1342 			return 0;
1343 		}
1344 
1345 		if (i->snapshot >= k.k->p.snapshot)
1346 			break;
1347 	}
1348 
1349 	return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
1350 }
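/*
 * extent_ends records, per snapshot, where the previous extent for this
 * inode ended, along with a private copy of the snapshots_seen list at that
 * point; check_overlapping_extents() compares each new extent's start
 * offset against these recorded ends. The list is keyed and kept sorted by
 * snapshot, and an existing entry for the same snapshot is simply replaced.
 */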
1351 
1352 static int overlapping_extents_found(struct btree_trans *trans,
1353 				     enum btree_id btree,
1354 				     struct bpos pos1, struct snapshots_seen *pos1_seen,
1355 				     struct bkey pos2,
1356 				     bool *fixed,
1357 				     struct extent_end *extent_end)
1358 {
1359 	struct bch_fs *c = trans->c;
1360 	struct printbuf buf = PRINTBUF;
1361 	struct btree_iter iter1, iter2 = { NULL };
1362 	struct bkey_s_c k1, k2;
1363 	int ret;
1364 
1365 	BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));
1366 
1367 	bch2_trans_iter_init(trans, &iter1, btree, pos1,
1368 			     BTREE_ITER_all_snapshots|
1369 			     BTREE_ITER_not_extents);
1370 	k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX));
1371 	ret = bkey_err(k1);
1372 	if (ret)
1373 		goto err;
1374 
1375 	prt_str(&buf, "\n  ");
1376 	bch2_bkey_val_to_text(&buf, c, k1);
1377 
1378 	if (!bpos_eq(pos1, k1.k->p)) {
1379 		prt_str(&buf, "\n  wanted\n  ");
1380 		bch2_bpos_to_text(&buf, pos1);
1381 		prt_str(&buf, "\n  ");
1382 		bch2_bkey_to_text(&buf, &pos2);
1383 
1384 		bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
1385 			__func__, buf.buf);
1386 		ret = -BCH_ERR_internal_fsck_err;
1387 		goto err;
1388 	}
1389 
1390 	bch2_trans_copy_iter(&iter2, &iter1);
1391 
1392 	while (1) {
1393 		bch2_btree_iter_advance(&iter2);
1394 
1395 		k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX));
1396 		ret = bkey_err(k2);
1397 		if (ret)
1398 			goto err;
1399 
1400 		if (bpos_ge(k2.k->p, pos2.p))
1401 			break;
1402 	}
1403 
1404 	prt_str(&buf, "\n  ");
1405 	bch2_bkey_val_to_text(&buf, c, k2);
1406 
1407 	if (bpos_gt(k2.k->p, pos2.p) ||
1408 	    pos2.size != k2.k->size) {
1409 		bch_err(c, "%s: error finding second overlapping extent when repairing%s",
1410 			__func__, buf.buf);
1411 		ret = -BCH_ERR_internal_fsck_err;
1412 		goto err;
1413 	}
1414 
1415 	prt_printf(&buf, "\n  overwriting %s extent",
1416 		   pos1.snapshot >= pos2.p.snapshot ? "first" : "second");
1417 
1418 	if (fsck_err(c, extent_overlapping,
1419 		     "overlapping extents%s", buf.buf)) {
1420 		struct btree_iter *old_iter = &iter1;
1421 		struct disk_reservation res = { 0 };
1422 
1423 		if (pos1.snapshot < pos2.p.snapshot) {
1424 			old_iter = &iter2;
1425 			swap(k1, k2);
1426 		}
1427 
1428 		trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);
1429 
1430 		ret =   bch2_trans_update_extent_overwrite(trans, old_iter,
1431 				BTREE_UPDATE_internal_snapshot_node,
1432 				k1, k2) ?:
1433 			bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
1434 		bch2_disk_reservation_put(c, &res);
1435 
1436 		if (ret)
1437 			goto err;
1438 
1439 		*fixed = true;
1440 
1441 		if (pos1.snapshot == pos2.p.snapshot) {
1442 			/*
1443 			 * We overwrote the first extent, and did the overwrite
1444 			 * in the same snapshot:
1445 			 */
1446 			extent_end->offset = bkey_start_offset(&pos2);
1447 		} else if (pos1.snapshot > pos2.p.snapshot) {
1448 			/*
1449 			 * We overwrote the first extent in pos2's snapshot:
1450 			 */
1451 			ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
1452 		} else {
1453 			/*
1454 			 * We overwrote the second extent - restart
1455 			 * check_extent() from the top:
1456 			 */
1457 			ret = -BCH_ERR_transaction_restart_nested;
1458 		}
1459 	}
1460 fsck_err:
1461 err:
1462 	bch2_trans_iter_exit(trans, &iter2);
1463 	bch2_trans_iter_exit(trans, &iter1);
1464 	printbuf_exit(&buf);
1465 	return ret;
1466 }
1467 
1468 static int check_overlapping_extents(struct btree_trans *trans,
1469 			      struct snapshots_seen *seen,
1470 			      struct extent_ends *extent_ends,
1471 			      struct bkey_s_c k,
1472 			      struct btree_iter *iter,
1473 			      bool *fixed)
1474 {
1475 	struct bch_fs *c = trans->c;
1476 	int ret = 0;
1477 
1478 	/* transaction restart, running again */
1479 	if (bpos_eq(extent_ends->last_pos, k.k->p))
1480 		return 0;
1481 
1482 	if (extent_ends->last_pos.inode != k.k->p.inode)
1483 		extent_ends_reset(extent_ends);
1484 
1485 	darray_for_each(extent_ends->e, i) {
1486 		if (i->offset <= bkey_start_offset(k.k))
1487 			continue;
1488 
1489 		if (!ref_visible2(c,
1490 				  k.k->p.snapshot, seen,
1491 				  i->snapshot, &i->seen))
1492 			continue;
1493 
1494 		ret = overlapping_extents_found(trans, iter->btree_id,
1495 						SPOS(iter->pos.inode,
1496 						     i->offset,
1497 						     i->snapshot),
1498 						&i->seen,
1499 						*k.k, fixed, i);
1500 		if (ret)
1501 			goto err;
1502 	}
1503 
1504 	extent_ends->last_pos = k.k->p;
1505 err:
1506 	return ret;
1507 }
1508 
1509 static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
1510 				struct bkey_s_c k)
1511 {
1512 	struct bch_fs *c = trans->c;
1513 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1514 	struct bch_extent_crc_unpacked crc;
1515 	const union bch_extent_entry *i;
1516 	unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;
1517 
1518 	bkey_for_each_crc(k.k, ptrs, crc, i)
1519 		if (crc_is_encoded(crc) &&
1520 		    crc.uncompressed_size > encoded_extent_max_sectors) {
1521 			struct printbuf buf = PRINTBUF;
1522 
1523 			bch2_bkey_val_to_text(&buf, c, k);
1524 			bch_err(c, "overbig encoded extent, please report this:\n  %s", buf.buf);
1525 			printbuf_exit(&buf);
1526 		}
1527 
1528 	return 0;
1529 }
1530 
1531 static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
1532 			struct bkey_s_c k,
1533 			struct inode_walker *inode,
1534 			struct snapshots_seen *s,
1535 			struct extent_ends *extent_ends)
1536 {
1537 	struct bch_fs *c = trans->c;
1538 	struct inode_walker_entry *i;
1539 	struct printbuf buf = PRINTBUF;
1540 	int ret = 0;
1541 
1542 	ret = bch2_check_key_has_snapshot(trans, iter, k);
1543 	if (ret) {
1544 		ret = ret < 0 ? ret : 0;
1545 		goto out;
1546 	}
1547 
1548 	if (inode->last_pos.inode != k.k->p.inode) {
1549 		ret = check_i_sectors(trans, inode);
1550 		if (ret)
1551 			goto err;
1552 	}
1553 
1554 	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
1555 	if (ret)
1556 		goto err;
1557 
1558 	i = walk_inode(trans, inode, k);
1559 	ret = PTR_ERR_OR_ZERO(i);
1560 	if (ret)
1561 		goto err;
1562 
1563 	ret = check_key_has_inode(trans, iter, inode, i, k);
1564 	if (ret)
1565 		goto err;
1566 
1567 	if (k.k->type != KEY_TYPE_whiteout) {
1568 		ret = check_overlapping_extents(trans, s, extent_ends, k, iter,
1569 						&inode->recalculate_sums);
1570 		if (ret)
1571 			goto err;
1572 	}
1573 
1574 	/*
1575 	 * Check inodes in reverse order, from oldest snapshots to newest,
1576 	 * starting from the inode that matches this extent's snapshot. If we
1577 	 * didn't have one, iterate over all inodes:
1578 	 */
1579 	if (!i)
1580 		i = &darray_last(inode->inodes);
1581 
1582 	for (;
1583 	     inode->inodes.data && i >= inode->inodes.data;
1584 	     --i) {
1585 		if (i->snapshot > k.k->p.snapshot ||
1586 		    !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
1587 			continue;
1588 
1589 		if (k.k->type != KEY_TYPE_whiteout) {
1590 			if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) &&
1591 					k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
1592 					!bkey_extent_is_reservation(k),
1593 					c, extent_past_end_of_inode,
1594 					"extent type past end of inode %llu:%u, i_size %llu\n  %s",
1595 					i->inode.bi_inum, i->snapshot, i->inode.bi_size,
1596 					(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1597 				struct btree_iter iter2;
1598 
1599 				bch2_trans_copy_iter(&iter2, iter);
1600 				bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
1601 				ret =   bch2_btree_iter_traverse(&iter2) ?:
1602 					bch2_btree_delete_at(trans, &iter2,
1603 						BTREE_UPDATE_internal_snapshot_node);
1604 				bch2_trans_iter_exit(trans, &iter2);
1605 				if (ret)
1606 					goto err;
1607 
1608 				iter->k.type = KEY_TYPE_whiteout;
1609 			}
1610 
1611 			if (bkey_extent_is_allocation(k.k))
1612 				i->count += k.k->size;
1613 		}
1614 
1615 		i->seen_this_pos = true;
1616 	}
1617 
1618 	if (k.k->type != KEY_TYPE_whiteout) {
1619 		ret = extent_ends_at(c, extent_ends, s, k);
1620 		if (ret)
1621 			goto err;
1622 	}
1623 out:
1624 err:
1625 fsck_err:
1626 	printbuf_exit(&buf);
1627 	bch_err_fn(c, ret);
1628 	return ret;
1629 }
1630 
1631 /*
1632  * Walk extents: verify that extents have a corresponding S_ISREG inode, and
1633  * that i_size and i_sectors are consistent
1634  */
1635 int bch2_check_extents(struct bch_fs *c)
1636 {
1637 	struct inode_walker w = inode_walker_init();
1638 	struct snapshots_seen s;
1639 	struct extent_ends extent_ends;
1640 	struct disk_reservation res = { 0 };
1641 
1642 	snapshots_seen_init(&s);
1643 	extent_ends_init(&extent_ends);
1644 
1645 	int ret = bch2_trans_run(c,
1646 		for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
1647 				POS(BCACHEFS_ROOT_INO, 0),
1648 				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
1649 				&res, NULL,
1650 				BCH_TRANS_COMMIT_no_enospc, ({
1651 			bch2_disk_reservation_put(c, &res);
1652 			check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
1653 			check_extent_overbig(trans, &iter, k);
1654 		})) ?:
1655 		check_i_sectors_notnested(trans, &w));
1656 
1657 	bch2_disk_reservation_put(c, &res);
1658 	extent_ends_exit(&extent_ends);
1659 	inode_walker_exit(&w);
1660 	snapshots_seen_exit(&s);
1661 
1662 	bch_err_fn(c, ret);
1663 	return ret;
1664 }
1665 
1666 int bch2_check_indirect_extents(struct bch_fs *c)
1667 {
1668 	struct disk_reservation res = { 0 };
1669 
1670 	int ret = bch2_trans_run(c,
1671 		for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
1672 				POS_MIN,
1673 				BTREE_ITER_prefetch, k,
1674 				&res, NULL,
1675 				BCH_TRANS_COMMIT_no_enospc, ({
1676 			bch2_disk_reservation_put(c, &res);
1677 			check_extent_overbig(trans, &iter, k);
1678 		})));
1679 
1680 	bch2_disk_reservation_put(c, &res);
1681 	bch_err_fn(c, ret);
1682 	return ret;
1683 }
1684 
1685 static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w)
1686 {
1687 	struct bch_fs *c = trans->c;
1688 	int ret = 0;
1689 	s64 count2;
1690 
1691 	darray_for_each(w->inodes, i) {
1692 		if (i->inode.bi_nlink == i->count)
1693 			continue;
1694 
1695 		count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot);
1696 		if (count2 < 0)
1697 			return count2;
1698 
1699 		if (i->count != count2) {
1700 			bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
1701 					    w->last_pos.inode, i->snapshot, i->count, count2);
1702 			i->count = count2;
1703 			if (i->inode.bi_nlink == i->count)
1704 				continue;
1705 		}
1706 
1707 		if (fsck_err_on(i->inode.bi_nlink != i->count,
1708 				c, inode_dir_wrong_nlink,
1709 				"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
1710 				w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
1711 			i->inode.bi_nlink = i->count;
1712 			ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
1713 			if (ret)
1714 				break;
1715 		}
1716 	}
1717 fsck_err:
1718 	bch_err_fn(c, ret);
1719 	return ret;
1720 }
1721 
1722 static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
1723 {
1724 	u32 restart_count = trans->restart_count;
1725 	return check_subdir_count_notnested(trans, w) ?:
1726 		trans_was_restarted(trans, restart_count);
1727 }
1728 
1729 noinline_for_stack
1730 static int check_dirent_inode_dirent(struct btree_trans *trans,
1731 				   struct btree_iter *iter,
1732 				   struct bkey_s_c_dirent d,
1733 				   struct bch_inode_unpacked *target,
1734 				   u32 target_snapshot)
1735 {
1736 	struct bch_fs *c = trans->c;
1737 	struct printbuf buf = PRINTBUF;
1738 	int ret = 0;
1739 
1740 	if (inode_points_to_dirent(target, d))
1741 		return 0;
1742 
1743 	if (bch2_inode_should_have_bp(target) &&
1744 	    !fsck_err(c, inode_wrong_backpointer,
1745 		      "dirent points to inode that does not point back:\n  %s",
1746 		      (bch2_bkey_val_to_text(&buf, c, d.s_c),
1747 		       prt_printf(&buf, "\n  "),
1748 		       bch2_inode_unpacked_to_text(&buf, target),
1749 		       buf.buf)))
1750 		goto out_noiter;
1751 
1752 	if (!target->bi_dir &&
1753 	    !target->bi_dir_offset) {
1754 		target->bi_dir		= d.k->p.inode;
1755 		target->bi_dir_offset	= d.k->p.offset;
1756 		return __bch2_fsck_write_inode(trans, target, target_snapshot);
1757 	}
1758 
1759 	struct btree_iter bp_iter = { NULL };
1760 	struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
1761 			      SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
1762 	ret = bkey_err(bp_dirent);
1763 	if (ret && !bch2_err_matches(ret, ENOENT))
1764 		goto err;
1765 
1766 	bool backpointer_exists = !ret;
1767 	ret = 0;
1768 
1769 	if (fsck_err_on(!backpointer_exists,
1770 			c, inode_wrong_backpointer,
1771 			"inode %llu:%u has wrong backpointer:\n"
1772 			"got       %llu:%llu\n"
1773 			"should be %llu:%llu",
1774 			target->bi_inum, target_snapshot,
1775 			target->bi_dir,
1776 			target->bi_dir_offset,
1777 			d.k->p.inode,
1778 			d.k->p.offset)) {
1779 		target->bi_dir		= d.k->p.inode;
1780 		target->bi_dir_offset	= d.k->p.offset;
1781 		ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
1782 		goto out;
1783 	}
1784 
1785 	bch2_bkey_val_to_text(&buf, c, d.s_c);
1786 	prt_newline(&buf);
1787 	if (backpointer_exists)
1788 		bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
1789 
1790 	if (fsck_err_on(backpointer_exists &&
1791 			(S_ISDIR(target->bi_mode) ||
1792 			 target->bi_subvol),
1793 			c, inode_dir_multiple_links,
1794 			"%s %llu:%u with multiple links\n%s",
1795 			S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
1796 			target->bi_inum, target_snapshot, buf.buf)) {
1797 		ret = __remove_dirent(trans, d.k->p);
1798 		goto out;
1799 	}
1800 
1801 	/*
1802 	 * hardlinked file with nlink 0:
1803 	 * We're just adjusting nlink here so that check_nlinks() will pick
1804 	 * it up; it ignores inodes with nlink 0
1805 	 */
1806 	if (fsck_err_on(backpointer_exists && !target->bi_nlink,
1807 			c, inode_multiple_links_but_nlink_0,
1808 			"inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
1809 			target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
1810 		target->bi_nlink++;
1811 		target->bi_flags &= ~BCH_INODE_unlinked;
1812 		ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
1813 		if (ret)
1814 			goto err;
1815 	}
1816 out:
1817 err:
1818 fsck_err:
1819 	bch2_trans_iter_exit(trans, &bp_iter);
1820 out_noiter:
1821 	printbuf_exit(&buf);
1822 	bch_err_fn(c, ret);
1823 	return ret;
1824 }
1825 
1826 noinline_for_stack
1827 static int check_dirent_target(struct btree_trans *trans,
1828 			       struct btree_iter *iter,
1829 			       struct bkey_s_c_dirent d,
1830 			       struct bch_inode_unpacked *target,
1831 			       u32 target_snapshot)
1832 {
1833 	struct bch_fs *c = trans->c;
1834 	struct bkey_i_dirent *n;
1835 	struct printbuf buf = PRINTBUF;
1836 	int ret = 0;
1837 
1838 	ret = check_dirent_inode_dirent(trans, iter, d, target, target_snapshot);
1839 	if (ret)
1840 		goto err;
1841 
1842 	if (fsck_err_on(d.v->d_type != inode_d_type(target),
1843 			c, dirent_d_type_wrong,
1844 			"incorrect d_type: got %s, should be %s:\n%s",
1845 			bch2_d_type_str(d.v->d_type),
1846 			bch2_d_type_str(inode_d_type(target)),
1847 			(printbuf_reset(&buf),
1848 			 bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
1849 		n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
1850 		ret = PTR_ERR_OR_ZERO(n);
1851 		if (ret)
1852 			goto err;
1853 
1854 		bkey_reassemble(&n->k_i, d.s_c);
1855 		n->v.d_type = inode_d_type(target);
1856 		if (n->v.d_type == DT_SUBVOL) {
1857 			n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
1858 			n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
1859 		} else {
1860 			n->v.d_inum = cpu_to_le64(target->bi_inum);
1861 		}
1862 
1863 		ret = bch2_trans_update(trans, iter, &n->k_i, 0);
1864 		if (ret)
1865 			goto err;
1866 
1867 		d = dirent_i_to_s_c(n);
1868 	}
1869 err:
1870 fsck_err:
1871 	printbuf_exit(&buf);
1872 	bch_err_fn(c, ret);
1873 	return ret;
1874 }
1875 
1876 /* find a subvolume that's a descendant of @snapshot: */
1877 static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
1878 {
1879 	struct btree_iter iter;
1880 	struct bkey_s_c k;
1881 	int ret;
1882 
1883 	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) {
1884 		if (k.k->type != KEY_TYPE_subvolume)
1885 			continue;
1886 
1887 		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
1888 		if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) {
1889 			bch2_trans_iter_exit(trans, &iter);
1890 			*subvolid = k.k->p.offset;
1891 			goto found;
1892 		}
1893 	}
1894 	if (!ret)
1895 		ret = -ENOENT;
1896 found:
1897 	bch2_trans_iter_exit(trans, &iter);
1898 	return ret;
1899 }
1900 
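/*
 * A dirent with d_type == DT_SUBVOL crosses a subvolume boundary: check that
 * the parent and child subvolumes exist, that the dirent is visible in the
 * parent subvolume's snapshot, and that the child subvolume's fs_path_parent
 * and its root inode's bi_parent_subvol point back at the parent:
 */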
1901 noinline_for_stack
1902 static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
1903 				  struct bkey_s_c_dirent d)
1904 {
1905 	struct bch_fs *c = trans->c;
1906 	struct btree_iter subvol_iter = {};
1907 	struct bch_inode_unpacked subvol_root;
1908 	u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
1909 	u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
1910 	u32 parent_snapshot;
1911 	u32 new_parent_subvol = 0;
1912 	u64 parent_inum;
1913 	struct printbuf buf = PRINTBUF;
1914 	int ret = 0;
1915 
1916 	ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum);
1917 	if (ret && !bch2_err_matches(ret, ENOENT))
1918 		return ret;
1919 
1920 	if (ret ||
1921 	    (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) {
1922 		int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
1923 		if (ret2 && !bch2_err_matches(ret2, ENOENT))
1924 			return ret2;
1925 	}
1926 
1927 	if (ret &&
1928 	    !new_parent_subvol &&
1929 	    (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
1930 		/*
1931 		 * Couldn't find a subvol for dirent's snapshot - but we lost
1932 		 * subvols, so we need to reconstruct:
1933 		 */
1934 		ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0);
1935 		if (ret)
1936 			return ret;
1937 
1938 		parent_snapshot = d.k->p.snapshot;
1939 	}
1940 
1941 	if (fsck_err_on(ret, c, dirent_to_missing_parent_subvol,
1942 			"dirent parent_subvol points to missing subvolume\n%s",
1943 			(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
1944 	    fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
1945 			c, dirent_not_visible_in_parent_subvol,
1946 			"dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
1947 			parent_snapshot,
1948 			(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
1949 		if (!new_parent_subvol) {
1950 			bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
1951 			return -BCH_ERR_fsck_repair_unimplemented;
1952 		}
1953 
1954 		struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
1955 		ret = PTR_ERR_OR_ZERO(new_dirent);
1956 		if (ret)
1957 			goto err;
1958 
1959 		new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol);
1960 	}
1961 
1962 	struct bkey_s_c_subvolume s =
1963 		bch2_bkey_get_iter_typed(trans, &subvol_iter,
1964 					 BTREE_ID_subvolumes, POS(0, target_subvol),
1965 					 0, subvolume);
1966 	ret = bkey_err(s.s_c);
1967 	if (ret && !bch2_err_matches(ret, ENOENT))
1968 		return ret;
1969 
1970 	if (ret) {
1971 		if (fsck_err(c, dirent_to_missing_subvol,
1972 			     "dirent points to missing subvolume\n%s",
1973 			     (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
1974 			return __remove_dirent(trans, d.k->p);
1975 		ret = 0;
1976 		goto out;
1977 	}
1978 
1979 	if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol,
1980 			c, subvol_fs_path_parent_wrong,
1981 			"subvol with wrong fs_path_parent, should be %u\n%s",
1982 			parent_subvol,
1983 			(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
1984 		struct bkey_i_subvolume *n =
1985 			bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
1986 		ret = PTR_ERR_OR_ZERO(n);
1987 		if (ret)
1988 			goto err;
1989 
1990 		n->v.fs_path_parent = cpu_to_le32(parent_subvol);
1991 	}
1992 
1993 	u64 target_inum = le64_to_cpu(s.v->inode);
1994 	u32 target_snapshot = le32_to_cpu(s.v->snapshot);
1995 
1996 	ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot);
1997 	if (ret && !bch2_err_matches(ret, ENOENT))
1998 		goto err;
1999 
2000 	if (ret) {
2001 		bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
2002 		/* XXX: repair for a missing subvol root inode is unimplemented */
2003 		ret = 0;
2004 		goto err;
2005 	}
2006 
2007 	if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol,
2008 			c, inode_bi_parent_wrong,
2009 			"subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
2010 			target_inum,
2011 			subvol_root.bi_parent_subvol, parent_subvol)) {
2012 		subvol_root.bi_parent_subvol = parent_subvol;
2013 		ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot);
2014 		if (ret)
2015 			goto err;
2016 	}
2017 
2018 	ret = check_dirent_target(trans, iter, d, &subvol_root,
2019 				  target_snapshot);
2020 	if (ret)
2021 		goto err;
2022 out:
2023 err:
2024 fsck_err:
2025 	bch2_trans_iter_exit(trans, &subvol_iter);
2026 	printbuf_exit(&buf);
2027 	return ret;
2028 }
2029 
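/*
 * Per-key dirent checks: snapshot validity, hash position, that the
 * containing directory exists, and that the dirent's target (inode or
 * subvolume) exists and points back at it; also accumulates subdirectory
 * counts for check_subdir_count():
 */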
2030 static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
2031 			struct bkey_s_c k,
2032 			struct bch_hash_info *hash_info,
2033 			struct inode_walker *dir,
2034 			struct inode_walker *target,
2035 			struct snapshots_seen *s)
2036 {
2037 	struct bch_fs *c = trans->c;
2038 	struct inode_walker_entry *i;
2039 	struct printbuf buf = PRINTBUF;
2040 	int ret = 0;
2041 
2042 	ret = bch2_check_key_has_snapshot(trans, iter, k);
2043 	if (ret) {
2044 		ret = ret < 0 ? ret : 0;
2045 		goto out;
2046 	}
2047 
2048 	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
2049 	if (ret)
2050 		goto err;
2051 
2052 	if (k.k->type == KEY_TYPE_whiteout)
2053 		goto out;
2054 
2055 	if (dir->last_pos.inode != k.k->p.inode) {
2056 		ret = check_subdir_count(trans, dir);
2057 		if (ret)
2058 			goto err;
2059 	}
2060 
2061 	i = walk_inode(trans, dir, k);
2062 	ret = PTR_ERR_OR_ZERO(i);
2063 	if (ret < 0)
2064 		goto err;
2065 
2066 	ret = check_key_has_inode(trans, iter, dir, i, k);
2067 	if (ret)
2068 		goto err;
2069 
2070 	if (!i)
2071 		goto out;
2072 
2073 	if (dir->first_this_inode)
2074 		*hash_info = bch2_hash_info_init(c, &i->inode);
2075 	dir->first_this_inode = false;
2076 
2077 	ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k);
2078 	if (ret < 0)
2079 		goto err;
2080 	if (ret) {
2081 		/* dirent has been deleted */
2082 		ret = 0;
2083 		goto out;
2084 	}
2085 
2086 	if (k.k->type != KEY_TYPE_dirent)
2087 		goto out;
2088 
2089 	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
2090 
2091 	if (d.v->d_type == DT_SUBVOL) {
2092 		ret = check_dirent_to_subvol(trans, iter, d);
2093 		if (ret)
2094 			goto err;
2095 	} else {
2096 		ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
2097 		if (ret)
2098 			goto err;
2099 
2100 		if (fsck_err_on(!target->inodes.nr,
2101 				c, dirent_to_missing_inode,
2102 				"dirent points to missing inode:\n%s",
2103 				(printbuf_reset(&buf),
2104 				 bch2_bkey_val_to_text(&buf, c, k),
2105 				 buf.buf))) {
2106 			ret = __remove_dirent(trans, d.k->p);
2107 			if (ret)
2108 				goto err;
2109 		}
2110 
2111 		darray_for_each(target->inodes, i) {
2112 			ret = check_dirent_target(trans, iter, d,
2113 						  &i->inode, i->snapshot);
2114 			if (ret)
2115 				goto err;
2116 		}
2117 
2118 		if (d.v->d_type == DT_DIR)
2119 			for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
2120 				i->count++;
2121 	}
2122 out:
2123 err:
2124 fsck_err:
2125 	printbuf_exit(&buf);
2126 	bch_err_fn(c, ret);
2127 	return ret;
2128 }
2129 
2130 /*
2131  * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
2132  * validate d_type
2133  */
2134 int bch2_check_dirents(struct bch_fs *c)
2135 {
2136 	struct inode_walker dir = inode_walker_init();
2137 	struct inode_walker target = inode_walker_init();
2138 	struct snapshots_seen s;
2139 	struct bch_hash_info hash_info;
2140 
2141 	snapshots_seen_init(&s);
2142 
2143 	int ret = bch2_trans_run(c,
2144 		for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
2145 				POS(BCACHEFS_ROOT_INO, 0),
2146 				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
2147 				k,
2148 				NULL, NULL,
2149 				BCH_TRANS_COMMIT_no_enospc,
2150 			check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
2151 		check_subdir_count_notnested(trans, &dir));
2152 
2153 	snapshots_seen_exit(&s);
2154 	inode_walker_exit(&dir);
2155 	inode_walker_exit(&target);
2156 	bch_err_fn(c, ret);
2157 	return ret;
2158 }
2159 
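/*
 * Per-key xattr checks: snapshot validity, that the owning inode exists, and
 * that the key is at the correct hash position:
 */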
2160 static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
2161 		       struct bkey_s_c k,
2162 		       struct bch_hash_info *hash_info,
2163 		       struct inode_walker *inode)
2164 {
2165 	struct bch_fs *c = trans->c;
2166 	struct inode_walker_entry *i;
2167 	int ret;
2168 
2169 	ret = bch2_check_key_has_snapshot(trans, iter, k);
2170 	if (ret < 0)
2171 		return ret;
2172 	if (ret)
2173 		return 0;
2174 
2175 	i = walk_inode(trans, inode, k);
2176 	ret = PTR_ERR_OR_ZERO(i);
2177 	if (ret)
2178 		return ret;
2179 
2180 	ret = check_key_has_inode(trans, iter, inode, i, k);
2181 	if (ret)
2182 		return ret;
2183 
2184 	if (!i)
2185 		return 0;
2186 
2187 	if (inode->first_this_inode)
2188 		*hash_info = bch2_hash_info_init(c, &i->inode);
2189 	inode->first_this_inode = false;
2190 
2191 	ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
2192 	bch_err_fn(c, ret);
2193 	return ret;
2194 }
2195 
2196 /*
2197  * Walk xattrs: verify that they all have a corresponding inode
2198  */
2199 int bch2_check_xattrs(struct bch_fs *c)
2200 {
2201 	struct inode_walker inode = inode_walker_init();
2202 	struct bch_hash_info hash_info;
2203 	int ret = 0;
2204 
2205 	ret = bch2_trans_run(c,
2206 		for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
2207 			POS(BCACHEFS_ROOT_INO, 0),
2208 			BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
2209 			k,
2210 			NULL, NULL,
2211 			BCH_TRANS_COMMIT_no_enospc,
2212 		check_xattr(trans, &iter, k, &hash_info, &inode)));
2213 	bch_err_fn(c, ret);
2214 	return ret;
2215 }
2216 
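/*
 * Check that the root subvolume and root directory inode exist, recreating
 * them if missing:
 */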
2217 static int check_root_trans(struct btree_trans *trans)
2218 {
2219 	struct bch_fs *c = trans->c;
2220 	struct bch_inode_unpacked root_inode;
2221 	u32 snapshot;
2222 	u64 inum;
2223 	int ret;
2224 
2225 	ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
2226 	if (ret && !bch2_err_matches(ret, ENOENT))
2227 		return ret;
2228 
2229 	if (mustfix_fsck_err_on(ret, c, root_subvol_missing,
2230 				"root subvol missing")) {
2231 		struct bkey_i_subvolume *root_subvol =
2232 			bch2_trans_kmalloc(trans, sizeof(*root_subvol));
2233 		ret = PTR_ERR_OR_ZERO(root_subvol);
2234 		if (ret)
2235 			goto err;
2236 
2237 		snapshot	= U32_MAX;
2238 		inum		= BCACHEFS_ROOT_INO;
2239 
2240 		bkey_subvolume_init(&root_subvol->k_i);
2241 		root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL;
2242 		root_subvol->v.flags	= 0;
2243 		root_subvol->v.snapshot	= cpu_to_le32(snapshot);
2244 		root_subvol->v.inode	= cpu_to_le64(inum);
2245 		ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0);
2246 		bch_err_msg(c, ret, "writing root subvol");
2247 		if (ret)
2248 			goto err;
2249 	}
2250 
2251 	ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
2252 	if (ret && !bch2_err_matches(ret, ENOENT))
2253 		return ret;
2254 
2255 	if (mustfix_fsck_err_on(ret, c, root_dir_missing,
2256 				"root directory missing") ||
2257 	    mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
2258 				c, root_inode_not_dir,
2259 				"root inode not a directory")) {
2260 		bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
2261 				0, NULL);
2262 		root_inode.bi_inum = inum;
2263 
2264 		ret = __bch2_fsck_write_inode(trans, &root_inode, snapshot);
2265 		bch_err_msg(c, ret, "writing root inode");
2266 	}
2267 err:
2268 fsck_err:
2269 	return ret;
2270 }
2271 
2272 /* Get root directory, create if it doesn't exist: */
2273 int bch2_check_root(struct bch_fs *c)
2274 {
2275 	int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2276 		check_root_trans(trans));
2277 	bch_err_fn(c, ret);
2278 	return ret;
2279 }
2280 
2281 typedef DARRAY(u32) darray_u32;
2282 
2283 static bool darray_u32_has(darray_u32 *d, u32 v)
2284 {
2285 	darray_for_each(*d, i)
2286 		if (*i == v)
2287 			return true;
2288 	return false;
2289 }
2290 
2291 /*
2292  * We've checked that inode backpointers point to valid dirents; here, it's
2293  * sufficient to check that the subvolume root has a dirent:
2294  */
2295 static int subvol_has_dirent(struct btree_trans *trans, struct bkey_s_c_subvolume s)
2296 {
2297 	struct bch_inode_unpacked inode;
2298 	int ret = bch2_inode_find_by_inum_trans(trans,
2299 				(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
2300 				&inode);
2301 	if (ret)
2302 		return ret;
2303 
2304 	return inode.bi_dir != 0;
2305 }
2306 
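/*
 * Walk from @k up the fs_path_parent chain towards the root subvolume,
 * reattaching unreachable subvolumes and breaking loops:
 */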
2307 static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
2308 {
2309 	struct bch_fs *c = trans->c;
2310 	struct btree_iter parent_iter = {};
2311 	darray_u32 subvol_path = {};
2312 	struct printbuf buf = PRINTBUF;
2313 	int ret = 0;
2314 
2315 	if (k.k->type != KEY_TYPE_subvolume)
2316 		return 0;
2317 
2318 	while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) {
2319 		ret = darray_push(&subvol_path, k.k->p.offset);
2320 		if (ret)
2321 			goto err;
2322 
2323 		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
2324 
2325 		ret = subvol_has_dirent(trans, s);
2326 		if (ret < 0)
2327 			break;
2328 
2329 		if (fsck_err_on(!ret,
2330 				c, subvol_unreachable,
2331 				"unreachable subvolume %s",
2332 				(bch2_bkey_val_to_text(&buf, c, s.s_c),
2333 				 buf.buf))) {
2334 			ret = reattach_subvol(trans, s);
2335 			break;
2336 		}
2337 
2338 		u32 parent = le32_to_cpu(s.v->fs_path_parent);
2339 
2340 		if (darray_u32_has(&subvol_path, parent)) {
2341 			if (fsck_err(c, subvol_loop, "subvolume loop"))
2342 				ret = reattach_subvol(trans, s);
2343 			break;
2344 		}
2345 
2346 		bch2_trans_iter_exit(trans, &parent_iter);
2347 		bch2_trans_iter_init(trans, &parent_iter,
2348 				     BTREE_ID_subvolumes, POS(0, parent), 0);
2349 		k = bch2_btree_iter_peek_slot(&parent_iter);
2350 		ret = bkey_err(k);
2351 		if (ret)
2352 			goto err;
2353 
2354 		if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
2355 				c, subvol_unreachable,
2356 				"unreachable subvolume %s",
2357 				(bch2_bkey_val_to_text(&buf, c, s.s_c),
2358 				 buf.buf))) {
2359 			ret = reattach_subvol(trans, s);
2360 			break;
2361 		}
2362 	}
2363 fsck_err:
2364 err:
2365 	printbuf_exit(&buf);
2366 	darray_exit(&subvol_path);
2367 	bch2_trans_iter_exit(trans, &parent_iter);
2368 	return ret;
2369 }
2370 
2371 int bch2_check_subvolume_structure(struct bch_fs *c)
2372 {
2373 	int ret = bch2_trans_run(c,
2374 		for_each_btree_key_commit(trans, iter,
2375 				BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
2376 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2377 			check_subvol_path(trans, &iter, k)));
2378 	bch_err_fn(c, ret);
2379 	return ret;
2380 }
2381 
2382 struct pathbuf_entry {
2383 	u64	inum;
2384 	u32	snapshot;
2385 };
2386 
2387 typedef DARRAY(struct pathbuf_entry) pathbuf;
2388 
2389 static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
2390 {
2391 	darray_for_each(*p, i)
2392 		if (i->inum	== inum &&
2393 		    i->snapshot	== snapshot)
2394 			return true;
2395 	return false;
2396 }
2397 
2398 /*
2399  * Check that a given inode is reachable from its subvolume root - we already
2400  * verified subvolume connectivity:
2401  *
2402  * XXX: we should also be verifying that inodes are in the right subvolumes
2403  */
2404 static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k)
2405 {
2406 	struct bch_fs *c = trans->c;
2407 	struct btree_iter inode_iter = {};
2408 	struct bch_inode_unpacked inode;
2409 	struct printbuf buf = PRINTBUF;
2410 	u32 snapshot = inode_k.k->p.snapshot;
2411 	int ret = 0;
2412 
2413 	p->nr = 0;
2414 
2415 	BUG_ON(bch2_inode_unpack(inode_k, &inode));
2416 
2417 	while (!inode.bi_subvol) {
2418 		struct btree_iter dirent_iter;
2419 		struct bkey_s_c_dirent d;
2420 		u32 parent_snapshot = snapshot;
2421 
2422 		d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot);
2423 		ret = bkey_err(d.s_c);
2424 		if (ret && !bch2_err_matches(ret, ENOENT))
2425 			break;
2426 
2427 		if (!ret && !dirent_points_to_inode(d, &inode)) {
2428 			bch2_trans_iter_exit(trans, &dirent_iter);
2429 			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
2430 		}
2431 
2432 		if (bch2_err_matches(ret, ENOENT)) {
2433 			ret = 0;
2434 			if (fsck_err(c, inode_unreachable,
2435 				     "unreachable inode\n%s",
2436 				     (printbuf_reset(&buf),
2437 				      bch2_bkey_val_to_text(&buf, c, inode_k),
2438 				      buf.buf)))
2439 				ret = reattach_inode(trans, &inode, snapshot);
2440 			goto out;
2441 		}
2442 
2443 		bch2_trans_iter_exit(trans, &dirent_iter);
2444 
2445 		if (!S_ISDIR(inode.bi_mode))
2446 			break;
2447 
2448 		ret = darray_push(p, ((struct pathbuf_entry) {
2449 			.inum		= inode.bi_inum,
2450 			.snapshot	= snapshot,
2451 		}));
2452 		if (ret)
2453 			goto out;
2454 
2455 		snapshot = parent_snapshot;
2456 
2457 		bch2_trans_iter_exit(trans, &inode_iter);
2458 		inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
2459 					     SPOS(0, inode.bi_dir, snapshot), 0);
2460 		ret = bkey_err(inode_k) ?:
2461 			!bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
2462 			: bch2_inode_unpack(inode_k, &inode);
2463 		if (ret) {
2464 			/* Should have been caught in dirents pass */
2465 			if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
2466 				bch_err(c, "error looking up parent directory: %s", bch2_err_str(ret));
2467 			break;
2468 		}
2469 
2470 		snapshot = inode_k.k->p.snapshot;
2471 
2472 		if (path_is_dup(p, inode.bi_inum, snapshot)) {
2473 			/* XXX print path */
2474 			bch_err(c, "directory structure loop");
2475 
2476 			darray_for_each(*p, i)
2477 				pr_err("%llu:%u", i->inum, i->snapshot);
2478 			pr_err("%llu:%u", inode.bi_inum, snapshot);
2479 
2480 			if (fsck_err(c, dir_loop, "directory structure loop")) {
2481 				ret = remove_backpointer(trans, &inode);
2482 				bch_err_msg(c, ret, "removing dirent");
2483 				if (ret)
2484 					break;
2485 
2486 				ret = reattach_inode(trans, &inode, snapshot);
2487 				bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
2488 			}
2489 			break;
2490 		}
2491 	}
2492 out:
2493 fsck_err:
2494 	bch2_trans_iter_exit(trans, &inode_iter);
2495 	printbuf_exit(&buf);
2496 	bch_err_fn(c, ret);
2497 	return ret;
2498 }
2499 
2500 /*
2501  * Check for unreachable inodes, as well as loops in the directory structure:
2502  * After bch2_check_dirents(), an inode that has no backpointer is
2503  * unreachable:
2504  */
2505 int bch2_check_directory_structure(struct bch_fs *c)
2506 {
2507 	pathbuf path = { 0, };
2508 	int ret;
2509 
2510 	ret = bch2_trans_run(c,
2511 		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
2512 					  BTREE_ITER_intent|
2513 					  BTREE_ITER_prefetch|
2514 					  BTREE_ITER_all_snapshots, k,
2515 					  NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
2516 			if (!bkey_is_inode(k.k))
2517 				continue;
2518 
2519 			if (bch2_inode_flags(k) & BCH_INODE_unlinked)
2520 				continue;
2521 
2522 			check_path(trans, &path, k);
2523 		})));
2524 	darray_exit(&path);
2525 
2526 	bch_err_fn(c, ret);
2527 	return ret;
2528 }
2529 
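/*
 * Table of (inum, snapshot) -> observed link count, filled in btree order so
 * it can be bsearched by inode number; built and processed one range of inode
 * numbers at a time to bound memory usage:
 */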
2530 struct nlink_table {
2531 	size_t		nr;
2532 	size_t		size;
2533 
2534 	struct nlink {
2535 		u64	inum;
2536 		u32	snapshot;
2537 		u32	count;
2538 	}		*d;
2539 };
2540 
2541 static int add_nlink(struct bch_fs *c, struct nlink_table *t,
2542 		     u64 inum, u32 snapshot)
2543 {
2544 	if (t->nr == t->size) {
2545 		size_t new_size = max_t(size_t, 128UL, t->size * 2);
2546 		void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);
2547 
2548 		if (!d) {
2549 			bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
2550 				new_size);
2551 			return -BCH_ERR_ENOMEM_fsck_add_nlink;
2552 		}
2553 
2554 		if (t->d)
2555 			memcpy(d, t->d, t->size * sizeof(t->d[0]));
2556 		kvfree(t->d);
2557 
2558 		t->d = d;
2559 		t->size = new_size;
2560 	}
2561 
2563 	t->d[t->nr++] = (struct nlink) {
2564 		.inum		= inum,
2565 		.snapshot	= snapshot,
2566 	};
2567 
2568 	return 0;
2569 }
2570 
2571 static int nlink_cmp(const void *_l, const void *_r)
2572 {
2573 	const struct nlink *l = _l;
2574 	const struct nlink *r = _r;
2575 
2576 	return cmp_int(l->inum, r->inum);
2577 }
2578 
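/*
 * Found a dirent referencing @inum: if it's in the current range, bsearch for
 * its entries and bump the count for each snapshot the reference is visible
 * in:
 */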
2579 static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
2580 		     struct nlink_table *links,
2581 		     u64 range_start, u64 range_end, u64 inum, u32 snapshot)
2582 {
2583 	struct nlink *link, key = {
2584 		.inum = inum, .snapshot = U32_MAX,
2585 	};
2586 
2587 	if (inum < range_start || inum >= range_end)
2588 		return;
2589 
2590 	link = __inline_bsearch(&key, links->d, links->nr,
2591 				sizeof(links->d[0]), nlink_cmp);
2592 	if (!link)
2593 		return;
2594 
2595 	while (link > links->d && link[0].inum == link[-1].inum)
2596 		--link;
2597 
2598 	for (; link < links->d + links->nr && link->inum == inum; link++)
2599 		if (ref_visible(c, s, snapshot, link->snapshot)) {
2600 			link->count++;
2601 			if (link->snapshot >= snapshot)
2602 				break;
2603 		}
2604 }
2605 
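/*
 * First pass: record every non-directory inode with nonzero i_nlink, starting
 * from @start; if we run out of memory for the table, note in @end where to
 * resume so the caller can process inodes in ranges:
 */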
2606 noinline_for_stack
2607 static int check_nlinks_find_hardlinks(struct bch_fs *c,
2608 				       struct nlink_table *t,
2609 				       u64 start, u64 *end)
2610 {
2611 	int ret = bch2_trans_run(c,
2612 		for_each_btree_key(trans, iter, BTREE_ID_inodes,
2613 				   POS(0, start),
2614 				   BTREE_ITER_intent|
2615 				   BTREE_ITER_prefetch|
2616 				   BTREE_ITER_all_snapshots, k, ({
2617 			if (!bkey_is_inode(k.k))
2618 				continue;
2619 
2620 			/* Should never fail, checked by bch2_inode_invalid: */
2621 			struct bch_inode_unpacked u;
2622 			BUG_ON(bch2_inode_unpack(k, &u));
2623 
2624 			/*
2625 			 * Backpointer and directory structure checks are sufficient for
2626 			 * directories, since they can't have hardlinks:
2627 			 */
2628 			if (S_ISDIR(u.bi_mode))
2629 				continue;
2630 
2631 			if (!u.bi_nlink)
2632 				continue;
2633 
2634 			ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
2635 			if (ret) {
2636 				*end = k.k->p.offset;
2637 				ret = 0;
2638 				break;
2639 			}
2640 			0;
2641 		})));
2642 
2643 	bch_err_fn(c, ret);
2644 	return ret;
2645 }
2646 
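/*
 * Second pass: walk dirents and count references to the inodes collected in
 * the first pass; directory and subvolume dirents are skipped since those
 * can't be hardlinked:
 */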
2647 noinline_for_stack
2648 static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
2649 				     u64 range_start, u64 range_end)
2650 {
2651 	struct snapshots_seen s;
2652 
2653 	snapshots_seen_init(&s);
2654 
2655 	int ret = bch2_trans_run(c,
2656 		for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
2657 				   BTREE_ITER_intent|
2658 				   BTREE_ITER_prefetch|
2659 				   BTREE_ITER_all_snapshots, k, ({
2660 			ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
2661 			if (ret)
2662 				break;
2663 
2664 			if (k.k->type == KEY_TYPE_dirent) {
2665 				struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
2666 
2667 				if (d.v->d_type != DT_DIR &&
2668 				    d.v->d_type != DT_SUBVOL)
2669 					inc_link(c, &s, links, range_start, range_end,
2670 						 le64_to_cpu(d.v->d_inum), d.k->p.snapshot);
2671 			}
2672 			0;
2673 		})));
2674 
2675 	snapshots_seen_exit(&s);
2676 
2677 	bch_err_fn(c, ret);
2678 	return ret;
2679 }
2680 
2681 static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
2682 				     struct bkey_s_c k,
2683 				     struct nlink_table *links,
2684 				     size_t *idx, u64 range_end)
2685 {
2686 	struct bch_fs *c = trans->c;
2687 	struct bch_inode_unpacked u;
2688 	struct nlink *link = &links->d[*idx];
2689 	int ret = 0;
2690 
2691 	if (k.k->p.offset >= range_end)
2692 		return 1;
2693 
2694 	if (!bkey_is_inode(k.k))
2695 		return 0;
2696 
2697 	BUG_ON(bch2_inode_unpack(k, &u));
2698 
2699 	if (S_ISDIR(u.bi_mode))
2700 		return 0;
2701 
2702 	if (!u.bi_nlink)
2703 		return 0;
2704 
2705 	while ((cmp_int(link->inum, k.k->p.offset) ?:
2706 		cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
2707 		BUG_ON(*idx == links->nr);
2708 		link = &links->d[++*idx];
2709 	}
2710 
2711 	if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
2712 			c, inode_wrong_nlink,
2713 			"inode %llu type %s has wrong i_nlink (%u, should be %u)",
2714 			u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
2715 			bch2_inode_nlink_get(&u), link->count)) {
2716 		bch2_inode_nlink_set(&u, link->count);
2717 		ret = __bch2_fsck_write_inode(trans, &u, k.k->p.snapshot);
2718 	}
2719 fsck_err:
2720 	return ret;
2721 }
2722 
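/*
 * Third pass: walk inodes in the current range again and repair any i_nlink
 * that disagrees with the observed link count:
 */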
2723 noinline_for_stack
2724 static int check_nlinks_update_hardlinks(struct bch_fs *c,
2725 			       struct nlink_table *links,
2726 			       u64 range_start, u64 range_end)
2727 {
2728 	size_t idx = 0;
2729 
2730 	int ret = bch2_trans_run(c,
2731 		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
2732 				POS(0, range_start),
2733 				BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
2734 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2735 			check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
2736 	if (ret < 0) {
2737 		bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
2738 		return ret;
2739 	}
2740 
2741 	return 0;
2742 }
2743 
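/*
 * Check and repair inode link counts, processing the inode number space in
 * ranges small enough for the nlink table to fit in memory:
 */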
2744 int bch2_check_nlinks(struct bch_fs *c)
2745 {
2746 	struct nlink_table links = { 0 };
2747 	u64 this_iter_range_start, next_iter_range_start = 0;
2748 	int ret = 0;
2749 
2750 	do {
2751 		this_iter_range_start = next_iter_range_start;
2752 		next_iter_range_start = U64_MAX;
2753 
2754 		ret = check_nlinks_find_hardlinks(c, &links,
2755 						  this_iter_range_start,
2756 						  &next_iter_range_start);
		if (ret)
			break;
2757 
2758 		ret = check_nlinks_walk_dirents(c, &links,
2759 					  this_iter_range_start,
2760 					  next_iter_range_start);
2761 		if (ret)
2762 			break;
2763 
2764 		ret = check_nlinks_update_hardlinks(c, &links,
2765 					 this_iter_range_start,
2766 					 next_iter_range_start);
2767 		if (ret)
2768 			break;
2769 
2770 		links.nr = 0;
2771 	} while (next_iter_range_start != U64_MAX);
2772 
2773 	kvfree(links.d);
2774 	bch_err_fn(c, ret);
2775 	return ret;
2776 }
2777 
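/*
 * Zero out front_pad/back_pad on reflink_p keys; bch2_fix_reflink_p() only
 * runs this on filesystems from before the reflink_p_fix version:
 */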
2778 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
2779 			     struct bkey_s_c k)
2780 {
2781 	struct bkey_s_c_reflink_p p;
2782 	struct bkey_i_reflink_p *u;
2783 
2784 	if (k.k->type != KEY_TYPE_reflink_p)
2785 		return 0;
2786 
2787 	p = bkey_s_c_to_reflink_p(k);
2788 
2789 	if (!p.v->front_pad && !p.v->back_pad)
2790 		return 0;
2791 
2792 	u = bch2_trans_kmalloc(trans, sizeof(*u));
2793 	int ret = PTR_ERR_OR_ZERO(u);
2794 	if (ret)
2795 		return ret;
2796 
2797 	bkey_reassemble(&u->k_i, k);
2798 	u->v.front_pad	= 0;
2799 	u->v.back_pad	= 0;
2800 
2801 	return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun);
2802 }
2803 
2804 int bch2_fix_reflink_p(struct bch_fs *c)
2805 {
2806 	if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
2807 		return 0;
2808 
2809 	int ret = bch2_trans_run(c,
2810 		for_each_btree_key_commit(trans, iter,
2811 				BTREE_ID_extents, POS_MIN,
2812 				BTREE_ITER_intent|BTREE_ITER_prefetch|
2813 				BTREE_ITER_all_snapshots, k,
2814 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2815 			fix_reflink_p_key(trans, &iter, k)));
2816 	bch_err_fn(c, ret);
2817 	return ret;
2818 }
2819