xref: /linux/fs/bcachefs/fsck.c (revision 2fe3c78a2c26dd5ee811024a1b7d6cfb4d654319)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_cache.h"
6 #include "btree_update.h"
7 #include "buckets.h"
8 #include "darray.h"
9 #include "dirent.h"
10 #include "error.h"
11 #include "fs.h"
12 #include "fs-common.h"
13 #include "fsck.h"
14 #include "inode.h"
15 #include "keylist.h"
16 #include "recovery_passes.h"
17 #include "snapshot.h"
18 #include "super.h"
19 #include "xattr.h"
20 
21 #include <linux/bsearch.h>
22 #include <linux/dcache.h> /* struct qstr */
23 
24 /*
25  * XXX: this is handling transaction restarts without returning
26  * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
27  */
28 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
29 				    u32 snapshot)
30 {
31 	u64 sectors = 0;
32 
33 	int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
34 				SPOS(inum, 0, snapshot),
35 				POS(inum, U64_MAX),
36 				0, k, ({
37 		if (bkey_extent_is_allocation(k.k))
38 			sectors += k.k->size;
39 		0;
40 	}));
41 
42 	return ret ?: sectors;
43 }
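/*
 * Note: for_each_btree_key_upto() handles transaction restarts internally
 * rather than returning -BCH_ERR_transaction_restart_nested - that's what the
 * XXX above refers to: a caller that has already read other state in the same
 * transaction isn't told that state may now be stale. bch2_count_subdirs()
 * below has the same caveat.
 */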
44 
45 static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
46 				    u32 snapshot)
47 {
48 	u64 subdirs = 0;
49 
50 	int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_dirents,
51 				    SPOS(inum, 0, snapshot),
52 				    POS(inum, U64_MAX),
53 				    0, k, ({
54 		if (k.k->type == KEY_TYPE_dirent &&
55 		    bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
56 			subdirs++;
57 		0;
58 	}));
59 
60 	return ret ?: subdirs;
61 }
62 
63 static int subvol_lookup(struct btree_trans *trans, u32 subvol,
64 			 u32 *snapshot, u64 *inum)
65 {
66 	struct bch_subvolume s;
67 	int ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
68 
69 	*snapshot = le32_to_cpu(s.snapshot);
70 	*inum = le64_to_cpu(s.inode);
71 	return ret;
72 }
73 
74 static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
75 			      struct bch_inode_unpacked *inode)
76 {
77 	struct btree_iter iter;
78 	struct bkey_s_c k;
79 	int ret;
80 
81 	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr),
82 				     BTREE_ITER_all_snapshots, k, ret) {
83 		if (k.k->p.offset != inode_nr)
84 			break;
85 		if (!bkey_is_inode(k.k))
86 			continue;
87 		ret = bch2_inode_unpack(k, inode);
88 		goto found;
89 	}
90 	ret = -BCH_ERR_ENOENT_inode;
91 found:
92 	bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
93 	bch2_trans_iter_exit(trans, &iter);
94 	return ret;
95 }
96 
97 static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
98 			struct bch_inode_unpacked *inode,
99 			u32 *snapshot)
100 {
101 	struct btree_iter iter;
102 	struct bkey_s_c k;
103 	int ret;
104 
105 	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
106 			       SPOS(0, inode_nr, *snapshot), 0);
107 	ret = bkey_err(k);
108 	if (ret)
109 		goto err;
110 
111 	ret = bkey_is_inode(k.k)
112 		? bch2_inode_unpack(k, inode)
113 		: -BCH_ERR_ENOENT_inode;
114 	if (!ret)
115 		*snapshot = iter.pos.snapshot;
116 err:
117 	bch2_trans_iter_exit(trans, &iter);
118 	return ret;
119 }
120 
121 static int lookup_dirent_in_snapshot(struct btree_trans *trans,
122 			   struct bch_hash_info hash_info,
123 			   subvol_inum dir, struct qstr *name,
124 			   u64 *target, unsigned *type, u32 snapshot)
125 {
126 	struct btree_iter iter;
127 	struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
128 							 &hash_info, dir, name, 0, snapshot);
129 	int ret = bkey_err(k);
130 	if (ret)
131 		return ret;
132 
133 	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
134 	*target = le64_to_cpu(d.v->d_inum);
135 	*type = d.v->d_type;
136 	bch2_trans_iter_exit(trans, &iter);
137 	return 0;
138 }
139 
140 static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
141 {
142 	struct bch_fs *c = trans->c;
143 	struct btree_iter iter;
144 	struct bch_inode_unpacked dir_inode;
145 	struct bch_hash_info dir_hash_info;
146 	int ret;
147 
148 	ret = lookup_first_inode(trans, pos.inode, &dir_inode);
149 	if (ret)
150 		goto err;
151 
152 	dir_hash_info = bch2_hash_info_init(c, &dir_inode);
153 
154 	bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent);
155 
156 	ret =   bch2_btree_iter_traverse(&iter) ?:
157 		bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
158 				    &dir_hash_info, &iter,
159 				    BTREE_UPDATE_internal_snapshot_node);
160 	bch2_trans_iter_exit(trans, &iter);
161 err:
162 	bch_err_fn(c, ret);
163 	return ret;
164 }
165 
166 /* Get lost+found, create if it doesn't exist: */
167 static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
168 			    struct bch_inode_unpacked *lostfound,
169 			    u64 reattaching_inum)
170 {
171 	struct bch_fs *c = trans->c;
172 	struct qstr lostfound_str = QSTR("lost+found");
173 	u64 inum = 0;
174 	unsigned d_type = 0;
175 	int ret;
176 
177 	struct bch_snapshot_tree st;
178 	ret = bch2_snapshot_tree_lookup(trans,
179 			bch2_snapshot_tree(c, snapshot), &st);
180 	if (ret)
181 		return ret;
182 
183 	subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) };
184 
185 	struct bch_subvolume subvol;
186 	ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol),
187 				 false, 0, &subvol);
188 	bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u",
189 		    le32_to_cpu(st.master_subvol), snapshot);
190 	if (ret)
191 		return ret;
192 
193 	if (!subvol.inode) {
194 		struct btree_iter iter;
195 		struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter,
196 				BTREE_ID_subvolumes, POS(0, le32_to_cpu(st.master_subvol)),
197 				0, subvolume);
198 		ret = PTR_ERR_OR_ZERO(subvol);
199 		if (ret)
200 			return ret;
201 
202 		subvol->v.inode = cpu_to_le64(reattaching_inum);
203 		bch2_trans_iter_exit(trans, &iter);
204 	}
205 
206 	root_inum.inum = le64_to_cpu(subvol.inode);
207 
208 	struct bch_inode_unpacked root_inode;
209 	struct bch_hash_info root_hash_info;
210 	u32 root_inode_snapshot = snapshot;
211 	ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot);
212 	bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
213 		    root_inum.inum, le32_to_cpu(st.master_subvol));
214 	if (ret)
215 		return ret;
216 
217 	root_hash_info = bch2_hash_info_init(c, &root_inode);
218 
219 	ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
220 			      &lostfound_str, &inum, &d_type, snapshot);
221 	if (bch2_err_matches(ret, ENOENT))
222 		goto create_lostfound;
223 
224 	bch_err_fn(c, ret);
225 	if (ret)
226 		return ret;
227 
228 	if (d_type != DT_DIR) {
229 		bch_err(c, "error looking up lost+found: not a directory");
230 		return -BCH_ERR_ENOENT_not_directory;
231 	}
232 
233 	/*
234 	 * The bch2_check_dirents pass has already run, dangling dirents
235 	 * shouldn't exist here:
236 	 */
237 	ret = lookup_inode(trans, inum, lostfound, &snapshot);
238 	bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
239 		    inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
240 	return ret;
241 
242 create_lostfound:
243 	/*
244 	 * XXX: we could have a nicer log message here if we had a nice way to
245 	 * walk backpointers to print a path
246 	 */
247 	bch_notice(c, "creating lost+found in snapshot %u", le32_to_cpu(st.root_snapshot));
248 
249 	u64 now = bch2_current_time(c);
250 	struct btree_iter lostfound_iter = { NULL };
251 	u64 cpu = raw_smp_processor_id();
252 
253 	bch2_inode_init_early(c, lostfound);
254 	bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
255 	lostfound->bi_dir = root_inode.bi_inum;
256 
257 	root_inode.bi_nlink++;
258 
259 	ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
260 	if (ret)
261 		goto err;
262 
263 	bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot);
264 	ret = bch2_btree_iter_traverse(&lostfound_iter);
265 	if (ret)
266 		goto err;
267 
268 	ret =   bch2_dirent_create_snapshot(trans,
269 				0, root_inode.bi_inum, snapshot, &root_hash_info,
270 				mode_to_type(lostfound->bi_mode),
271 				&lostfound_str,
272 				lostfound->bi_inum,
273 				&lostfound->bi_dir_offset,
274 				STR_HASH_must_create) ?:
275 		bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
276 				       BTREE_UPDATE_internal_snapshot_node);
277 err:
278 	bch_err_msg(c, ret, "creating lost+found");
279 	bch2_trans_iter_exit(trans, &lostfound_iter);
280 	return ret;
281 }
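/*
 * lookup_lostfound() create path, in short: lost+found is created as a 0700
 * directory under the root inode of the snapshot tree's master subvolume,
 * with its dirent inserted directly in @snapshot via
 * bch2_dirent_create_snapshot() (STR_HASH_must_create), so the repair lands
 * in the snapshot being fixed.
 */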
282 
283 static int reattach_inode(struct btree_trans *trans,
284 			  struct bch_inode_unpacked *inode,
285 			  u32 inode_snapshot)
286 {
287 	struct bch_fs *c = trans->c;
288 	struct bch_hash_info dir_hash;
289 	struct bch_inode_unpacked lostfound;
290 	char name_buf[20];
291 	struct qstr name;
292 	u64 dir_offset = 0;
293 	u32 dirent_snapshot = inode_snapshot;
294 	int ret;
295 
296 	if (inode->bi_subvol) {
297 		inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;
298 
299 		u64 root_inum;
300 		ret = subvol_lookup(trans, inode->bi_parent_subvol,
301 				    &dirent_snapshot, &root_inum);
302 		if (ret)
303 			return ret;
304 
305 		snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
306 	} else {
307 		snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
308 	}
309 
310 	ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
311 	if (ret)
312 		return ret;
313 
314 	if (S_ISDIR(inode->bi_mode)) {
315 		lostfound.bi_nlink++;
316 
317 		ret = __bch2_fsck_write_inode(trans, &lostfound, U32_MAX);
318 		if (ret)
319 			return ret;
320 	}
321 
322 	dir_hash = bch2_hash_info_init(c, &lostfound);
323 
324 	name = (struct qstr) QSTR(name_buf);
325 
326 	ret = bch2_dirent_create_snapshot(trans,
327 				inode->bi_parent_subvol, lostfound.bi_inum,
328 				dirent_snapshot,
329 				&dir_hash,
330 				inode_d_type(inode),
331 				&name,
332 				inode->bi_subvol ?: inode->bi_inum,
333 				&dir_offset,
334 				STR_HASH_must_create);
335 	if (ret) {
336 		bch_err_msg(c, ret, "error creating dirent");
337 		return ret;
338 	}
339 
340 	inode->bi_dir		= lostfound.bi_inum;
341 	inode->bi_dir_offset	= dir_offset;
342 
343 	return __bch2_fsck_write_inode(trans, inode, inode_snapshot);
344 }
345 
346 static int remove_backpointer(struct btree_trans *trans,
347 			      struct bch_inode_unpacked *inode)
348 {
349 	struct btree_iter iter;
350 	struct bkey_s_c_dirent d;
351 	int ret;
352 
353 	d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents,
354 				     POS(inode->bi_dir, inode->bi_dir_offset), 0,
355 				     dirent);
356 	ret =   bkey_err(d) ?:
357 		__remove_dirent(trans, d.k->p);
358 	bch2_trans_iter_exit(trans, &iter);
359 	return ret;
360 }
361 
362 static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s)
363 {
364 	struct bch_fs *c = trans->c;
365 
366 	struct bch_inode_unpacked inode;
367 	int ret = bch2_inode_find_by_inum_trans(trans,
368 				(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
369 				&inode);
370 	if (ret)
371 		return ret;
372 
373 	ret = remove_backpointer(trans, &inode);
374 	bch_err_msg(c, ret, "removing dirent");
375 	if (ret)
376 		return ret;
377 
378 	ret = reattach_inode(trans, &inode, le32_to_cpu(s.v->snapshot));
379 	bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
380 	return ret;
381 }
382 
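/*
 * Rebuild a missing subvolume from what's left: create a root inode if the
 * caller didn't supply one, insert a subvolume key pointing at (@snapshotid,
 * @inum), mark the snapshot as belonging to that subvolume, and set the
 * snapshot tree's master_subvol if it was unset.
 */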
383 static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
384 {
385 	struct bch_fs *c = trans->c;
386 
387 	if (!bch2_snapshot_is_leaf(c, snapshotid)) {
388 		bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
389 		return -BCH_ERR_fsck_repair_unimplemented;
390 	}
391 
392 	/*
393 	 * If inum isn't set, that means we're being called from check_dirents,
394 	 * not check_inodes - the root of this subvolume doesn't exist or we
395 	 * would have found it there:
396 	 */
397 	if (!inum) {
398 		struct btree_iter inode_iter = {};
399 		struct bch_inode_unpacked new_inode;
400 		u64 cpu = raw_smp_processor_id();
401 
402 		bch2_inode_init_early(c, &new_inode);
403 		bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);
404 
405 		new_inode.bi_subvol = subvolid;
406 
407 		int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
408 			  bch2_btree_iter_traverse(&inode_iter) ?:
409 			  bch2_inode_write(trans, &inode_iter, &new_inode);
410 		bch2_trans_iter_exit(trans, &inode_iter);
411 		if (ret)
412 			return ret;
413 
414 		inum = new_inode.bi_inum;
415 	}
416 
417 	bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);
418 
419 	struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
420 	int ret = PTR_ERR_OR_ZERO(new_subvol);
421 	if (ret)
422 		return ret;
423 
424 	bkey_subvolume_init(&new_subvol->k_i);
425 	new_subvol->k.p.offset	= subvolid;
426 	new_subvol->v.snapshot	= cpu_to_le32(snapshotid);
427 	new_subvol->v.inode	= cpu_to_le64(inum);
428 	ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
429 	if (ret)
430 		return ret;
431 
432 	struct btree_iter iter;
433 	struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
434 			BTREE_ID_snapshots, POS(0, snapshotid),
435 			0, snapshot);
436 	ret = PTR_ERR_OR_ZERO(s);
437 	bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
438 	if (ret)
439 		return ret;
440 
441 	u32 snapshot_tree = le32_to_cpu(s->v.tree);
442 
443 	s->v.subvol = cpu_to_le32(subvolid);
444 	SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
445 	bch2_trans_iter_exit(trans, &iter);
446 
447 	struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
448 			BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
449 			0, snapshot_tree);
450 	ret = PTR_ERR_OR_ZERO(st);
451 	bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
452 	if (ret)
453 		return ret;
454 
455 	if (!st->v.master_subvol)
456 		st->v.master_subvol = cpu_to_le32(subvolid);
457 
458 	bch2_trans_iter_exit(trans, &iter);
459 	return 0;
460 }
461 
462 static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum)
463 {
464 	struct bch_fs *c = trans->c;
465 	unsigned i_mode = S_IFREG;
466 	u64 i_size = 0;
467 
468 	switch (btree) {
469 	case BTREE_ID_extents: {
470 		struct btree_iter iter = {};
471 
472 		bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
473 		struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter);
474 		bch2_trans_iter_exit(trans, &iter);
475 		int ret = bkey_err(k);
476 		if (ret)
477 			return ret;
478 
479 		i_size = k.k->p.offset << 9;
480 		break;
481 	}
482 	case BTREE_ID_dirents:
483 		i_mode = S_IFDIR;
484 		break;
485 	case BTREE_ID_xattrs:
486 		break;
487 	default:
488 		BUG();
489 	}
490 
491 	struct bch_inode_unpacked new_inode;
492 	bch2_inode_init_early(c, &new_inode);
493 	bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL);
494 	new_inode.bi_size = i_size;
495 	new_inode.bi_inum = inum;
496 
497 	return __bch2_fsck_write_inode(trans, &new_inode, snapshot);
498 }
499 
500 struct snapshots_seen {
501 	struct bpos			pos;
502 	snapshot_id_list		ids;
503 };
504 
505 static inline void snapshots_seen_exit(struct snapshots_seen *s)
506 {
507 	darray_exit(&s->ids);
508 }
509 
510 static inline void snapshots_seen_init(struct snapshots_seen *s)
511 {
512 	memset(s, 0, sizeof(*s));
513 }
514 
515 static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
516 {
517 	u32 *i;
518 	__darray_for_each(s->ids, i) {
519 		if (*i == id)
520 			return 0;
521 		if (*i > id)
522 			break;
523 	}
524 
525 	int ret = darray_insert_item(&s->ids, i - s->ids.data, id);
526 	if (ret)
527 		bch_err(c, "error reallocating snapshots_seen table (size %zu)",
528 			s->ids.size);
529 	return ret;
530 }
531 
532 static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
533 				 enum btree_id btree_id, struct bpos pos)
534 {
535 	if (!bkey_eq(s->pos, pos))
536 		s->ids.nr = 0;
537 	s->pos = pos;
538 
539 	return snapshot_list_add_nodup(c, &s->ids, pos.snapshot);
540 }
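/*
 * struct snapshots_seen tracks, for the current (inode, offset) position, the
 * sorted list of snapshot IDs already visited there. bkey_eq() ignores the
 * snapshot field, so the list accumulates across the per-snapshot versions of
 * one key and is reset when iteration moves to a new position.
 */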
541 
542 /**
543  * key_visible_in_snapshot - returns true if @id is a descendant of @ancestor,
544  * and @ancestor hasn't been overwritten in @seen
545  *
546  * @c:		filesystem handle
547  * @seen:	list of snapshot ids already seen at current position
548  * @id:		descendant snapshot id
549  * @ancestor:	ancestor snapshot id
550  *
551  * Returns:	whether key in @ancestor snapshot is visible in @id snapshot
552  */
553 static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
554 				    u32 id, u32 ancestor)
555 {
556 	ssize_t i;
557 
558 	EBUG_ON(id > ancestor);
559 
560 	/* @ancestor should be the snapshot most recently added to @seen */
561 	EBUG_ON(ancestor != seen->pos.snapshot);
562 	EBUG_ON(ancestor != darray_last(seen->ids));
563 
564 	if (id == ancestor)
565 		return true;
566 
567 	if (!bch2_snapshot_is_ancestor(c, id, ancestor))
568 		return false;
569 
570 	/*
571 	 * We know that @id is a descendant of @ancestor, we're checking if
572 	 * we've seen a key that overwrote @ancestor - i.e. also a descendant of
573 	 * @ancestor and with @id as a descendant.
574 	 *
575 	 * But we already know that we're scanning IDs between @id and @ancestor
576 	 * numerically, since snapshot ID lists are kept sorted, so if we find
577 	 * an id that's an ancestor of @id we're done:
578 	 */
579 
580 	for (i = seen->ids.nr - 2;
581 	     i >= 0 && seen->ids.data[i] >= id;
582 	     --i)
583 		if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]))
584 			return false;
585 
586 	return true;
587 }
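/*
 * Worked example, with hypothetical snapshot IDs: say keys were seen in
 * snapshots { 3, 4 } at this position, and we ask whether the key in
 * @ancestor = 4 is visible in @id = 2. If 3 is also an ancestor of 2, the key
 * in 4 has been overwritten as far as 2 is concerned and the answer is false;
 * otherwise the scan finds no intervening ancestor and the key is visible.
 */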
588 
589 /**
590  * ref_visible - given a key with snapshot id @src that points to a key with
591  * snapshot id @dst, test whether there is some snapshot in which @dst is
592  * visible.
593  *
594  * @c:		filesystem handle
595  * @s:		list of snapshot IDs already seen at @src
596  * @src:	snapshot ID of src key
597  * @dst:	snapshot ID of dst key
598  * Returns:	true if there is some snapshot in which @dst is visible
599  *
600  * Assumes we're visiting @src keys in natural key order
601  */
602 static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
603 			u32 src, u32 dst)
604 {
605 	return dst <= src
606 		? key_visible_in_snapshot(c, s, dst, src)
607 		: bch2_snapshot_is_ancestor(c, src, dst);
608 }
609 
610 static int ref_visible2(struct bch_fs *c,
611 			u32 src, struct snapshots_seen *src_seen,
612 			u32 dst, struct snapshots_seen *dst_seen)
613 {
614 	if (dst > src) {
615 		swap(dst, src);
616 		swap(dst_seen, src_seen);
617 	}
618 	return key_visible_in_snapshot(c, src_seen, dst, src);
619 }
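/*
 * ref_visible2() is the two-sided variant: each key carries its own
 * snapshots_seen list, so the arguments are normalized so that @src is the
 * higher (ancestor-side) snapshot ID and then the same visibility check as
 * above is applied.
 */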
620 
621 #define for_each_visible_inode(_c, _s, _w, _snapshot, _i)				\
622 	for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr &&	\
623 	     (_i)->snapshot <= (_snapshot); _i++)					\
624 		if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
625 
626 struct inode_walker_entry {
627 	struct bch_inode_unpacked inode;
628 	u32			snapshot;
629 	bool			seen_this_pos;
630 	u64			count;
631 };
632 
633 struct inode_walker {
634 	bool				first_this_inode;
635 	bool				recalculate_sums;
636 	struct bpos			last_pos;
637 
638 	DARRAY(struct inode_walker_entry) inodes;
639 };
640 
641 static void inode_walker_exit(struct inode_walker *w)
642 {
643 	darray_exit(&w->inodes);
644 }
645 
646 static struct inode_walker inode_walker_init(void)
647 {
648 	return (struct inode_walker) { 0, };
649 }
650 
651 static int add_inode(struct bch_fs *c, struct inode_walker *w,
652 		     struct bkey_s_c inode)
653 {
654 	struct bch_inode_unpacked u;
655 
656 	BUG_ON(bch2_inode_unpack(inode, &u));
657 
658 	return darray_push(&w->inodes, ((struct inode_walker_entry) {
659 		.inode		= u,
660 		.snapshot	= inode.k->p.snapshot,
661 	}));
662 }
663 
664 static int get_inodes_all_snapshots(struct btree_trans *trans,
665 				    struct inode_walker *w, u64 inum)
666 {
667 	struct bch_fs *c = trans->c;
668 	struct btree_iter iter;
669 	struct bkey_s_c k;
670 	int ret;
671 
672 	w->recalculate_sums = false;
673 	w->inodes.nr = 0;
674 
675 	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
676 				     BTREE_ITER_all_snapshots, k, ret) {
677 		if (k.k->p.offset != inum)
678 			break;
679 
680 		if (bkey_is_inode(k.k))
681 			add_inode(c, w, k);
682 	}
683 	bch2_trans_iter_exit(trans, &iter);
684 
685 	if (ret)
686 		return ret;
687 
688 	w->first_this_inode = true;
689 	return 0;
690 }
691 
692 static struct inode_walker_entry *
693 lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k)
694 {
695 	bool is_whiteout = k.k->type == KEY_TYPE_whiteout;
696 
697 	struct inode_walker_entry *i;
698 	__darray_for_each(w->inodes, i)
699 		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot))
700 			goto found;
701 
702 	return NULL;
703 found:
704 	BUG_ON(k.k->p.snapshot > i->snapshot);
705 
706 	if (k.k->p.snapshot != i->snapshot && !is_whiteout) {
707 		struct inode_walker_entry new = *i;
708 
709 		new.snapshot = k.k->p.snapshot;
710 		new.count = 0;
711 
712 		struct printbuf buf = PRINTBUF;
713 		bch2_bkey_val_to_text(&buf, c, k);
714 
715 		bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
716 			 "unexpected because we should always update the inode when we update a key in that inode\n"
717 			 "%s",
718 			 w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf);
719 		printbuf_exit(&buf);
720 
721 		while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot)
722 			--i;
723 
724 		size_t pos = i - w->inodes.data;
725 		int ret = darray_insert_item(&w->inodes, pos, new);
726 		if (ret)
727 			return ERR_PTR(ret);
728 
729 		i = w->inodes.data + pos;
730 	}
731 
732 	return i;
733 }
734 
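/*
 * walk_inode() is how the per-key passes (extents, dirents, xattrs) find the
 * inode a key belongs to: the first time a new inode number is seen it loads
 * every snapshot version of that inode, then returns the entry whose snapshot
 * is the nearest ancestor of the key's snapshot. Each entry's 'count' field
 * accumulates sectors/subdirs, checked against the inode afterwards.
 */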
735 static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
736 					     struct inode_walker *w,
737 					     struct bkey_s_c k)
738 {
739 	if (w->last_pos.inode != k.k->p.inode) {
740 		int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
741 		if (ret)
742 			return ERR_PTR(ret);
743 	} else if (bkey_cmp(w->last_pos, k.k->p)) {
744 		darray_for_each(w->inodes, i)
745 			i->seen_this_pos = false;
746 	}
747 
748 	w->last_pos = k.k->p;
749 
750 	return lookup_inode_for_snapshot(trans->c, w, k);
751 }
752 
753 static int get_visible_inodes(struct btree_trans *trans,
754 			      struct inode_walker *w,
755 			      struct snapshots_seen *s,
756 			      u64 inum)
757 {
758 	struct bch_fs *c = trans->c;
759 	struct btree_iter iter;
760 	struct bkey_s_c k;
761 	int ret;
762 
763 	w->inodes.nr = 0;
764 
765 	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
766 			   BTREE_ITER_all_snapshots, k, ret) {
767 		if (k.k->p.offset != inum)
768 			break;
769 
770 		if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot))
771 			continue;
772 
773 		if (bkey_is_inode(k.k))
774 			add_inode(c, w, k);
775 
776 		if (k.k->p.snapshot >= s->pos.snapshot)
777 			break;
778 	}
779 	bch2_trans_iter_exit(trans, &iter);
780 
781 	return ret;
782 }
783 
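/*
 * A dirent/xattr sitting at the wrong slot for its hash is repaired by
 * deleting it in place and re-inserting it through the normal hash table
 * insert path (STR_HASH_must_create), committing immediately.
 */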
784 static int hash_redo_key(struct btree_trans *trans,
785 			 const struct bch_hash_desc desc,
786 			 struct bch_hash_info *hash_info,
787 			 struct btree_iter *k_iter, struct bkey_s_c k)
788 {
789 	struct bkey_i *delete;
790 	struct bkey_i *tmp;
791 
792 	delete = bch2_trans_kmalloc(trans, sizeof(*delete));
793 	if (IS_ERR(delete))
794 		return PTR_ERR(delete);
795 
796 	tmp = bch2_bkey_make_mut_noupdate(trans, k);
797 	if (IS_ERR(tmp))
798 		return PTR_ERR(tmp);
799 
800 	bkey_init(&delete->k);
801 	delete->k.p = k_iter->pos;
802 	return  bch2_btree_iter_traverse(k_iter) ?:
803 		bch2_trans_update(trans, k_iter, delete, 0) ?:
804 		bch2_hash_set_in_snapshot(trans, desc, hash_info,
805 				       (subvol_inum) { 0, k.k->p.inode },
806 				       k.k->p.snapshot, tmp,
807 				       STR_HASH_must_create|
808 				       BTREE_UPDATE_internal_snapshot_node) ?:
809 		bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
810 }
811 
812 static int hash_check_key(struct btree_trans *trans,
813 			  const struct bch_hash_desc desc,
814 			  struct bch_hash_info *hash_info,
815 			  struct btree_iter *k_iter, struct bkey_s_c hash_k)
816 {
817 	struct bch_fs *c = trans->c;
818 	struct btree_iter iter = { NULL };
819 	struct printbuf buf = PRINTBUF;
820 	struct bkey_s_c k;
821 	u64 hash;
822 	int ret = 0;
823 
824 	if (hash_k.k->type != desc.key_type)
825 		return 0;
826 
827 	hash = desc.hash_bkey(hash_info, hash_k);
828 
829 	if (likely(hash == hash_k.k->p.offset))
830 		return 0;
831 
832 	if (hash_k.k->p.offset < hash)
833 		goto bad_hash;
834 
835 	for_each_btree_key_norestart(trans, iter, desc.btree_id,
836 				     SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
837 				     BTREE_ITER_slots, k, ret) {
838 		if (bkey_eq(k.k->p, hash_k.k->p))
839 			break;
840 
841 		if (fsck_err_on(k.k->type == desc.key_type &&
842 				!desc.cmp_bkey(k, hash_k),
843 				trans, hash_table_key_duplicate,
844 				"duplicate hash table keys:\n%s",
845 				(printbuf_reset(&buf),
846 				 bch2_bkey_val_to_text(&buf, c, hash_k),
847 				 buf.buf))) {
848 			ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
849 			break;
850 		}
851 
852 		if (bkey_deleted(k.k)) {
853 			bch2_trans_iter_exit(trans, &iter);
854 			goto bad_hash;
855 		}
856 	}
857 out:
858 	bch2_trans_iter_exit(trans, &iter);
859 	printbuf_exit(&buf);
860 	return ret;
861 bad_hash:
862 	if (fsck_err(trans, hash_table_key_wrong_offset,
863 		     "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
864 		     bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
865 		     (printbuf_reset(&buf),
866 		      bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
867 		ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
868 		bch_err_fn(c, ret);
869 		if (ret)
870 			return ret;
871 		ret = -BCH_ERR_transaction_restart_nested;
872 	}
873 fsck_err:
874 	goto out;
875 }
876 
877 static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
878 						struct btree_iter *iter,
879 						struct bpos pos)
880 {
881 	return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
882 }
883 
884 static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
885 					       struct btree_iter *iter,
886 					       struct bch_inode_unpacked *inode,
887 					       u32 *snapshot)
888 {
889 	if (inode->bi_subvol) {
890 		u64 inum;
891 		int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
892 		if (ret)
893 			return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) });
894 	}
895 
896 	return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
897 }
898 
899 static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
900 				   struct bkey_s_c_dirent d)
901 {
902 	return  inode->bi_dir		== d.k->p.inode &&
903 		inode->bi_dir_offset	== d.k->p.offset;
904 }
905 
906 static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
907 				   struct bch_inode_unpacked *inode)
908 {
909 	return d.v->d_type == DT_SUBVOL
910 		? le32_to_cpu(d.v->d_child_subvol)	== inode->bi_subvol
911 		: le64_to_cpu(d.v->d_inum)		== inode->bi_inum;
912 }
913 
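/*
 * Returns 1 if @p is on the deleted-inodes list, 0 if it isn't, or a negative
 * error code - the "?: k.k->type == KEY_TYPE_set" folds the membership test
 * into the return value.
 */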
914 static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
915 {
916 	struct btree_iter iter;
917 	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
918 	int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
919 	bch2_trans_iter_exit(trans, &iter);
920 	return ret;
921 }
922 
923 static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c inode_k,
924 				    struct bch_inode_unpacked *inode,
925 				    u32 inode_snapshot, bool *write_inode)
926 {
927 	struct bch_fs *c = trans->c;
928 	struct printbuf buf = PRINTBUF;
929 
930 	struct btree_iter dirent_iter = {};
931 	struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
932 	int ret = bkey_err(d);
933 	if (ret && !bch2_err_matches(ret, ENOENT))
934 		return ret;
935 
936 	if (fsck_err_on(ret,
937 			trans, inode_points_to_missing_dirent,
938 			"inode points to missing dirent\n%s",
939 			(bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) ||
940 	    fsck_err_on(!ret && !dirent_points_to_inode(d, inode),
941 			trans, inode_points_to_wrong_dirent,
942 			"inode points to dirent that does not point back:\n%s",
943 			(bch2_bkey_val_to_text(&buf, c, inode_k),
944 			 prt_newline(&buf),
945 			 bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
946 		/*
947 		 * We just clear the backpointer fields for now. If we find a
948 		 * dirent that points to this inode in check_dirents(), we'll
949 		 * update it then; then when we get to check_path() if the
950 		 * backpointer is still 0 we'll reattach it.
951 		 */
952 		inode->bi_dir = 0;
953 		inode->bi_dir_offset = 0;
954 		inode->bi_flags &= ~BCH_INODE_backptr_untrusted;
955 		*write_inode = true;
956 	}
957 
958 	ret = 0;
959 fsck_err:
960 	bch2_trans_iter_exit(trans, &dirent_iter);
961 	printbuf_exit(&buf);
962 	bch_err_fn(c, ret);
963 	return ret;
964 }
965 
966 static bool bch2_inode_open(struct bch_fs *c, struct bpos p)
967 {
968 	subvol_inum inum = {
969 		.subvol = snapshot_t(c, p.snapshot)->subvol,
970 		.inum	= p.offset,
971 	};
972 
973 	/* snapshot tree corruption, can't safely delete */
974 	if (!inum.subvol) {
975 		bch_err_ratelimited(c, "%s(): snapshot %u has no subvol", __func__, p.snapshot);
976 		return true;
977 	}
978 
979 	return __bch2_inode_hash_find(c, inum) != NULL;
980 }
981 
982 static int check_inode(struct btree_trans *trans,
983 		       struct btree_iter *iter,
984 		       struct bkey_s_c k,
985 		       struct bch_inode_unpacked *prev,
986 		       struct snapshots_seen *s,
987 		       bool full)
988 {
989 	struct bch_fs *c = trans->c;
990 	struct bch_inode_unpacked u;
991 	bool do_update = false;
992 	int ret;
993 
994 	ret = bch2_check_key_has_snapshot(trans, iter, k);
995 	if (ret < 0)
996 		goto err;
997 	if (ret)
998 		return 0;
999 
1000 	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
1001 	if (ret)
1002 		goto err;
1003 
1004 	if (!bkey_is_inode(k.k))
1005 		return 0;
1006 
1007 	BUG_ON(bch2_inode_unpack(k, &u));
1008 
1009 	if (!full &&
1010 	    !(u.bi_flags & (BCH_INODE_i_size_dirty|
1011 			    BCH_INODE_i_sectors_dirty|
1012 			    BCH_INODE_unlinked)))
1013 		return 0;
1014 
1015 	if (prev->bi_inum != u.bi_inum)
1016 		*prev = u;
1017 
1018 	if (fsck_err_on(prev->bi_hash_seed	!= u.bi_hash_seed ||
1019 			inode_d_type(prev)	!= inode_d_type(&u),
1020 			trans, inode_snapshot_mismatch,
1021 			"inodes in different snapshots don't match")) {
1022 		bch_err(c, "repair not implemented yet");
1023 		return -BCH_ERR_fsck_repair_unimplemented;
1024 	}
1025 
1026 	if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) &&
1027 	    bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) {
1028 		struct bpos new_min_pos;
1029 
1030 		ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos);
1031 		if (ret)
1032 			goto err;
1033 
1034 		u.bi_flags &= ~(BCH_INODE_i_size_dirty|BCH_INODE_unlinked);
1035 
1036 		ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
1037 
1038 		bch_err_msg(c, ret, "in fsck updating inode");
1039 		if (ret)
1040 			return ret;
1041 
1042 		if (!bpos_eq(new_min_pos, POS_MIN))
1043 			bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos));
1044 		return 0;
1045 	}
1046 
1047 	if (u.bi_flags & BCH_INODE_unlinked) {
1048 		ret = check_inode_deleted_list(trans, k.k->p);
1049 		if (ret < 0)
1050 			return ret;
1051 
1052 		fsck_err_on(!ret,
1053 			    trans, unlinked_inode_not_on_deleted_list,
1054 			    "inode %llu:%u unlinked, but not on deleted list",
1055 			    u.bi_inum, k.k->p.snapshot);
1056 		ret = 0;
1057 	}
1058 
1059 	if (u.bi_flags & BCH_INODE_unlinked &&
1060 	    !bch2_inode_open(c, k.k->p) &&
1061 	    (!c->sb.clean ||
1062 	     fsck_err(trans, inode_unlinked_but_clean,
1063 		      "filesystem marked clean, but inode %llu unlinked",
1064 		      u.bi_inum))) {
1065 		ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
1066 		bch_err_msg(c, ret, "in fsck deleting inode");
1067 		return ret;
1068 	}
1069 
1070 	if (u.bi_flags & BCH_INODE_i_size_dirty &&
1071 	    (!c->sb.clean ||
1072 	     fsck_err(trans, inode_i_size_dirty_but_clean,
1073 		      "filesystem marked clean, but inode %llu has i_size dirty",
1074 		      u.bi_inum))) {
1075 		bch_verbose(c, "truncating inode %llu", u.bi_inum);
1076 
1077 		/*
1078 		 * XXX: need to truncate partial blocks too here - or ideally
1079 		 * just switch units to bytes and that issue goes away
1080 		 */
1081 		ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
1082 				SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
1083 				     iter->pos.snapshot),
1084 				POS(u.bi_inum, U64_MAX),
1085 				0, NULL);
1086 		bch_err_msg(c, ret, "in fsck truncating inode");
1087 		if (ret)
1088 			return ret;
1089 
1090 		/*
1091 		 * We truncated without our normal sector accounting hook, just
1092 		 * make sure we recalculate it:
1093 		 */
1094 		u.bi_flags |= BCH_INODE_i_sectors_dirty;
1095 
1096 		u.bi_flags &= ~BCH_INODE_i_size_dirty;
1097 		do_update = true;
1098 	}
1099 
1100 	if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
1101 	    (!c->sb.clean ||
1102 	     fsck_err(trans, inode_i_sectors_dirty_but_clean,
1103 		      "filesystem marked clean, but inode %llu has i_sectors dirty",
1104 		      u.bi_inum))) {
1105 		s64 sectors;
1106 
1107 		bch_verbose(c, "recounting sectors for inode %llu",
1108 			    u.bi_inum);
1109 
1110 		sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
1111 		if (sectors < 0) {
1112 			bch_err_msg(c, sectors, "in fsck recounting inode sectors");
1113 			return sectors;
1114 		}
1115 
1116 		u.bi_sectors = sectors;
1117 		u.bi_flags &= ~BCH_INODE_i_sectors_dirty;
1118 		do_update = true;
1119 	}
1120 
1121 	if (u.bi_flags & BCH_INODE_backptr_untrusted) {
1122 		u.bi_dir = 0;
1123 		u.bi_dir_offset = 0;
1124 		u.bi_flags &= ~BCH_INODE_backptr_untrusted;
1125 		do_update = true;
1126 	}
1127 
1128 	if (u.bi_dir || u.bi_dir_offset) {
1129 		ret = check_inode_dirent_inode(trans, k, &u, k.k->p.snapshot, &do_update);
1130 		if (ret)
1131 			goto err;
1132 	}
1133 
1134 	if (fsck_err_on(u.bi_parent_subvol &&
1135 			(u.bi_subvol == 0 ||
1136 			 u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
1137 			trans, inode_bi_parent_nonzero,
1138 			"inode %llu:%u has subvol %u but nonzero parent subvol %u",
1139 			u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
1140 		u.bi_parent_subvol = 0;
1141 		do_update = true;
1142 	}
1143 
1144 	if (u.bi_subvol) {
1145 		struct bch_subvolume s;
1146 
1147 		ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s);
1148 		if (ret && !bch2_err_matches(ret, ENOENT))
1149 			goto err;
1150 
1151 		if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
1152 			ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum);
1153 			goto do_update;
1154 		}
1155 
1156 		if (fsck_err_on(ret,
1157 				trans, inode_bi_subvol_missing,
1158 				"inode %llu:%u bi_subvol points to missing subvolume %u",
1159 				u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
1160 		    fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
1161 				!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
1162 							   k.k->p.snapshot),
1163 				trans, inode_bi_subvol_wrong,
1164 				"inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
1165 				u.bi_inum, k.k->p.snapshot, u.bi_subvol,
1166 				le64_to_cpu(s.inode),
1167 				le32_to_cpu(s.snapshot))) {
1168 			u.bi_subvol = 0;
1169 			u.bi_parent_subvol = 0;
1170 			do_update = true;
1171 		}
1172 	}
1173 do_update:
1174 	if (do_update) {
1175 		ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
1176 		bch_err_msg(c, ret, "in fsck updating inode");
1177 		if (ret)
1178 			return ret;
1179 	}
1180 err:
1181 fsck_err:
1182 	bch_err_fn(c, ret);
1183 	return ret;
1184 }
1185 
1186 int bch2_check_inodes(struct bch_fs *c)
1187 {
1188 	bool full = c->opts.fsck;
1189 	struct bch_inode_unpacked prev = { 0 };
1190 	struct snapshots_seen s;
1191 
1192 	snapshots_seen_init(&s);
1193 
1194 	int ret = bch2_trans_run(c,
1195 		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
1196 				POS_MIN,
1197 				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
1198 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1199 			check_inode(trans, &iter, k, &prev, &s, full)));
1200 
1201 	snapshots_seen_exit(&s);
1202 	bch_err_fn(c, ret);
1203 	return ret;
1204 }
1205 
1206 static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode)
1207 {
1208 	switch (btree) {
1209 	case BTREE_ID_extents:
1210 		return S_ISREG(mode) || S_ISLNK(mode);
1211 	case BTREE_ID_dirents:
1212 		return S_ISDIR(mode);
1213 	case BTREE_ID_xattrs:
1214 		return true;
1215 	default:
1216 		BUG();
1217 	}
1218 }
1219 
1220 static int check_key_has_inode(struct btree_trans *trans,
1221 			       struct btree_iter *iter,
1222 			       struct inode_walker *inode,
1223 			       struct inode_walker_entry *i,
1224 			       struct bkey_s_c k)
1225 {
1226 	struct bch_fs *c = trans->c;
1227 	struct printbuf buf = PRINTBUF;
1228 	int ret = PTR_ERR_OR_ZERO(i);
1229 	if (ret)
1230 		return ret;
1231 
1232 	if (k.k->type == KEY_TYPE_whiteout)
1233 		goto out;
1234 
1235 	if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
1236 		ret =   reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?:
1237 			bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
1238 		if (ret)
1239 			goto err;
1240 
1241 		inode->last_pos.inode--;
1242 		ret = -BCH_ERR_transaction_restart_nested;
1243 		goto err;
1244 	}
1245 
1246 	if (fsck_err_on(!i,
1247 			trans, key_in_missing_inode,
1248 			"key in missing inode:\n  %s",
1249 			(printbuf_reset(&buf),
1250 			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1251 		goto delete;
1252 
1253 	if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode),
1254 			trans, key_in_wrong_inode_type,
1255 			"key for wrong inode mode %o:\n  %s",
1256 			i->inode.bi_mode,
1257 			(printbuf_reset(&buf),
1258 			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1259 		goto delete;
1260 out:
1261 err:
1262 fsck_err:
1263 	printbuf_exit(&buf);
1264 	bch_err_fn(c, ret);
1265 	return ret;
1266 delete:
1267 	ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
1268 	goto out;
1269 }
1270 
1271 static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
1272 {
1273 	struct bch_fs *c = trans->c;
1274 	int ret = 0;
1275 	s64 count2;
1276 
1277 	darray_for_each(w->inodes, i) {
1278 		if (i->inode.bi_sectors == i->count)
1279 			continue;
1280 
1281 		count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);
1282 
1283 		if (w->recalculate_sums)
1284 			i->count = count2;
1285 
1286 		if (i->count != count2) {
1287 			bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
1288 					    w->last_pos.inode, i->snapshot, i->count, count2);
1289 			return -BCH_ERR_internal_fsck_err;
1290 		}
1291 
1292 		if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
1293 				trans, inode_i_sectors_wrong,
1294 				"inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
1295 				w->last_pos.inode, i->snapshot,
1296 				i->inode.bi_sectors, i->count)) {
1297 			i->inode.bi_sectors = i->count;
1298 			ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
1299 			if (ret)
1300 				break;
1301 		}
1302 	}
1303 fsck_err:
1304 	bch_err_fn(c, ret);
1305 	return ret;
1306 }
1307 
1308 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
1309 {
1310 	u32 restart_count = trans->restart_count;
1311 	return check_i_sectors_notnested(trans, w) ?:
1312 		trans_was_restarted(trans, restart_count);
1313 }
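/*
 * The _notnested helper above is shared with bch2_check_extents(); this
 * wrapper, called from within check_extent()'s transaction, converts "the
 * transaction restarted while we were counting" into
 * -BCH_ERR_transaction_restart_nested via trans_was_restarted().
 */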
1314 
1315 struct extent_end {
1316 	u32			snapshot;
1317 	u64			offset;
1318 	struct snapshots_seen	seen;
1319 };
1320 
1321 struct extent_ends {
1322 	struct bpos			last_pos;
1323 	DARRAY(struct extent_end)	e;
1324 };
1325 
1326 static void extent_ends_reset(struct extent_ends *extent_ends)
1327 {
1328 	darray_for_each(extent_ends->e, i)
1329 		snapshots_seen_exit(&i->seen);
1330 	extent_ends->e.nr = 0;
1331 }
1332 
1333 static void extent_ends_exit(struct extent_ends *extent_ends)
1334 {
1335 	extent_ends_reset(extent_ends);
1336 	darray_exit(&extent_ends->e);
1337 }
1338 
1339 static void extent_ends_init(struct extent_ends *extent_ends)
1340 {
1341 	memset(extent_ends, 0, sizeof(*extent_ends));
1342 }
1343 
1344 static int extent_ends_at(struct bch_fs *c,
1345 			  struct extent_ends *extent_ends,
1346 			  struct snapshots_seen *seen,
1347 			  struct bkey_s_c k)
1348 {
1349 	struct extent_end *i, n = (struct extent_end) {
1350 		.offset		= k.k->p.offset,
1351 		.snapshot	= k.k->p.snapshot,
1352 		.seen		= *seen,
1353 	};
1354 
1355 	n.seen.ids.data = kmemdup(seen->ids.data,
1356 			      sizeof(seen->ids.data[0]) * seen->ids.size,
1357 			      GFP_KERNEL);
1358 	if (!n.seen.ids.data)
1359 		return -BCH_ERR_ENOMEM_fsck_extent_ends_at;
1360 
1361 	__darray_for_each(extent_ends->e, i) {
1362 		if (i->snapshot == k.k->p.snapshot) {
1363 			snapshots_seen_exit(&i->seen);
1364 			*i = n;
1365 			return 0;
1366 		}
1367 
1368 		if (i->snapshot >= k.k->p.snapshot)
1369 			break;
1370 	}
1371 
1372 	return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
1373 }
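/*
 * extent_ends_at() records, per snapshot, where the last extent seen for this
 * inode ended, along with a private copy of the snapshots_seen list at that
 * point; check_overlapping_extents() compares new extents' start offsets
 * against these recorded ends to detect overlaps between visible snapshots.
 */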
1374 
1375 static int overlapping_extents_found(struct btree_trans *trans,
1376 				     enum btree_id btree,
1377 				     struct bpos pos1, struct snapshots_seen *pos1_seen,
1378 				     struct bkey pos2,
1379 				     bool *fixed,
1380 				     struct extent_end *extent_end)
1381 {
1382 	struct bch_fs *c = trans->c;
1383 	struct printbuf buf = PRINTBUF;
1384 	struct btree_iter iter1, iter2 = { NULL };
1385 	struct bkey_s_c k1, k2;
1386 	int ret;
1387 
1388 	BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));
1389 
1390 	bch2_trans_iter_init(trans, &iter1, btree, pos1,
1391 			     BTREE_ITER_all_snapshots|
1392 			     BTREE_ITER_not_extents);
1393 	k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX));
1394 	ret = bkey_err(k1);
1395 	if (ret)
1396 		goto err;
1397 
1398 	prt_str(&buf, "\n  ");
1399 	bch2_bkey_val_to_text(&buf, c, k1);
1400 
1401 	if (!bpos_eq(pos1, k1.k->p)) {
1402 		prt_str(&buf, "\n  wanted\n  ");
1403 		bch2_bpos_to_text(&buf, pos1);
1404 		prt_str(&buf, "\n  ");
1405 		bch2_bkey_to_text(&buf, &pos2);
1406 
1407 		bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
1408 			__func__, buf.buf);
1409 		ret = -BCH_ERR_internal_fsck_err;
1410 		goto err;
1411 	}
1412 
1413 	bch2_trans_copy_iter(&iter2, &iter1);
1414 
1415 	while (1) {
1416 		bch2_btree_iter_advance(&iter2);
1417 
1418 		k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX));
1419 		ret = bkey_err(k2);
1420 		if (ret)
1421 			goto err;
1422 
1423 		if (bpos_ge(k2.k->p, pos2.p))
1424 			break;
1425 	}
1426 
1427 	prt_str(&buf, "\n  ");
1428 	bch2_bkey_val_to_text(&buf, c, k2);
1429 
1430 	if (bpos_gt(k2.k->p, pos2.p) ||
1431 	    pos2.size != k2.k->size) {
1432 		bch_err(c, "%s: error finding second overlapping extent when repairing%s",
1433 			__func__, buf.buf);
1434 		ret = -BCH_ERR_internal_fsck_err;
1435 		goto err;
1436 	}
1437 
1438 	prt_printf(&buf, "\n  overwriting %s extent",
1439 		   pos1.snapshot >= pos2.p.snapshot ? "first" : "second");
1440 
1441 	if (fsck_err(trans, extent_overlapping,
1442 		     "overlapping extents%s", buf.buf)) {
1443 		struct btree_iter *old_iter = &iter1;
1444 		struct disk_reservation res = { 0 };
1445 
1446 		if (pos1.snapshot < pos2.p.snapshot) {
1447 			old_iter = &iter2;
1448 			swap(k1, k2);
1449 		}
1450 
1451 		trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);
1452 
1453 		ret =   bch2_trans_update_extent_overwrite(trans, old_iter,
1454 				BTREE_UPDATE_internal_snapshot_node,
1455 				k1, k2) ?:
1456 			bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
1457 		bch2_disk_reservation_put(c, &res);
1458 
1459 		if (ret)
1460 			goto err;
1461 
1462 		*fixed = true;
1463 
1464 		if (pos1.snapshot == pos2.p.snapshot) {
1465 			/*
1466 			 * We overwrote the first extent, and did the overwrite
1467 			 * in the same snapshot:
1468 			 */
1469 			extent_end->offset = bkey_start_offset(&pos2);
1470 		} else if (pos1.snapshot > pos2.p.snapshot) {
1471 			/*
1472 			 * We overwrote the first extent in pos2's snapshot:
1473 			 */
1474 			ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
1475 		} else {
1476 			/*
1477 			 * We overwrote the second extent - restart
1478 			 * check_extent() from the top:
1479 			 */
1480 			ret = -BCH_ERR_transaction_restart_nested;
1481 		}
1482 	}
1483 fsck_err:
1484 err:
1485 	bch2_trans_iter_exit(trans, &iter2);
1486 	bch2_trans_iter_exit(trans, &iter1);
1487 	printbuf_exit(&buf);
1488 	return ret;
1489 }
1490 
1491 static int check_overlapping_extents(struct btree_trans *trans,
1492 			      struct snapshots_seen *seen,
1493 			      struct extent_ends *extent_ends,
1494 			      struct bkey_s_c k,
1495 			      struct btree_iter *iter,
1496 			      bool *fixed)
1497 {
1498 	struct bch_fs *c = trans->c;
1499 	int ret = 0;
1500 
1501 	/* transaction restart, running again */
1502 	if (bpos_eq(extent_ends->last_pos, k.k->p))
1503 		return 0;
1504 
1505 	if (extent_ends->last_pos.inode != k.k->p.inode)
1506 		extent_ends_reset(extent_ends);
1507 
1508 	darray_for_each(extent_ends->e, i) {
1509 		if (i->offset <= bkey_start_offset(k.k))
1510 			continue;
1511 
1512 		if (!ref_visible2(c,
1513 				  k.k->p.snapshot, seen,
1514 				  i->snapshot, &i->seen))
1515 			continue;
1516 
1517 		ret = overlapping_extents_found(trans, iter->btree_id,
1518 						SPOS(iter->pos.inode,
1519 						     i->offset,
1520 						     i->snapshot),
1521 						&i->seen,
1522 						*k.k, fixed, i);
1523 		if (ret)
1524 			goto err;
1525 	}
1526 
1527 	extent_ends->last_pos = k.k->p;
1528 err:
1529 	return ret;
1530 }
1531 
1532 static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
1533 				struct bkey_s_c k)
1534 {
1535 	struct bch_fs *c = trans->c;
1536 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1537 	struct bch_extent_crc_unpacked crc;
1538 	const union bch_extent_entry *i;
1539 	unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;
1540 
1541 	bkey_for_each_crc(k.k, ptrs, crc, i)
1542 		if (crc_is_encoded(crc) &&
1543 		    crc.uncompressed_size > encoded_extent_max_sectors) {
1544 			struct printbuf buf = PRINTBUF;
1545 
1546 			bch2_bkey_val_to_text(&buf, c, k);
1547 			bch_err(c, "overbig encoded extent, please report this:\n  %s", buf.buf);
1548 			printbuf_exit(&buf);
1549 		}
1550 
1551 	return 0;
1552 }
1553 
1554 static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
1555 			struct bkey_s_c k,
1556 			struct inode_walker *inode,
1557 			struct snapshots_seen *s,
1558 			struct extent_ends *extent_ends)
1559 {
1560 	struct bch_fs *c = trans->c;
1561 	struct inode_walker_entry *i;
1562 	struct printbuf buf = PRINTBUF;
1563 	int ret = 0;
1564 
1565 	ret = bch2_check_key_has_snapshot(trans, iter, k);
1566 	if (ret) {
1567 		ret = ret < 0 ? ret : 0;
1568 		goto out;
1569 	}
1570 
1571 	if (inode->last_pos.inode != k.k->p.inode) {
1572 		ret = check_i_sectors(trans, inode);
1573 		if (ret)
1574 			goto err;
1575 	}
1576 
1577 	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
1578 	if (ret)
1579 		goto err;
1580 
1581 	i = walk_inode(trans, inode, k);
1582 	ret = PTR_ERR_OR_ZERO(i);
1583 	if (ret)
1584 		goto err;
1585 
1586 	ret = check_key_has_inode(trans, iter, inode, i, k);
1587 	if (ret)
1588 		goto err;
1589 
1590 	if (k.k->type != KEY_TYPE_whiteout) {
1591 		ret = check_overlapping_extents(trans, s, extent_ends, k, iter,
1592 						&inode->recalculate_sums);
1593 		if (ret)
1594 			goto err;
1595 	}
1596 
1597 	/*
1598 	 * Check inodes in reverse order, from oldest snapshots to newest,
1599 	 * starting from the inode that matches this extent's snapshot. If we
1600 	 * didn't have one, iterate over all inodes:
1601 	 */
1602 	if (!i)
1603 		i = &darray_last(inode->inodes);
1604 
1605 	for (;
1606 	     inode->inodes.data && i >= inode->inodes.data;
1607 	     --i) {
1608 		if (i->snapshot > k.k->p.snapshot ||
1609 		    !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
1610 			continue;
1611 
1612 		if (k.k->type != KEY_TYPE_whiteout) {
1613 			if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) &&
1614 					k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
1615 					!bkey_extent_is_reservation(k),
1616 					trans, extent_past_end_of_inode,
1617 					"extent type past end of inode %llu:%u, i_size %llu\n  %s",
1618 					i->inode.bi_inum, i->snapshot, i->inode.bi_size,
1619 					(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1620 				struct btree_iter iter2;
1621 
1622 				bch2_trans_copy_iter(&iter2, iter);
1623 				bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
1624 				ret =   bch2_btree_iter_traverse(&iter2) ?:
1625 					bch2_btree_delete_at(trans, &iter2,
1626 						BTREE_UPDATE_internal_snapshot_node);
1627 				bch2_trans_iter_exit(trans, &iter2);
1628 				if (ret)
1629 					goto err;
1630 
1631 				iter->k.type = KEY_TYPE_whiteout;
1632 			}
1633 
1634 			if (bkey_extent_is_allocation(k.k))
1635 				i->count += k.k->size;
1636 		}
1637 
1638 		i->seen_this_pos = true;
1639 	}
1640 
1641 	if (k.k->type != KEY_TYPE_whiteout) {
1642 		ret = extent_ends_at(c, extent_ends, s, k);
1643 		if (ret)
1644 			goto err;
1645 	}
1646 out:
1647 err:
1648 fsck_err:
1649 	printbuf_exit(&buf);
1650 	bch_err_fn(c, ret);
1651 	return ret;
1652 }
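/*
 * Note the KEY_TYPE_whiteout trick above: after deleting an extent past
 * i_size, the iterator's key type is flipped to whiteout so the remainder of
 * check_extent() (sector accounting, extent_ends_at()) treats it as deleted.
 */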
1653 
1654 /*
1655  * Walk extents: verify that extents have a corresponding S_ISREG inode, and
1656  * that i_size and i_sectors are consistent
1657  */
1658 int bch2_check_extents(struct bch_fs *c)
1659 {
1660 	struct inode_walker w = inode_walker_init();
1661 	struct snapshots_seen s;
1662 	struct extent_ends extent_ends;
1663 	struct disk_reservation res = { 0 };
1664 
1665 	snapshots_seen_init(&s);
1666 	extent_ends_init(&extent_ends);
1667 
1668 	int ret = bch2_trans_run(c,
1669 		for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
1670 				POS(BCACHEFS_ROOT_INO, 0),
1671 				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
1672 				&res, NULL,
1673 				BCH_TRANS_COMMIT_no_enospc, ({
1674 			bch2_disk_reservation_put(c, &res);
1675 			check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
1676 			check_extent_overbig(trans, &iter, k);
1677 		})) ?:
1678 		check_i_sectors_notnested(trans, &w));
1679 
1680 	bch2_disk_reservation_put(c, &res);
1681 	extent_ends_exit(&extent_ends);
1682 	inode_walker_exit(&w);
1683 	snapshots_seen_exit(&s);
1684 
1685 	bch_err_fn(c, ret);
1686 	return ret;
1687 }
1688 
1689 int bch2_check_indirect_extents(struct bch_fs *c)
1690 {
1691 	struct disk_reservation res = { 0 };
1692 
1693 	int ret = bch2_trans_run(c,
1694 		for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
1695 				POS_MIN,
1696 				BTREE_ITER_prefetch, k,
1697 				&res, NULL,
1698 				BCH_TRANS_COMMIT_no_enospc, ({
1699 			bch2_disk_reservation_put(c, &res);
1700 			check_extent_overbig(trans, &iter, k);
1701 		})));
1702 
1703 	bch2_disk_reservation_put(c, &res);
1704 	bch_err_fn(c, ret);
1705 	return ret;
1706 }
1707 
1708 static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w)
1709 {
1710 	struct bch_fs *c = trans->c;
1711 	int ret = 0;
1712 	s64 count2;
1713 
1714 	darray_for_each(w->inodes, i) {
1715 		if (i->inode.bi_nlink == i->count)
1716 			continue;
1717 
1718 		count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot);
1719 		if (count2 < 0)
1720 			return count2;
1721 
1722 		if (i->count != count2) {
1723 			bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
1724 					    w->last_pos.inode, i->snapshot, i->count, count2);
1725 			i->count = count2;
1726 			if (i->inode.bi_nlink == i->count)
1727 				continue;
1728 		}
1729 
1730 		if (fsck_err_on(i->inode.bi_nlink != i->count,
1731 				trans, inode_dir_wrong_nlink,
1732 				"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
1733 				w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
1734 			i->inode.bi_nlink = i->count;
1735 			ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
1736 			if (ret)
1737 				break;
1738 		}
1739 	}
1740 fsck_err:
1741 	bch_err_fn(c, ret);
1742 	return ret;
1743 }
1744 
1745 static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
1746 {
1747 	u32 restart_count = trans->restart_count;
1748 	return check_subdir_count_notnested(trans, w) ?:
1749 		trans_was_restarted(trans, restart_count);
1750 }
1751 
1752 noinline_for_stack
1753 static int check_dirent_inode_dirent(struct btree_trans *trans,
1754 				   struct btree_iter *iter,
1755 				   struct bkey_s_c_dirent d,
1756 				   struct bch_inode_unpacked *target,
1757 				   u32 target_snapshot)
1758 {
1759 	struct bch_fs *c = trans->c;
1760 	struct printbuf buf = PRINTBUF;
1761 	int ret = 0;
1762 
1763 	if (inode_points_to_dirent(target, d))
1764 		return 0;
1765 
1766 	if (bch2_inode_should_have_bp(target) &&
1767 	    !fsck_err(trans, inode_wrong_backpointer,
1768 		      "dirent points to inode that does not point back:\n  %s",
1769 		      (bch2_bkey_val_to_text(&buf, c, d.s_c),
1770 		       prt_printf(&buf, "\n  "),
1771 		       bch2_inode_unpacked_to_text(&buf, target),
1772 		       buf.buf)))
1773 		goto out_noiter;
1774 
1775 	if (!target->bi_dir &&
1776 	    !target->bi_dir_offset) {
1777 		target->bi_dir		= d.k->p.inode;
1778 		target->bi_dir_offset	= d.k->p.offset;
1779 		return __bch2_fsck_write_inode(trans, target, target_snapshot);
1780 	}
1781 
1782 	struct btree_iter bp_iter = { NULL };
1783 	struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
1784 			      SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
1785 	ret = bkey_err(bp_dirent);
1786 	if (ret && !bch2_err_matches(ret, ENOENT))
1787 		goto err;
1788 
1789 	bool backpointer_exists = !ret;
1790 	ret = 0;
1791 
1792 	if (fsck_err_on(!backpointer_exists,
1793 			trans, inode_wrong_backpointer,
1794 			"inode %llu:%u has wrong backpointer:\n"
1795 			"got       %llu:%llu\n"
1796 			"should be %llu:%llu",
1797 			target->bi_inum, target_snapshot,
1798 			target->bi_dir,
1799 			target->bi_dir_offset,
1800 			d.k->p.inode,
1801 			d.k->p.offset)) {
1802 		target->bi_dir		= d.k->p.inode;
1803 		target->bi_dir_offset	= d.k->p.offset;
1804 		ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
1805 		goto out;
1806 	}
1807 
1808 	bch2_bkey_val_to_text(&buf, c, d.s_c);
1809 	prt_newline(&buf);
1810 	if (backpointer_exists)
1811 		bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
1812 
1813 	if (fsck_err_on(backpointer_exists &&
1814 			(S_ISDIR(target->bi_mode) ||
1815 			 target->bi_subvol),
1816 			trans, inode_dir_multiple_links,
1817 			"%s %llu:%u with multiple links\n%s",
1818 			S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
1819 			target->bi_inum, target_snapshot, buf.buf)) {
1820 		ret = __remove_dirent(trans, d.k->p);
1821 		goto out;
1822 	}
1823 
1824 	/*
1825 	 * hardlinked file with nlink 0:
1826 	 * We're just adjusting nlink here so check_nlinks() will pick
1827 	 * it up, it ignores inodes with nlink 0
1828 	 */
1829 	if (fsck_err_on(backpointer_exists && !target->bi_nlink,
1830 			trans, inode_multiple_links_but_nlink_0,
1831 			"inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
1832 			target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
1833 		target->bi_nlink++;
1834 		target->bi_flags &= ~BCH_INODE_unlinked;
1835 		ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
1836 		if (ret)
1837 			goto err;
1838 	}
1839 out:
1840 err:
1841 fsck_err:
1842 	bch2_trans_iter_exit(trans, &bp_iter);
1843 out_noiter:
1844 	printbuf_exit(&buf);
1845 	bch_err_fn(c, ret);
1846 	return ret;
1847 }
1848 
1849 noinline_for_stack
1850 static int check_dirent_target(struct btree_trans *trans,
1851 			       struct btree_iter *iter,
1852 			       struct bkey_s_c_dirent d,
1853 			       struct bch_inode_unpacked *target,
1854 			       u32 target_snapshot)
1855 {
1856 	struct bch_fs *c = trans->c;
1857 	struct bkey_i_dirent *n;
1858 	struct printbuf buf = PRINTBUF;
1859 	int ret = 0;
1860 
1861 	ret = check_dirent_inode_dirent(trans, iter, d, target, target_snapshot);
1862 	if (ret)
1863 		goto err;
1864 
1865 	if (fsck_err_on(d.v->d_type != inode_d_type(target),
1866 			trans, dirent_d_type_wrong,
1867 			"incorrect d_type: got %s, should be %s:\n%s",
1868 			bch2_d_type_str(d.v->d_type),
1869 			bch2_d_type_str(inode_d_type(target)),
1870 			(printbuf_reset(&buf),
1871 			 bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
1872 		n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
1873 		ret = PTR_ERR_OR_ZERO(n);
1874 		if (ret)
1875 			goto err;
1876 
1877 		bkey_reassemble(&n->k_i, d.s_c);
1878 		n->v.d_type = inode_d_type(target);
1879 		if (n->v.d_type == DT_SUBVOL) {
1880 			n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
1881 			n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
1882 		} else {
1883 			n->v.d_inum = cpu_to_le64(target->bi_inum);
1884 		}
1885 
1886 		ret = bch2_trans_update(trans, iter, &n->k_i, 0);
1887 		if (ret)
1888 			goto err;
1889 
1890 		d = dirent_i_to_s_c(n);
1891 	}
1892 err:
1893 fsck_err:
1894 	printbuf_exit(&buf);
1895 	bch_err_fn(c, ret);
1896 	return ret;
1897 }
1898 
1899 /* find a subvolume whose snapshot is a descendant of @snapshot: */
1900 static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
1901 {
1902 	struct btree_iter iter;
1903 	struct bkey_s_c k;
1904 	int ret;
1905 
1906 	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) {
1907 		if (k.k->type != KEY_TYPE_subvolume)
1908 			continue;
1909 
1910 		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
1911 		if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) {
1912 			bch2_trans_iter_exit(trans, &iter);
1913 			*subvolid = k.k->p.offset;
1914 			goto found;
1915 		}
1916 	}
1917 	if (!ret)
1918 		ret = -ENOENT;
1919 found:
1920 	bch2_trans_iter_exit(trans, &iter);
1921 	return ret;
1922 }
1923 
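/*
 * Checks for dirents that point to subvolumes: the parent subvolume must
 * exist and the dirent must be visible in its snapshot, the target subvolume
 * must exist, its fs_path_parent must point back at the parent subvolume,
 * and the subvolume root inode's bi_parent_subvol must match as well.
 */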
1924 noinline_for_stack
1925 static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
1926 				  struct bkey_s_c_dirent d)
1927 {
1928 	struct bch_fs *c = trans->c;
1929 	struct btree_iter subvol_iter = {};
1930 	struct bch_inode_unpacked subvol_root;
1931 	u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
1932 	u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
1933 	u32 parent_snapshot;
1934 	u32 new_parent_subvol = 0;
1935 	u64 parent_inum;
1936 	struct printbuf buf = PRINTBUF;
1937 	int ret = 0;
1938 
1939 	ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum);
1940 	if (ret && !bch2_err_matches(ret, ENOENT))
1941 		return ret;
1942 
1943 	if (ret ||
1944 	    (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) {
1945 		int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
1946 		if (ret2 && !bch2_err_matches(ret2, ENOENT))
1947 			return ret2;
1948 	}
1949 
1950 	if (ret &&
1951 	    !new_parent_subvol &&
1952 	    (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
1953 		/*
1954 		 * Couldn't find a subvol for dirent's snapshot - but we lost
1955 		 * subvols, so we need to reconstruct:
1956 		 */
1957 		ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0);
1958 		if (ret)
1959 			return ret;
1960 
1961 		parent_snapshot = d.k->p.snapshot;
1962 	}
1963 
1964 	if (fsck_err_on(ret,
1965 			trans, dirent_to_missing_parent_subvol,
1966 			"dirent parent_subvol points to missing subvolume\n%s",
1967 			(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
1968 	    fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
1969 			trans, dirent_not_visible_in_parent_subvol,
1970 			"dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
1971 			parent_snapshot,
1972 			(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
1973 		if (!new_parent_subvol) {
1974 			bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
1975 			return -BCH_ERR_fsck_repair_unimplemented;
1976 		}
1977 
1978 		struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
1979 		ret = PTR_ERR_OR_ZERO(new_dirent);
1980 		if (ret)
1981 			goto err;
1982 
1983 		new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol);
1984 	}
1985 
1986 	struct bkey_s_c_subvolume s =
1987 		bch2_bkey_get_iter_typed(trans, &subvol_iter,
1988 					 BTREE_ID_subvolumes, POS(0, target_subvol),
1989 					 0, subvolume);
1990 	ret = bkey_err(s.s_c);
1991 	if (ret && !bch2_err_matches(ret, ENOENT))
1992 		return ret;
1993 
1994 	if (ret) {
1995 		if (fsck_err(trans, dirent_to_missing_subvol,
1996 			     "dirent points to missing subvolume\n%s",
1997 			     (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
1998 			return __remove_dirent(trans, d.k->p);
1999 		ret = 0;
2000 		goto out;
2001 	}
2002 
2003 	if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol,
2004 			trans, subvol_fs_path_parent_wrong,
2005 			"subvol with wrong fs_path_parent, should be %u\n%s",
2006 			parent_subvol,
2007 			(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
2008 		struct bkey_i_subvolume *n =
2009 			bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
2010 		ret = PTR_ERR_OR_ZERO(n);
2011 		if (ret)
2012 			goto err;
2013 
2014 		n->v.fs_path_parent = cpu_to_le32(parent_subvol);
2015 	}
2016 
2017 	u64 target_inum = le64_to_cpu(s.v->inode);
2018 	u32 target_snapshot = le32_to_cpu(s.v->snapshot);
2019 
2020 	ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot);
2021 	if (ret && !bch2_err_matches(ret, ENOENT))
2022 		goto err;
2023 
2024 	if (ret) {
2025 		bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
2026 		ret = -BCH_ERR_fsck_repair_unimplemented;
2027 		goto err;
2028 	}
2029 
2030 	if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol,
2031 			trans, inode_bi_parent_wrong,
2032 			"subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
2033 			target_inum,
2034 			subvol_root.bi_parent_subvol, parent_subvol)) {
2035 		subvol_root.bi_parent_subvol = parent_subvol;
2036 		ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot);
2037 		if (ret)
2038 			goto err;
2039 	}
2040 
2041 	ret = check_dirent_target(trans, iter, d, &subvol_root,
2042 				  target_snapshot);
2043 	if (ret)
2044 		goto err;
2045 out:
2046 err:
2047 fsck_err:
2048 	bch2_trans_iter_exit(trans, &subvol_iter);
2049 	printbuf_exit(&buf);
2050 	return ret;
2051 }
2052 
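/*
 * Per-key dirent checks: verify the key's snapshot, that the dirent belongs
 * to an existing directory inode, that it's hashed correctly, and that it
 * points to a valid inode or subvolume; also count subdirectories for the
 * parent directory's link count check.
 */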
2053 static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
2054 			struct bkey_s_c k,
2055 			struct bch_hash_info *hash_info,
2056 			struct inode_walker *dir,
2057 			struct inode_walker *target,
2058 			struct snapshots_seen *s)
2059 {
2060 	struct bch_fs *c = trans->c;
2061 	struct inode_walker_entry *i;
2062 	struct printbuf buf = PRINTBUF;
2063 	int ret = 0;
2064 
2065 	ret = bch2_check_key_has_snapshot(trans, iter, k);
2066 	if (ret) {
2067 		ret = ret < 0 ? ret : 0;
2068 		goto out;
2069 	}
2070 
2071 	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
2072 	if (ret)
2073 		goto err;
2074 
2075 	if (k.k->type == KEY_TYPE_whiteout)
2076 		goto out;
2077 
2078 	if (dir->last_pos.inode != k.k->p.inode) {
2079 		ret = check_subdir_count(trans, dir);
2080 		if (ret)
2081 			goto err;
2082 	}
2083 
2084 	i = walk_inode(trans, dir, k);
2085 	ret = PTR_ERR_OR_ZERO(i);
2086 	if (ret < 0)
2087 		goto err;
2088 
2089 	ret = check_key_has_inode(trans, iter, dir, i, k);
2090 	if (ret)
2091 		goto err;
2092 
2093 	if (!i)
2094 		goto out;
2095 
2096 	if (dir->first_this_inode)
2097 		*hash_info = bch2_hash_info_init(c, &i->inode);
2098 	dir->first_this_inode = false;
2099 
2100 	ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k);
2101 	if (ret < 0)
2102 		goto err;
2103 	if (ret) {
2104 		/* dirent has been deleted */
2105 		ret = 0;
2106 		goto out;
2107 	}
2108 
2109 	if (k.k->type != KEY_TYPE_dirent)
2110 		goto out;
2111 
2112 	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
2113 
2114 	if (d.v->d_type == DT_SUBVOL) {
2115 		ret = check_dirent_to_subvol(trans, iter, d);
2116 		if (ret)
2117 			goto err;
2118 	} else {
2119 		ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
2120 		if (ret)
2121 			goto err;
2122 
2123 		if (fsck_err_on(!target->inodes.nr,
2124 				trans, dirent_to_missing_inode,
2125 				"dirent points to missing inode:\n%s",
2126 				(printbuf_reset(&buf),
2127 				 bch2_bkey_val_to_text(&buf, c, k),
2128 				 buf.buf))) {
2129 			ret = __remove_dirent(trans, d.k->p);
2130 			if (ret)
2131 				goto err;
2132 		}
2133 
2134 		darray_for_each(target->inodes, i) {
2135 			ret = check_dirent_target(trans, iter, d,
2136 						  &i->inode, i->snapshot);
2137 			if (ret)
2138 				goto err;
2139 		}
2140 
2141 		if (d.v->d_type == DT_DIR)
2142 			for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
2143 				i->count++;
2144 	}
2145 out:
2146 err:
2147 fsck_err:
2148 	printbuf_exit(&buf);
2149 	bch_err_fn(c, ret);
2150 	return ret;
2151 }
2152 
2153 /*
2154  * Walk dirents: verify that each one belongs to an existing directory inode
2155  * and points to a valid target inode or subvolume, and validate d_type:
2156  */
2157 int bch2_check_dirents(struct bch_fs *c)
2158 {
2159 	struct inode_walker dir = inode_walker_init();
2160 	struct inode_walker target = inode_walker_init();
2161 	struct snapshots_seen s;
2162 	struct bch_hash_info hash_info;
2163 
2164 	snapshots_seen_init(&s);
2165 
2166 	int ret = bch2_trans_run(c,
2167 		for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
2168 				POS(BCACHEFS_ROOT_INO, 0),
2169 				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
2170 				k,
2171 				NULL, NULL,
2172 				BCH_TRANS_COMMIT_no_enospc,
2173 			check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
2174 		check_subdir_count_notnested(trans, &dir));
2175 
2176 	snapshots_seen_exit(&s);
2177 	inode_walker_exit(&dir);
2178 	inode_walker_exit(&target);
2179 	bch_err_fn(c, ret);
2180 	return ret;
2181 }
2182 
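/*
 * Per-key xattr checks: verify the key's snapshot, that the xattr belongs to
 * an existing inode, and that it's hashed correctly.
 */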
2183 static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
2184 		       struct bkey_s_c k,
2185 		       struct bch_hash_info *hash_info,
2186 		       struct inode_walker *inode)
2187 {
2188 	struct bch_fs *c = trans->c;
2189 	struct inode_walker_entry *i;
2190 	int ret;
2191 
2192 	ret = bch2_check_key_has_snapshot(trans, iter, k);
2193 	if (ret < 0)
2194 		return ret;
2195 	if (ret)
2196 		return 0;
2197 
2198 	i = walk_inode(trans, inode, k);
2199 	ret = PTR_ERR_OR_ZERO(i);
2200 	if (ret)
2201 		return ret;
2202 
2203 	ret = check_key_has_inode(trans, iter, inode, i, k);
2204 	if (ret)
2205 		return ret;
2206 
2207 	if (!i)
2208 		return 0;
2209 
2210 	if (inode->first_this_inode)
2211 		*hash_info = bch2_hash_info_init(c, &i->inode);
2212 	inode->first_this_inode = false;
2213 
2214 	ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
2215 	bch_err_fn(c, ret);
2216 	return ret;
2217 }
2218 
2219 /*
2220  * Walk xattrs: verify that they all have a corresponding inode
2221  */
2222 int bch2_check_xattrs(struct bch_fs *c)
2223 {
2224 	struct inode_walker inode = inode_walker_init();
2225 	struct bch_hash_info hash_info;
2226 	int ret = 0;
2227 
2228 	ret = bch2_trans_run(c,
2229 		for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
2230 			POS(BCACHEFS_ROOT_INO, 0),
2231 			BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
2232 			k,
2233 			NULL, NULL,
2234 			BCH_TRANS_COMMIT_no_enospc,
2235 		check_xattr(trans, &iter, k, &hash_info, &inode)));
2236 
2237 	inode_walker_exit(&inode);
2238 	bch_err_fn(c, ret);
2239 	return ret;
2240 }
2241 
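/*
 * Check that the root subvolume and root directory inode exist; recreate
 * them if missing:
 */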
2242 static int check_root_trans(struct btree_trans *trans)
2243 {
2244 	struct bch_fs *c = trans->c;
2245 	struct bch_inode_unpacked root_inode;
2246 	u32 snapshot;
2247 	u64 inum;
2248 	int ret;
2249 
2250 	ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
2251 	if (ret && !bch2_err_matches(ret, ENOENT))
2252 		return ret;
2253 
2254 	if (mustfix_fsck_err_on(ret, trans, root_subvol_missing,
2255 				"root subvol missing")) {
2256 		struct bkey_i_subvolume *root_subvol =
2257 			bch2_trans_kmalloc(trans, sizeof(*root_subvol));
2258 		ret = PTR_ERR_OR_ZERO(root_subvol);
2259 		if (ret)
2260 			goto err;
2261 
2262 		snapshot	= U32_MAX;
2263 		inum		= BCACHEFS_ROOT_INO;
2264 
2265 		bkey_subvolume_init(&root_subvol->k_i);
2266 		root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL;
2267 		root_subvol->v.flags	= 0;
2268 		root_subvol->v.snapshot	= cpu_to_le32(snapshot);
2269 		root_subvol->v.inode	= cpu_to_le64(inum);
2270 		ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0);
2271 		bch_err_msg(c, ret, "writing root subvol");
2272 		if (ret)
2273 			goto err;
2274 	}
2275 
2276 	ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
2277 	if (ret && !bch2_err_matches(ret, ENOENT))
2278 		return ret;
2279 
2280 	if (mustfix_fsck_err_on(ret,
2281 				trans, root_dir_missing,
2282 				"root directory missing") ||
2283 	    mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
2284 				trans, root_inode_not_dir,
2285 				"root inode not a directory")) {
2286 		bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
2287 				0, NULL);
2288 		root_inode.bi_inum = inum;
2289 
2290 		ret = __bch2_fsck_write_inode(trans, &root_inode, snapshot);
2291 		bch_err_msg(c, ret, "writing root inode");
2292 	}
2293 err:
2294 fsck_err:
2295 	return ret;
2296 }
2297 
2298 /* Get root directory, create if it doesn't exist: */
2299 int bch2_check_root(struct bch_fs *c)
2300 {
2301 	int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2302 		check_root_trans(trans));
2303 	bch_err_fn(c, ret);
2304 	return ret;
2305 }
2306 
2307 typedef DARRAY(u32) darray_u32;
2308 
2309 static bool darray_u32_has(darray_u32 *d, u32 v)
2310 {
2311 	darray_for_each(*d, i)
2312 		if (*i == v)
2313 			return true;
2314 	return false;
2315 }
2316 
2317 /*
2318  * We've checked that inode backpointers point to valid dirents; here, it's
2319  * sufficient to check that the subvolume root has a dirent:
2320  */
2321 static int subvol_has_dirent(struct btree_trans *trans, struct bkey_s_c_subvolume s)
2322 {
2323 	struct bch_inode_unpacked inode;
2324 	int ret = bch2_inode_find_by_inum_trans(trans,
2325 				(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
2326 				&inode);
2327 	if (ret)
2328 		return ret;
2329 
2330 	return inode.bi_dir != 0;
2331 }
2332 
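/*
 * Walk a subvolume's fs_path_parent chain up to the root subvolume,
 * reattaching unreachable subvolumes and breaking loops:
 */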
2333 static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
2334 {
2335 	struct bch_fs *c = trans->c;
2336 	struct btree_iter parent_iter = {};
2337 	darray_u32 subvol_path = {};
2338 	struct printbuf buf = PRINTBUF;
2339 	int ret = 0;
2340 
2341 	if (k.k->type != KEY_TYPE_subvolume)
2342 		return 0;
2343 
2344 	while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) {
2345 		ret = darray_push(&subvol_path, k.k->p.offset);
2346 		if (ret)
2347 			goto err;
2348 
2349 		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
2350 
2351 		ret = subvol_has_dirent(trans, s);
2352 		if (ret < 0)
2353 			break;
2354 
2355 		if (fsck_err_on(!ret,
2356 				trans, subvol_unreachable,
2357 				"unreachable subvolume %s",
2358 				(bch2_bkey_val_to_text(&buf, c, s.s_c),
2359 				 buf.buf))) {
2360 			ret = reattach_subvol(trans, s);
2361 			break;
2362 		}
2363 
2364 		u32 parent = le32_to_cpu(s.v->fs_path_parent);
2365 
2366 		if (darray_u32_has(&subvol_path, parent)) {
2367 			if (fsck_err(c, subvol_loop, "subvolume loop"))
2368 				ret = reattach_subvol(trans, s);
2369 			break;
2370 		}
2371 
2372 		bch2_trans_iter_exit(trans, &parent_iter);
2373 		bch2_trans_iter_init(trans, &parent_iter,
2374 				     BTREE_ID_subvolumes, POS(0, parent), 0);
2375 		k = bch2_btree_iter_peek_slot(&parent_iter);
2376 		ret = bkey_err(k);
2377 		if (ret)
2378 			goto err;
2379 
2380 		if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
2381 				trans, subvol_unreachable,
2382 				"unreachable subvolume %s",
2383 				(bch2_bkey_val_to_text(&buf, c, s.s_c),
2384 				 buf.buf))) {
2385 			ret = reattach_subvol(trans, s);
2386 			break;
2387 		}
2388 	}
2389 fsck_err:
2390 err:
2391 	printbuf_exit(&buf);
2392 	darray_exit(&subvol_path);
2393 	bch2_trans_iter_exit(trans, &parent_iter);
2394 	return ret;
2395 }
2396 
2397 int bch2_check_subvolume_structure(struct bch_fs *c)
2398 {
2399 	int ret = bch2_trans_run(c,
2400 		for_each_btree_key_commit(trans, iter,
2401 				BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
2402 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2403 			check_subvol_path(trans, &iter, k)));
2404 	bch_err_fn(c, ret);
2405 	return ret;
2406 }
2407 
2408 struct pathbuf_entry {
2409 	u64	inum;
2410 	u32	snapshot;
2411 };
2412 
2413 typedef DARRAY(struct pathbuf_entry) pathbuf;
2414 
2415 static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
2416 {
2417 	darray_for_each(*p, i)
2418 		if (i->inum	== inum &&
2419 		    i->snapshot	== snapshot)
2420 			return true;
2421 	return false;
2422 }
2423 
2424 /*
2425  * Check that a given inode is reachable from its subvolume root - we already
2426  * verified subvolume connectivity:
2427  *
2428  * XXX: we should also be verifying that inodes are in the right subvolumes
2429  */
2430 static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k)
2431 {
2432 	struct bch_fs *c = trans->c;
2433 	struct btree_iter inode_iter = {};
2434 	struct bch_inode_unpacked inode;
2435 	struct printbuf buf = PRINTBUF;
2436 	u32 snapshot = inode_k.k->p.snapshot;
2437 	int ret = 0;
2438 
2439 	p->nr = 0;
2440 
2441 	BUG_ON(bch2_inode_unpack(inode_k, &inode));
2442 
2443 	while (!inode.bi_subvol) {
2444 		struct btree_iter dirent_iter;
2445 		struct bkey_s_c_dirent d;
2446 		u32 parent_snapshot = snapshot;
2447 
2448 		d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot);
2449 		ret = bkey_err(d.s_c);
2450 		if (ret && !bch2_err_matches(ret, ENOENT))
2451 			break;
2452 
2453 		if (!ret && !dirent_points_to_inode(d, &inode)) {
2454 			bch2_trans_iter_exit(trans, &dirent_iter);
2455 			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
2456 		}
2457 
2458 		if (bch2_err_matches(ret, ENOENT)) {
2459 			ret = 0;
2460 			if (fsck_err(trans, inode_unreachable,
2461 				     "unreachable inode\n%s",
2462 				     (printbuf_reset(&buf),
2463 				      bch2_bkey_val_to_text(&buf, c, inode_k),
2464 				      buf.buf)))
2465 				ret = reattach_inode(trans, &inode, snapshot);
2466 			goto out;
2467 		}
2468 
2469 		bch2_trans_iter_exit(trans, &dirent_iter);
2470 
2471 		if (!S_ISDIR(inode.bi_mode))
2472 			break;
2473 
2474 		ret = darray_push(p, ((struct pathbuf_entry) {
2475 			.inum		= inode.bi_inum,
2476 			.snapshot	= snapshot,
2477 		}));
2478 		if (ret)
2479 			goto out;
2480 
2481 		snapshot = parent_snapshot;
2482 
2483 		bch2_trans_iter_exit(trans, &inode_iter);
2484 		inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
2485 					     SPOS(0, inode.bi_dir, snapshot), 0);
2486 		ret = bkey_err(inode_k) ?:
2487 			!bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
2488 			: bch2_inode_unpack(inode_k, &inode);
2489 		if (ret) {
2490 			/* Should have been caught in dirents pass */
2491 			bch_err_msg(c, ret, "error looking up parent directory");
2492 			break;
2493 		}
2494 
2495 		snapshot = inode_k.k->p.snapshot;
2496 
2497 		if (path_is_dup(p, inode.bi_inum, snapshot)) {
2498 			/* XXX print path */
2499 			bch_err(c, "directory structure loop");
2500 
2501 			darray_for_each(*p, i)
2502 				pr_err("%llu:%u", i->inum, i->snapshot);
2503 			pr_err("%llu:%u", inode.bi_inum, snapshot);
2504 
2505 			if (fsck_err(trans, dir_loop, "directory structure loop")) {
2506 				ret = remove_backpointer(trans, &inode);
2507 				bch_err_msg(c, ret, "removing dirent");
2508 				if (ret)
2509 					break;
2510 
2511 				ret = reattach_inode(trans, &inode, snapshot);
2512 				bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
2513 			}
2514 			break;
2515 		}
2516 	}
2517 out:
2518 fsck_err:
2519 	bch2_trans_iter_exit(trans, &inode_iter);
2520 	printbuf_exit(&buf);
2521 	bch_err_fn(c, ret);
2522 	return ret;
2523 }
2524 
2525 /*
2526  * Check for unreachable inodes, as well as loops in the directory structure:
2527  * after bch2_check_dirents(), an inode whose backpointer doesn't point to a
2528  * valid dirent is unreachable:
2529  */
2530 int bch2_check_directory_structure(struct bch_fs *c)
2531 {
2532 	pathbuf path = { 0, };
2533 	int ret;
2534 
2535 	ret = bch2_trans_run(c,
2536 		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
2537 					  BTREE_ITER_intent|
2538 					  BTREE_ITER_prefetch|
2539 					  BTREE_ITER_all_snapshots, k,
2540 					  NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
2541 			if (!bkey_is_inode(k.k))
2542 				continue;
2543 
2544 			if (bch2_inode_flags(k) & BCH_INODE_unlinked)
2545 				continue;
2546 
2547 			check_path(trans, &path, k);
2548 		})));
2549 	darray_exit(&path);
2550 
2551 	bch_err_fn(c, ret);
2552 	return ret;
2553 }
2554 
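/*
 * Table of inodes that can be hardlinked (non-directories with nonzero
 * i_nlink), and the link counts we compute for them by walking dirents:
 */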
2555 struct nlink_table {
2556 	size_t		nr;
2557 	size_t		size;
2558 
2559 	struct nlink {
2560 		u64	inum;
2561 		u32	snapshot;
2562 		u32	count;
2563 	}		*d;
2564 };
2565 
2566 static int add_nlink(struct bch_fs *c, struct nlink_table *t,
2567 		     u64 inum, u32 snapshot)
2568 {
2569 	if (t->nr == t->size) {
2570 		size_t new_size = max_t(size_t, 128UL, t->size * 2);
2571 		void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);
2572 
2573 		if (!d) {
2574 			bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
2575 				new_size);
2576 			return -BCH_ERR_ENOMEM_fsck_add_nlink;
2577 		}
2578 
2579 		if (t->d)
2580 			memcpy(d, t->d, t->size * sizeof(t->d[0]));
2581 		kvfree(t->d);
2582 
2583 		t->d = d;
2584 		t->size = new_size;
2585 	}
2586 
2588 	t->d[t->nr++] = (struct nlink) {
2589 		.inum		= inum,
2590 		.snapshot	= snapshot,
2591 	};
2592 
2593 	return 0;
2594 }
2595 
2596 static int nlink_cmp(const void *_l, const void *_r)
2597 {
2598 	const struct nlink *l = _l;
2599 	const struct nlink *r = _r;
2600 
2601 	return cmp_int(l->inum, r->inum);
2602 }
2603 
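/*
 * Count a dirent's link to @inum, if @inum is within the range covered by
 * the table: binary search for the inode's entries, then bump each entry
 * whose snapshot this dirent is visible in.
 */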
2604 static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
2605 		     struct nlink_table *links,
2606 		     u64 range_start, u64 range_end, u64 inum, u32 snapshot)
2607 {
2608 	struct nlink *link, key = {
2609 		.inum = inum, .snapshot = U32_MAX,
2610 	};
2611 
2612 	if (inum < range_start || inum >= range_end)
2613 		return;
2614 
2615 	link = __inline_bsearch(&key, links->d, links->nr,
2616 				sizeof(links->d[0]), nlink_cmp);
2617 	if (!link)
2618 		return;
2619 
2620 	while (link > links->d && link[0].inum == link[-1].inum)
2621 		--link;
2622 
2623 	for (; link < links->d + links->nr && link->inum == inum; link++)
2624 		if (ref_visible(c, s, snapshot, link->snapshot)) {
2625 			link->count++;
2626 			if (link->snapshot >= snapshot)
2627 				break;
2628 		}
2629 }
2630 
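/*
 * First pass: walk inodes starting at @start and collect non-directory
 * inodes with nonzero i_nlink into the table; if the table can't be grown,
 * stop early and record where we stopped in *end.
 */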
2631 noinline_for_stack
2632 static int check_nlinks_find_hardlinks(struct bch_fs *c,
2633 				       struct nlink_table *t,
2634 				       u64 start, u64 *end)
2635 {
2636 	int ret = bch2_trans_run(c,
2637 		for_each_btree_key(trans, iter, BTREE_ID_inodes,
2638 				   POS(0, start),
2639 				   BTREE_ITER_intent|
2640 				   BTREE_ITER_prefetch|
2641 				   BTREE_ITER_all_snapshots, k, ({
2642 			if (!bkey_is_inode(k.k))
2643 				continue;
2644 
2645 			/* Should never fail, checked by bch2_inode_invalid: */
2646 			struct bch_inode_unpacked u;
2647 			BUG_ON(bch2_inode_unpack(k, &u));
2648 
2649 			/*
2650 			 * Backpointer and directory structure checks are sufficient for
2651 			 * directories, since they can't have hardlinks:
2652 			 */
2653 			if (S_ISDIR(u.bi_mode))
2654 				continue;
2655 
2656 			if (!u.bi_nlink)
2657 				continue;
2658 
2659 			ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
2660 			if (ret) {
2661 				*end = k.k->p.offset;
2662 				ret = 0;
2663 				break;
2664 			}
2665 			0;
2666 		})));
2667 
2668 	bch_err_fn(c, ret);
2669 	return ret;
2670 }
2671 
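/*
 * Second pass: walk dirents and count the links to each inode in the table:
 */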
2672 noinline_for_stack
2673 static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
2674 				     u64 range_start, u64 range_end)
2675 {
2676 	struct snapshots_seen s;
2677 
2678 	snapshots_seen_init(&s);
2679 
2680 	int ret = bch2_trans_run(c,
2681 		for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
2682 				   BTREE_ITER_intent|
2683 				   BTREE_ITER_prefetch|
2684 				   BTREE_ITER_all_snapshots, k, ({
2685 			ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
2686 			if (ret)
2687 				break;
2688 
2689 			if (k.k->type == KEY_TYPE_dirent) {
2690 				struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
2691 
2692 				if (d.v->d_type != DT_DIR &&
2693 				    d.v->d_type != DT_SUBVOL)
2694 					inc_link(c, &s, links, range_start, range_end,
2695 						 le64_to_cpu(d.v->d_inum), d.k->p.snapshot);
2696 			}
2697 			0;
2698 		})));
2699 
2700 	snapshots_seen_exit(&s);
2701 
2702 	bch_err_fn(c, ret);
2703 	return ret;
2704 }
2705 
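/*
 * Compare an inode's i_nlink against the link count we tallied from dirents,
 * and fix it if they disagree:
 */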
2706 static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
2707 				     struct bkey_s_c k,
2708 				     struct nlink_table *links,
2709 				     size_t *idx, u64 range_end)
2710 {
2711 	struct bch_inode_unpacked u;
2712 	struct nlink *link = &links->d[*idx];
2713 	int ret = 0;
2714 
2715 	if (k.k->p.offset >= range_end)
2716 		return 1;
2717 
2718 	if (!bkey_is_inode(k.k))
2719 		return 0;
2720 
2721 	BUG_ON(bch2_inode_unpack(k, &u));
2722 
2723 	if (S_ISDIR(u.bi_mode))
2724 		return 0;
2725 
2726 	if (!u.bi_nlink)
2727 		return 0;
2728 
2729 	while ((cmp_int(link->inum, k.k->p.offset) ?:
2730 		cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
2731 		BUG_ON(*idx == links->nr);
2732 		link = &links->d[++*idx];
2733 	}
2734 
2735 	if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
2736 			trans, inode_wrong_nlink,
2737 			"inode %llu type %s has wrong i_nlink (%u, should be %u)",
2738 			u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
2739 			bch2_inode_nlink_get(&u), link->count)) {
2740 		bch2_inode_nlink_set(&u, link->count);
2741 		ret = __bch2_fsck_write_inode(trans, &u, k.k->p.snapshot);
2742 	}
2743 fsck_err:
2744 	return ret;
2745 }
2746 
2747 noinline_for_stack
2748 static int check_nlinks_update_hardlinks(struct bch_fs *c,
2749 			       struct nlink_table *links,
2750 			       u64 range_start, u64 range_end)
2751 {
2752 	size_t idx = 0;
2753 
2754 	int ret = bch2_trans_run(c,
2755 		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
2756 				POS(0, range_start),
2757 				BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
2758 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2759 			check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
2760 	if (ret < 0) {
2761 		bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
2762 		return ret;
2763 	}
2764 
2765 	return 0;
2766 }
2767 
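/*
 * Check i_nlink for hardlinked files: done in multiple passes, each covering
 * the range of inode numbers whose nlink table fits in memory.
 */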
2768 int bch2_check_nlinks(struct bch_fs *c)
2769 {
2770 	struct nlink_table links = { 0 };
2771 	u64 this_iter_range_start, next_iter_range_start = 0;
2772 	int ret = 0;
2773 
2774 	do {
2775 		this_iter_range_start = next_iter_range_start;
2776 		next_iter_range_start = U64_MAX;
2777 
2778 		ret = check_nlinks_find_hardlinks(c, &links,
2779 						  this_iter_range_start,
2780 						  &next_iter_range_start);
		if (ret)
			break;

2782 		ret = check_nlinks_walk_dirents(c, &links,
2783 					  this_iter_range_start,
2784 					  next_iter_range_start);
2785 		if (ret)
2786 			break;
2787 
2788 		ret = check_nlinks_update_hardlinks(c, &links,
2789 					 this_iter_range_start,
2790 					 next_iter_range_start);
2791 		if (ret)
2792 			break;
2793 
2794 		links.nr = 0;
2795 	} while (next_iter_range_start != U64_MAX);
2796 
2797 	kvfree(links.d);
2798 	bch_err_fn(c, ret);
2799 	return ret;
2800 }
2801 
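/*
 * Clear the front_pad/back_pad fields of reflink pointers, for filesystems
 * from before the reflink_p_fix metadata version:
 */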
2802 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
2803 			     struct bkey_s_c k)
2804 {
2805 	struct bkey_s_c_reflink_p p;
2806 	struct bkey_i_reflink_p *u;
2807 
2808 	if (k.k->type != KEY_TYPE_reflink_p)
2809 		return 0;
2810 
2811 	p = bkey_s_c_to_reflink_p(k);
2812 
2813 	if (!p.v->front_pad && !p.v->back_pad)
2814 		return 0;
2815 
2816 	u = bch2_trans_kmalloc(trans, sizeof(*u));
2817 	int ret = PTR_ERR_OR_ZERO(u);
2818 	if (ret)
2819 		return ret;
2820 
2821 	bkey_reassemble(&u->k_i, k);
2822 	u->v.front_pad	= 0;
2823 	u->v.back_pad	= 0;
2824 
2825 	return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun);
2826 }
2827 
2828 int bch2_fix_reflink_p(struct bch_fs *c)
2829 {
2830 	if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
2831 		return 0;
2832 
2833 	int ret = bch2_trans_run(c,
2834 		for_each_btree_key_commit(trans, iter,
2835 				BTREE_ID_extents, POS_MIN,
2836 				BTREE_ITER_intent|BTREE_ITER_prefetch|
2837 				BTREE_ITER_all_snapshots, k,
2838 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2839 			fix_reflink_p_key(trans, &iter, k)));
2840 	bch_err_fn(c, ret);
2841 	return ret;
2842 }
2843