xref: /linux/fs/bcachefs/fsck.c (revision 4b3e3810738376b3292d1bf29996640843fbd9a0)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_cache.h"
6 #include "btree_update.h"
7 #include "buckets.h"
8 #include "darray.h"
9 #include "dirent.h"
10 #include "error.h"
11 #include "fs-common.h"
12 #include "fsck.h"
13 #include "inode.h"
14 #include "keylist.h"
15 #include "recovery_passes.h"
16 #include "snapshot.h"
17 #include "super.h"
18 #include "xattr.h"
19 
20 #include <linux/bsearch.h>
21 #include <linux/dcache.h> /* struct qstr */
22 
23 /*
24  * XXX: this is handling transaction restarts without returning
25  * -BCH_ERR_transaction_restart_nested; this is not how we do things anymore:
26  */
27 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
28 				    u32 snapshot)
29 {
30 	u64 sectors = 0;
31 
32 	int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
33 				SPOS(inum, 0, snapshot),
34 				POS(inum, U64_MAX),
35 				0, k, ({
36 		if (bkey_extent_is_allocation(k.k))
37 			sectors += k.k->size;
38 		0;
39 	}));
40 
41 	return ret ?: sectors;
42 }
43 
44 static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
45 				    u32 snapshot)
46 {
47 	u64 subdirs = 0;
48 
49 	int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_dirents,
50 				    SPOS(inum, 0, snapshot),
51 				    POS(inum, U64_MAX),
52 				    0, k, ({
53 		if (k.k->type == KEY_TYPE_dirent &&
54 		    bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
55 			subdirs++;
56 		0;
57 	}));
58 
59 	return ret ?: subdirs;
60 }
61 
62 static int subvol_lookup(struct btree_trans *trans, u32 subvol,
63 			 u32 *snapshot, u64 *inum)
64 {
65 	struct bch_subvolume s;
66 	int ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
67 
68 	*snapshot = le32_to_cpu(s.snapshot);
69 	*inum = le64_to_cpu(s.inode);
70 	return ret;
71 }
72 
73 static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
74 			      struct bch_inode_unpacked *inode)
75 {
76 	struct btree_iter iter;
77 	struct bkey_s_c k;
78 	int ret;
79 
80 	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr),
81 				     BTREE_ITER_all_snapshots, k, ret) {
82 		if (k.k->p.offset != inode_nr)
83 			break;
84 		if (!bkey_is_inode(k.k))
85 			continue;
86 		ret = bch2_inode_unpack(k, inode);
87 		goto found;
88 	}
89 	ret = -BCH_ERR_ENOENT_inode;
90 found:
91 	bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
92 	bch2_trans_iter_exit(trans, &iter);
93 	return ret;
94 }
95 
96 static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
97 			struct bch_inode_unpacked *inode,
98 			u32 *snapshot)
99 {
100 	struct btree_iter iter;
101 	struct bkey_s_c k;
102 	int ret;
103 
104 	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
105 			       SPOS(0, inode_nr, *snapshot), 0);
106 	ret = bkey_err(k);
107 	if (ret)
108 		goto err;
109 
110 	ret = bkey_is_inode(k.k)
111 		? bch2_inode_unpack(k, inode)
112 		: -BCH_ERR_ENOENT_inode;
113 	if (!ret)
114 		*snapshot = iter.pos.snapshot;
115 err:
116 	bch2_trans_iter_exit(trans, &iter);
117 	return ret;
118 }
119 
120 static int lookup_dirent_in_snapshot(struct btree_trans *trans,
121 			   struct bch_hash_info hash_info,
122 			   subvol_inum dir, struct qstr *name,
123 			   u64 *target, unsigned *type, u32 snapshot)
124 {
125 	struct btree_iter iter;
126 	struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
127 							 &hash_info, dir, name, 0, snapshot);
128 	int ret = bkey_err(k);
129 	if (ret)
130 		return ret;
131 
132 	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
133 	*target = le64_to_cpu(d.v->d_inum);
134 	*type = d.v->d_type;
135 	bch2_trans_iter_exit(trans, &iter);
136 	return 0;
137 }
138 
139 static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
140 {
141 	struct bch_fs *c = trans->c;
142 	struct btree_iter iter;
143 	struct bch_inode_unpacked dir_inode;
144 	struct bch_hash_info dir_hash_info;
145 	int ret;
146 
147 	ret = lookup_first_inode(trans, pos.inode, &dir_inode);
148 	if (ret)
149 		goto err;
150 
151 	dir_hash_info = bch2_hash_info_init(c, &dir_inode);
152 
153 	bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent);
154 
155 	ret =   bch2_btree_iter_traverse(&iter) ?:
156 		bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
157 				    &dir_hash_info, &iter,
158 				    BTREE_UPDATE_internal_snapshot_node);
159 	bch2_trans_iter_exit(trans, &iter);
160 err:
161 	bch_err_fn(c, ret);
162 	return ret;
163 }
164 
165 /* Get lost+found, create if it doesn't exist: */
166 static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
167 			    struct bch_inode_unpacked *lostfound,
168 			    u64 reattaching_inum)
169 {
170 	struct bch_fs *c = trans->c;
171 	struct qstr lostfound_str = QSTR("lost+found");
172 	u64 inum = 0;
173 	unsigned d_type = 0;
174 	int ret;
175 
176 	struct bch_snapshot_tree st;
177 	ret = bch2_snapshot_tree_lookup(trans,
178 			bch2_snapshot_tree(c, snapshot), &st);
179 	if (ret)
180 		return ret;
181 
182 	subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) };
183 
184 	struct bch_subvolume subvol;
185 	ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol),
186 				 false, 0, &subvol);
187 	bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u",
188 		    le32_to_cpu(st.master_subvol), snapshot);
189 	if (ret)
190 		return ret;
191 
192 	if (!subvol.inode) {
193 		struct btree_iter iter;
194 		struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter,
195 				BTREE_ID_subvolumes, POS(0, le32_to_cpu(st.master_subvol)),
196 				0, subvolume);
197 		ret = PTR_ERR_OR_ZERO(subvol);
198 		if (ret)
199 			return ret;
200 
201 		subvol->v.inode = cpu_to_le64(reattaching_inum);
202 		bch2_trans_iter_exit(trans, &iter);
203 	}
204 
205 	root_inum.inum = le64_to_cpu(subvol.inode);
206 
207 	struct bch_inode_unpacked root_inode;
208 	struct bch_hash_info root_hash_info;
209 	u32 root_inode_snapshot = snapshot;
210 	ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot);
211 	bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
212 		    root_inum.inum, le32_to_cpu(st.master_subvol));
213 	if (ret)
214 		return ret;
215 
216 	root_hash_info = bch2_hash_info_init(c, &root_inode);
217 
218 	ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
219 			      &lostfound_str, &inum, &d_type, snapshot);
220 	if (bch2_err_matches(ret, ENOENT))
221 		goto create_lostfound;
222 
223 	bch_err_fn(c, ret);
224 	if (ret)
225 		return ret;
226 
227 	if (d_type != DT_DIR) {
228 		bch_err(c, "error looking up lost+found: not a directory");
229 		return -BCH_ERR_ENOENT_not_directory;
230 	}
231 
232 	/*
233 	 * The bch2_check_dirents pass has already run, so dangling dirents
234 	 * shouldn't exist here:
235 	 */
236 	ret = lookup_inode(trans, inum, lostfound, &snapshot);
237 	bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
238 		    inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
239 	return ret;
240 
241 create_lostfound:
242 	/*
243 	 * XXX: we could have a nicer log message here if we had a nice way to
244 	 * walk backpointers to print a path
245 	 */
246 	bch_notice(c, "creating lost+found in snapshot %u", le32_to_cpu(st.root_snapshot));
247 
248 	u64 now = bch2_current_time(c);
249 	struct btree_iter lostfound_iter = { NULL };
250 	u64 cpu = raw_smp_processor_id();
251 
252 	bch2_inode_init_early(c, lostfound);
253 	bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
254 	lostfound->bi_dir = root_inode.bi_inum;
255 
256 	root_inode.bi_nlink++;
257 
258 	ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
259 	if (ret)
260 		goto err;
261 
262 	bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot);
263 	ret = bch2_btree_iter_traverse(&lostfound_iter);
264 	if (ret)
265 		goto err;
266 
267 	ret =   bch2_dirent_create_snapshot(trans,
268 				0, root_inode.bi_inum, snapshot, &root_hash_info,
269 				mode_to_type(lostfound->bi_mode),
270 				&lostfound_str,
271 				lostfound->bi_inum,
272 				&lostfound->bi_dir_offset,
273 				STR_HASH_must_create) ?:
274 		bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
275 				       BTREE_UPDATE_internal_snapshot_node);
276 err:
277 	bch_err_msg(c, ret, "creating lost+found");
278 	bch2_trans_iter_exit(trans, &lostfound_iter);
279 	return ret;
280 }
281 
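/*
 * Reattach an unreachable inode: create a dirent for it in lost+found, named
 * after its inode number (or "subvol-<N>" for a subvolume root), then point
 * the inode's bi_dir/bi_dir_offset backpointer at the new dirent.
 */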
282 static int reattach_inode(struct btree_trans *trans,
283 			  struct bch_inode_unpacked *inode,
284 			  u32 inode_snapshot)
285 {
286 	struct bch_hash_info dir_hash;
287 	struct bch_inode_unpacked lostfound;
288 	char name_buf[20];
289 	struct qstr name;
290 	u64 dir_offset = 0;
291 	u32 dirent_snapshot = inode_snapshot;
292 	int ret;
293 
294 	if (inode->bi_subvol) {
295 		inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;
296 
297 		u64 root_inum;
298 		ret = subvol_lookup(trans, inode->bi_parent_subvol,
299 				    &dirent_snapshot, &root_inum);
300 		if (ret)
301 			return ret;
302 
303 		snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
304 	} else {
305 		snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
306 	}
307 
308 	ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
309 	if (ret)
310 		return ret;
311 
312 	if (S_ISDIR(inode->bi_mode)) {
313 		lostfound.bi_nlink++;
314 
315 		ret = __bch2_fsck_write_inode(trans, &lostfound, U32_MAX);
316 		if (ret)
317 			return ret;
318 	}
319 
320 	dir_hash = bch2_hash_info_init(trans->c, &lostfound);
321 
322 	name = (struct qstr) QSTR(name_buf);
323 
324 	ret = bch2_dirent_create_snapshot(trans,
325 				inode->bi_parent_subvol, lostfound.bi_inum,
326 				dirent_snapshot,
327 				&dir_hash,
328 				inode_d_type(inode),
329 				&name,
330 				inode->bi_subvol ?: inode->bi_inum,
331 				&dir_offset,
332 				STR_HASH_must_create);
333 	if (ret)
334 		return ret;
335 
336 	inode->bi_dir		= lostfound.bi_inum;
337 	inode->bi_dir_offset	= dir_offset;
338 
339 	return __bch2_fsck_write_inode(trans, inode, inode_snapshot);
340 }
341 
342 static int remove_backpointer(struct btree_trans *trans,
343 			      struct bch_inode_unpacked *inode)
344 {
345 	struct btree_iter iter;
346 	struct bkey_s_c_dirent d;
347 	int ret;
348 
349 	d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents,
350 				     POS(inode->bi_dir, inode->bi_dir_offset), 0,
351 				     dirent);
352 	ret =   bkey_err(d) ?:
353 		__remove_dirent(trans, d.k->p);
354 	bch2_trans_iter_exit(trans, &iter);
355 	return ret;
356 }
357 
358 static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s)
359 {
360 	struct bch_fs *c = trans->c;
361 
362 	struct bch_inode_unpacked inode;
363 	int ret = bch2_inode_find_by_inum_trans(trans,
364 				(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
365 				&inode);
366 	if (ret)
367 		return ret;
368 
369 	ret = remove_backpointer(trans, &inode);
370 	bch_err_msg(c, ret, "removing dirent");
371 	if (ret)
372 		return ret;
373 
374 	ret = reattach_inode(trans, &inode, le32_to_cpu(s.v->snapshot));
375 	bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
376 	return ret;
377 }
378 
379 static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
380 {
381 	struct bch_fs *c = trans->c;
382 
383 	if (!bch2_snapshot_is_leaf(c, snapshotid)) {
384 		bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
385 		return -BCH_ERR_fsck_repair_unimplemented;
386 	}
387 
388 	/*
389 	 * If inum isn't set, that means we're being called from check_dirents,
390 	 * not check_inodes - the root of this subvolume doesn't exist or we
391 	 * would have found it there:
392 	 */
393 	if (!inum) {
394 		struct btree_iter inode_iter = {};
395 		struct bch_inode_unpacked new_inode;
396 		u64 cpu = raw_smp_processor_id();
397 
398 		bch2_inode_init_early(c, &new_inode);
399 		bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);
400 
401 		new_inode.bi_subvol = subvolid;
402 
403 		int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
404 			  bch2_btree_iter_traverse(&inode_iter) ?:
405 			  bch2_inode_write(trans, &inode_iter, &new_inode);
406 		bch2_trans_iter_exit(trans, &inode_iter);
407 		if (ret)
408 			return ret;
409 
410 		inum = new_inode.bi_inum;
411 	}
412 
413 	bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);
414 
415 	struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
416 	int ret = PTR_ERR_OR_ZERO(new_subvol);
417 	if (ret)
418 		return ret;
419 
420 	bkey_subvolume_init(&new_subvol->k_i);
421 	new_subvol->k.p.offset	= subvolid;
422 	new_subvol->v.snapshot	= cpu_to_le32(snapshotid);
423 	new_subvol->v.inode	= cpu_to_le64(inum);
424 	ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
425 	if (ret)
426 		return ret;
427 
428 	struct btree_iter iter;
429 	struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
430 			BTREE_ID_snapshots, POS(0, snapshotid),
431 			0, snapshot);
432 	ret = PTR_ERR_OR_ZERO(s);
433 	bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
434 	if (ret)
435 		return ret;
436 
437 	u32 snapshot_tree = le32_to_cpu(s->v.tree);
438 
439 	s->v.subvol = cpu_to_le32(subvolid);
440 	SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
441 	bch2_trans_iter_exit(trans, &iter);
442 
443 	struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
444 			BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
445 			0, snapshot_tree);
446 	ret = PTR_ERR_OR_ZERO(st);
447 	bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
448 	if (ret)
449 		return ret;
450 
451 	if (!st->v.master_subvol)
452 		st->v.master_subvol = cpu_to_le32(subvolid);
453 
454 	bch2_trans_iter_exit(trans, &iter);
455 	return 0;
456 }
457 
458 static int reconstruct_inode(struct btree_trans *trans, u32 snapshot, u64 inum, u64 size, unsigned mode)
459 {
460 	struct bch_fs *c = trans->c;
461 	struct bch_inode_unpacked new_inode;
462 
463 	bch2_inode_init_early(c, &new_inode);
464 	bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, mode|0755, 0, NULL);
465 	new_inode.bi_size = size;
466 	new_inode.bi_inum = inum;
467 
468 	return __bch2_fsck_write_inode(trans, &new_inode, snapshot);
469 }
470 
471 static int reconstruct_reg_inode(struct btree_trans *trans, u32 snapshot, u64 inum)
472 {
473 	struct btree_iter iter = {};
474 
475 	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
476 	struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter);
477 	bch2_trans_iter_exit(trans, &iter);
478 	int ret = bkey_err(k);
479 	if (ret)
480 		return ret;
481 
482 	return reconstruct_inode(trans, snapshot, inum, k.k->p.offset << 9, S_IFREG);
483 }
484 
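/*
 * A snapshots_seen tracks the snapshot IDs of the keys already visited at the
 * current position (same inode:offset, iterated across all snapshots); it is
 * what lets key_visible_in_snapshot() decide whether a key in an ancestor
 * snapshot has since been overwritten.
 */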
485 struct snapshots_seen {
486 	struct bpos			pos;
487 	snapshot_id_list		ids;
488 };
489 
490 static inline void snapshots_seen_exit(struct snapshots_seen *s)
491 {
492 	darray_exit(&s->ids);
493 }
494 
495 static inline void snapshots_seen_init(struct snapshots_seen *s)
496 {
497 	memset(s, 0, sizeof(*s));
498 }
499 
500 static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
501 {
502 	u32 *i;
503 	__darray_for_each(s->ids, i) {
504 		if (*i == id)
505 			return 0;
506 		if (*i > id)
507 			break;
508 	}
509 
510 	int ret = darray_insert_item(&s->ids, i - s->ids.data, id);
511 	if (ret)
512 		bch_err(c, "error reallocating snapshots_seen table (size %zu)",
513 			s->ids.size);
514 	return ret;
515 }
516 
517 static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
518 				 enum btree_id btree_id, struct bpos pos)
519 {
520 	if (!bkey_eq(s->pos, pos))
521 		s->ids.nr = 0;
522 	s->pos = pos;
523 
524 	return snapshot_list_add_nodup(c, &s->ids, pos.snapshot);
525 }
526 
527 /**
528  * key_visible_in_snapshot - returns true if @id is a descendant of @ancestor,
529  * and @ancestor hasn't been overwritten in @seen
530  *
531  * @c:		filesystem handle
532  * @seen:	list of snapshot ids already seen at current position
533  * @id:		descendant snapshot id
534  * @ancestor:	ancestor snapshot id
535  *
536  * Returns:	whether key in @ancestor snapshot is visible in @id snapshot
537  */
538 static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
539 				    u32 id, u32 ancestor)
540 {
541 	ssize_t i;
542 
543 	EBUG_ON(id > ancestor);
544 
545 	/* @ancestor should be the snapshot most recently added to @seen */
546 	EBUG_ON(ancestor != seen->pos.snapshot);
547 	EBUG_ON(ancestor != darray_last(seen->ids));
548 
549 	if (id == ancestor)
550 		return true;
551 
552 	if (!bch2_snapshot_is_ancestor(c, id, ancestor))
553 		return false;
554 
555 	/*
556 	 * We know that @id is a descendant of @ancestor; we're checking if
557 	 * we've seen a key that overwrote @ancestor - i.e. also a descendant of
558 	 * @ancestor and with @id as a descendant.
559 	 *
560 	 * But we already know that we're scanning IDs between @id and @ancestor
561 	 * numerically, since snapshot ID lists are kept sorted, so if we find
562 	 * an id that's an ancestor of @id we're done:
563 	 */
564 
565 	for (i = seen->ids.nr - 2;
566 	     i >= 0 && seen->ids.data[i] >= id;
567 	     --i)
568 		if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]))
569 			return false;
570 
571 	return true;
572 }
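
/*
 * Illustrative example (hypothetical snapshot IDs): with @seen->ids = {4, 7, 10}
 * at the current position, asking whether the key in snapshot 10 (@ancestor) is
 * visible in snapshot 3 (@id, a descendant of 10) scans backwards over 7 and 4;
 * if either of those is an ancestor of 3, a key in that snapshot overwrote the
 * key in 10, so the key is not visible in 3.
 */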
573 
574 /**
575  * ref_visible - given a key with snapshot id @src that points to a key with
576  * snapshot id @dst, test whether there is some snapshot in which @dst is
577  * visible.
578  *
579  * @c:		filesystem handle
580  * @s:		list of snapshot IDs already seen at @src
581  * @src:	snapshot ID of src key
582  * @dst:	snapshot ID of dst key
583  * Returns:	true if there is some snapshot in which @dst is visible
584  *
585  * Assumes we're visiting @src keys in natural key order
586  */
587 static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
588 			u32 src, u32 dst)
589 {
590 	return dst <= src
591 		? key_visible_in_snapshot(c, s, dst, src)
592 		: bch2_snapshot_is_ancestor(c, src, dst);
593 }
594 
595 static int ref_visible2(struct bch_fs *c,
596 			u32 src, struct snapshots_seen *src_seen,
597 			u32 dst, struct snapshots_seen *dst_seen)
598 {
599 	if (dst > src) {
600 		swap(dst, src);
601 		swap(dst_seen, src_seen);
602 	}
603 	return key_visible_in_snapshot(c, src_seen, dst, src);
604 }
605 
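/*
 * Iterate over the inode versions in @_w to which a key in snapshot @_snapshot
 * is visible: versions in @_snapshot itself, or in descendant snapshots where
 * the key hasn't been overwritten (according to @_s).
 */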
606 #define for_each_visible_inode(_c, _s, _w, _snapshot, _i)				\
607 	for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr &&	\
608 	     (_i)->snapshot <= (_snapshot); _i++)					\
609 		if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
610 
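/*
 * An inode_walker collects every snapshot version of a single inode number, so
 * that keys in other btrees (extents, dirents) can be checked against the
 * inode version visible in their snapshot; each entry's count accumulates
 * sectors or subdirectories for later comparison against the inode's fields.
 */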
611 struct inode_walker_entry {
612 	struct bch_inode_unpacked inode;
613 	u32			snapshot;
614 	bool			seen_this_pos;
615 	u64			count;
616 };
617 
618 struct inode_walker {
619 	bool				first_this_inode;
620 	bool				recalculate_sums;
621 	struct bpos			last_pos;
622 
623 	DARRAY(struct inode_walker_entry) inodes;
624 };
625 
626 static void inode_walker_exit(struct inode_walker *w)
627 {
628 	darray_exit(&w->inodes);
629 }
630 
631 static struct inode_walker inode_walker_init(void)
632 {
633 	return (struct inode_walker) { 0, };
634 }
635 
636 static int add_inode(struct bch_fs *c, struct inode_walker *w,
637 		     struct bkey_s_c inode)
638 {
639 	struct bch_inode_unpacked u;
640 
641 	BUG_ON(bch2_inode_unpack(inode, &u));
642 
643 	return darray_push(&w->inodes, ((struct inode_walker_entry) {
644 		.inode		= u,
645 		.snapshot	= inode.k->p.snapshot,
646 	}));
647 }
648 
649 static int get_inodes_all_snapshots(struct btree_trans *trans,
650 				    struct inode_walker *w, u64 inum)
651 {
652 	struct bch_fs *c = trans->c;
653 	struct btree_iter iter;
654 	struct bkey_s_c k;
655 	int ret;
656 
657 	w->recalculate_sums = false;
658 	w->inodes.nr = 0;
659 
660 	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
661 				     BTREE_ITER_all_snapshots, k, ret) {
662 		if (k.k->p.offset != inum)
663 			break;
664 
665 		if (bkey_is_inode(k.k))
666 			add_inode(c, w, k);
667 	}
668 	bch2_trans_iter_exit(trans, &iter);
669 
670 	if (ret)
671 		return ret;
672 
673 	w->first_this_inode = true;
674 	return 0;
675 }
676 
677 static struct inode_walker_entry *
678 lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k)
679 {
680 	bool is_whiteout = k.k->type == KEY_TYPE_whiteout;
681 
682 	struct inode_walker_entry *i;
683 	__darray_for_each(w->inodes, i)
684 		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot))
685 			goto found;
686 
687 	return NULL;
688 found:
689 	BUG_ON(k.k->p.snapshot > i->snapshot);
690 
691 	if (k.k->p.snapshot != i->snapshot && !is_whiteout) {
692 		struct inode_walker_entry new = *i;
693 
694 		new.snapshot = k.k->p.snapshot;
695 		new.count = 0;
696 
697 		struct printbuf buf = PRINTBUF;
698 		bch2_bkey_val_to_text(&buf, c, k);
699 
700 		bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
701 			 "unexpected because we should always update the inode when we update a key in that inode\n"
702 			 "%s",
703 			 w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf);
704 		printbuf_exit(&buf);
705 
706 		while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot)
707 			--i;
708 
709 		size_t pos = i - w->inodes.data;
710 		int ret = darray_insert_item(&w->inodes, pos, new);
711 		if (ret)
712 			return ERR_PTR(ret);
713 
714 		i = w->inodes.data + pos;
715 	}
716 
717 	return i;
718 }
719 
720 static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
721 					     struct inode_walker *w,
722 					     struct bkey_s_c k)
723 {
724 	if (w->last_pos.inode != k.k->p.inode) {
725 		int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
726 		if (ret)
727 			return ERR_PTR(ret);
728 	} else if (bkey_cmp(w->last_pos, k.k->p)) {
729 		darray_for_each(w->inodes, i)
730 			i->seen_this_pos = false;
731 	}
732 
733 	w->last_pos = k.k->p;
734 
735 	return lookup_inode_for_snapshot(trans->c, w, k);
736 }
737 
738 static int get_visible_inodes(struct btree_trans *trans,
739 			      struct inode_walker *w,
740 			      struct snapshots_seen *s,
741 			      u64 inum)
742 {
743 	struct bch_fs *c = trans->c;
744 	struct btree_iter iter;
745 	struct bkey_s_c k;
746 	int ret;
747 
748 	w->inodes.nr = 0;
749 
750 	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
751 			   BTREE_ITER_all_snapshots, k, ret) {
752 		if (k.k->p.offset != inum)
753 			break;
754 
755 		if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot))
756 			continue;
757 
758 		if (bkey_is_inode(k.k))
759 			add_inode(c, w, k);
760 
761 		if (k.k->p.snapshot >= s->pos.snapshot)
762 			break;
763 	}
764 	bch2_trans_iter_exit(trans, &iter);
765 
766 	return ret;
767 }
768 
769 static int hash_redo_key(struct btree_trans *trans,
770 			 const struct bch_hash_desc desc,
771 			 struct bch_hash_info *hash_info,
772 			 struct btree_iter *k_iter, struct bkey_s_c k)
773 {
774 	struct bkey_i *delete;
775 	struct bkey_i *tmp;
776 
777 	delete = bch2_trans_kmalloc(trans, sizeof(*delete));
778 	if (IS_ERR(delete))
779 		return PTR_ERR(delete);
780 
781 	tmp = bch2_bkey_make_mut_noupdate(trans, k);
782 	if (IS_ERR(tmp))
783 		return PTR_ERR(tmp);
784 
785 	bkey_init(&delete->k);
786 	delete->k.p = k_iter->pos;
787 	return  bch2_btree_iter_traverse(k_iter) ?:
788 		bch2_trans_update(trans, k_iter, delete, 0) ?:
789 		bch2_hash_set_in_snapshot(trans, desc, hash_info,
790 				       (subvol_inum) { 0, k.k->p.inode },
791 				       k.k->p.snapshot, tmp,
792 				       STR_HASH_must_create|
793 				       BTREE_UPDATE_internal_snapshot_node) ?:
794 		bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
795 }
796 
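/*
 * Verify that a hashed key (dirent/xattr) is at the offset its hash says it
 * should be: a key before its hash bucket is always wrong; otherwise walk the
 * probe sequence from the hash bucket - hitting an empty slot first means the
 * key is unreachable by lookups and gets rehashed, and finding another key
 * with identical contents means we have a duplicate to delete.
 */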
797 static int hash_check_key(struct btree_trans *trans,
798 			  const struct bch_hash_desc desc,
799 			  struct bch_hash_info *hash_info,
800 			  struct btree_iter *k_iter, struct bkey_s_c hash_k)
801 {
802 	struct bch_fs *c = trans->c;
803 	struct btree_iter iter = { NULL };
804 	struct printbuf buf = PRINTBUF;
805 	struct bkey_s_c k;
806 	u64 hash;
807 	int ret = 0;
808 
809 	if (hash_k.k->type != desc.key_type)
810 		return 0;
811 
812 	hash = desc.hash_bkey(hash_info, hash_k);
813 
814 	if (likely(hash == hash_k.k->p.offset))
815 		return 0;
816 
817 	if (hash_k.k->p.offset < hash)
818 		goto bad_hash;
819 
820 	for_each_btree_key_norestart(trans, iter, desc.btree_id,
821 				     SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
822 				     BTREE_ITER_slots, k, ret) {
823 		if (bkey_eq(k.k->p, hash_k.k->p))
824 			break;
825 
826 		if (fsck_err_on(k.k->type == desc.key_type &&
827 				!desc.cmp_bkey(k, hash_k), c,
828 				hash_table_key_duplicate,
829 				"duplicate hash table keys:\n%s",
830 				(printbuf_reset(&buf),
831 				 bch2_bkey_val_to_text(&buf, c, hash_k),
832 				 buf.buf))) {
833 			ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
834 			break;
835 		}
836 
837 		if (bkey_deleted(k.k)) {
838 			bch2_trans_iter_exit(trans, &iter);
839 			goto bad_hash;
840 		}
841 	}
842 out:
843 	bch2_trans_iter_exit(trans, &iter);
844 	printbuf_exit(&buf);
845 	return ret;
846 bad_hash:
847 	if (fsck_err(c, hash_table_key_wrong_offset,
848 		     "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
849 		     bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
850 		     (printbuf_reset(&buf),
851 		      bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
852 		ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
853 		bch_err_fn(c, ret);
854 		if (ret)
855 			return ret;
856 		ret = -BCH_ERR_transaction_restart_nested;
857 	}
858 fsck_err:
859 	goto out;
860 }
861 
862 static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
863 						struct btree_iter *iter,
864 						struct bpos pos)
865 {
866 	return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
867 }
868 
869 static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
870 					       struct btree_iter *iter,
871 					       struct bch_inode_unpacked *inode,
872 					       u32 *snapshot)
873 {
874 	if (inode->bi_subvol) {
875 		u64 inum;
876 		int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
877 		if (ret)
878 			return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) });
879 	}
880 
881 	return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
882 }
883 
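/*
 * An inode's bi_dir/bi_dir_offset fields are a backpointer to the dirent that
 * references it; these two helpers check each direction of that relationship.
 */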
884 static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
885 				   struct bkey_s_c_dirent d)
886 {
887 	return  inode->bi_dir		== d.k->p.inode &&
888 		inode->bi_dir_offset	== d.k->p.offset;
889 }
890 
891 static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
892 				   struct bch_inode_unpacked *inode)
893 {
894 	return d.v->d_type == DT_SUBVOL
895 		? le32_to_cpu(d.v->d_child_subvol)	== inode->bi_subvol
896 		: le64_to_cpu(d.v->d_inum)		== inode->bi_inum;
897 }
898 
899 static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
900 {
901 	struct btree_iter iter;
902 	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
903 	int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
904 	bch2_trans_iter_exit(trans, &iter);
905 	return ret;
906 }
907 
908 static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c inode_k,
909 				    struct bch_inode_unpacked *inode,
910 				    u32 inode_snapshot, bool *write_inode)
911 {
912 	struct bch_fs *c = trans->c;
913 	struct printbuf buf = PRINTBUF;
914 
915 	struct btree_iter dirent_iter = {};
916 	struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
917 	int ret = bkey_err(d);
918 	if (ret && !bch2_err_matches(ret, ENOENT))
919 		return ret;
920 
921 	if (fsck_err_on(ret,
922 			c, inode_points_to_missing_dirent,
923 			"inode points to missing dirent\n%s",
924 			(bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) ||
925 	    fsck_err_on(!ret && !dirent_points_to_inode(d, inode),
926 			c, inode_points_to_wrong_dirent,
927 			"inode points to dirent that does not point back:\n%s",
928 			(bch2_bkey_val_to_text(&buf, c, inode_k),
929 			 prt_newline(&buf),
930 			 bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
931 		/*
932 		 * We just clear the backpointer fields for now. If we find a
933 		 * dirent that points to this inode in check_dirents(), we'll
934 		 * update it then; when we get to check_path(), if the
935 		 * backpointer is still 0 we'll reattach it.
936 		 */
937 		inode->bi_dir = 0;
938 		inode->bi_dir_offset = 0;
939 		inode->bi_flags &= ~BCH_INODE_backptr_untrusted;
940 		*write_inode = true;
941 	}
942 
943 	ret = 0;
944 fsck_err:
945 	bch2_trans_iter_exit(trans, &dirent_iter);
946 	printbuf_exit(&buf);
947 	bch_err_fn(c, ret);
948 	return ret;
949 }
950 
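/*
 * Per-inode-key checks: verify the key's snapshot, repair flags left over from
 * an unclean shutdown (unlinked, i_size_dirty, i_sectors_dirty), verify the
 * backpointer to the dirent referencing this inode, and verify/reconstruct
 * subvolume linkage.
 */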
951 static int check_inode(struct btree_trans *trans,
952 		       struct btree_iter *iter,
953 		       struct bkey_s_c k,
954 		       struct bch_inode_unpacked *prev,
955 		       struct snapshots_seen *s,
956 		       bool full)
957 {
958 	struct bch_fs *c = trans->c;
959 	struct bch_inode_unpacked u;
960 	bool do_update = false;
961 	int ret;
962 
963 	ret = bch2_check_key_has_snapshot(trans, iter, k);
964 	if (ret < 0)
965 		goto err;
966 	if (ret)
967 		return 0;
968 
969 	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
970 	if (ret)
971 		goto err;
972 
973 	if (!bkey_is_inode(k.k))
974 		return 0;
975 
976 	BUG_ON(bch2_inode_unpack(k, &u));
977 
978 	if (!full &&
979 	    !(u.bi_flags & (BCH_INODE_i_size_dirty|
980 			    BCH_INODE_i_sectors_dirty|
981 			    BCH_INODE_unlinked)))
982 		return 0;
983 
984 	if (prev->bi_inum != u.bi_inum)
985 		*prev = u;
986 
987 	if (fsck_err_on(prev->bi_hash_seed	!= u.bi_hash_seed ||
988 			inode_d_type(prev)	!= inode_d_type(&u),
989 			c, inode_snapshot_mismatch,
990 			"inodes in different snapshots don't match")) {
991 		bch_err(c, "repair not implemented yet");
992 		return -BCH_ERR_fsck_repair_unimplemented;
993 	}
994 
995 	if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) &&
996 	    bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) {
997 		struct bpos new_min_pos;
998 
999 		ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos);
1000 		if (ret)
1001 			goto err;
1002 
1003 		u.bi_flags &= ~(BCH_INODE_i_size_dirty|BCH_INODE_unlinked);
1004 
1005 		ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
1006 
1007 		bch_err_msg(c, ret, "in fsck updating inode");
1008 		if (ret)
1009 			return ret;
1010 
1011 		if (!bpos_eq(new_min_pos, POS_MIN))
1012 			bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos));
1013 		return 0;
1014 	}
1015 
1016 	if (u.bi_flags & BCH_INODE_unlinked) {
1017 		ret = check_inode_deleted_list(trans, k.k->p);
1018 		if (ret < 0)
1019 			return ret;
1020 
1021 		fsck_err_on(!ret, c, unlinked_inode_not_on_deleted_list,
1022 			    "inode %llu:%u unlinked, but not on deleted list",
1023 			    u.bi_inum, k.k->p.snapshot);
1024 		ret = 0;
1025 	}
1026 
1027 	if (u.bi_flags & BCH_INODE_unlinked &&
1028 	    (!c->sb.clean ||
1029 	     fsck_err(c, inode_unlinked_but_clean,
1030 		      "filesystem marked clean, but inode %llu unlinked",
1031 		      u.bi_inum))) {
1032 		ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
1033 		bch_err_msg(c, ret, "in fsck deleting inode");
1034 		return ret;
1035 	}
1036 
1037 	if (u.bi_flags & BCH_INODE_i_size_dirty &&
1038 	    (!c->sb.clean ||
1039 	     fsck_err(c, inode_i_size_dirty_but_clean,
1040 		      "filesystem marked clean, but inode %llu has i_size dirty",
1041 		      u.bi_inum))) {
1042 		bch_verbose(c, "truncating inode %llu", u.bi_inum);
1043 
1044 		/*
1045 		 * XXX: need to truncate partial blocks too here - or ideally
1046 		 * just switch units to bytes and that issue goes away
1047 		 */
1048 		ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
1049 				SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
1050 				     iter->pos.snapshot),
1051 				POS(u.bi_inum, U64_MAX),
1052 				0, NULL);
1053 		bch_err_msg(c, ret, "in fsck truncating inode");
1054 		if (ret)
1055 			return ret;
1056 
1057 		/*
1058 		 * We truncated without our normal sector accounting hook, just
1059 		 * make sure we recalculate it:
1060 		 */
1061 		u.bi_flags |= BCH_INODE_i_sectors_dirty;
1062 
1063 		u.bi_flags &= ~BCH_INODE_i_size_dirty;
1064 		do_update = true;
1065 	}
1066 
1067 	if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
1068 	    (!c->sb.clean ||
1069 	     fsck_err(c, inode_i_sectors_dirty_but_clean,
1070 		      "filesystem marked clean, but inode %llu has i_sectors dirty",
1071 		      u.bi_inum))) {
1072 		s64 sectors;
1073 
1074 		bch_verbose(c, "recounting sectors for inode %llu",
1075 			    u.bi_inum);
1076 
1077 		sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
1078 		if (sectors < 0) {
1079 			bch_err_msg(c, sectors, "in fsck recounting inode sectors");
1080 			return sectors;
1081 		}
1082 
1083 		u.bi_sectors = sectors;
1084 		u.bi_flags &= ~BCH_INODE_i_sectors_dirty;
1085 		do_update = true;
1086 	}
1087 
1088 	if (u.bi_flags & BCH_INODE_backptr_untrusted) {
1089 		u.bi_dir = 0;
1090 		u.bi_dir_offset = 0;
1091 		u.bi_flags &= ~BCH_INODE_backptr_untrusted;
1092 		do_update = true;
1093 	}
1094 
1095 	if (u.bi_dir || u.bi_dir_offset) {
1096 		ret = check_inode_dirent_inode(trans, k, &u, k.k->p.snapshot, &do_update);
1097 		if (ret)
1098 			goto err;
1099 	}
1100 
1101 	if (fsck_err_on(u.bi_parent_subvol &&
1102 			(u.bi_subvol == 0 ||
1103 			 u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
1104 			c, inode_bi_parent_nonzero,
1105 			"inode %llu:%u has subvol %u but nonzero parent subvol %u",
1106 			u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
1107 		u.bi_parent_subvol = 0;
1108 		do_update = true;
1109 	}
1110 
1111 	if (u.bi_subvol) {
1112 		struct bch_subvolume s;
1113 
1114 		ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s);
1115 		if (ret && !bch2_err_matches(ret, ENOENT))
1116 			goto err;
1117 
1118 		if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
1119 			ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum);
1120 			goto do_update;
1121 		}
1122 
1123 		if (fsck_err_on(ret,
1124 				c, inode_bi_subvol_missing,
1125 				"inode %llu:%u bi_subvol points to missing subvolume %u",
1126 				u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
1127 		    fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
1128 				!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
1129 							   k.k->p.snapshot),
1130 				c, inode_bi_subvol_wrong,
1131 				"inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
1132 				u.bi_inum, k.k->p.snapshot, u.bi_subvol,
1133 				le64_to_cpu(s.inode),
1134 				le32_to_cpu(s.snapshot))) {
1135 			u.bi_subvol = 0;
1136 			u.bi_parent_subvol = 0;
1137 			do_update = true;
1138 		}
1139 	}
1140 do_update:
1141 	if (do_update) {
1142 		ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
1143 		bch_err_msg(c, ret, "in fsck updating inode");
1144 		if (ret)
1145 			return ret;
1146 	}
1147 err:
1148 fsck_err:
1149 	bch_err_fn(c, ret);
1150 	return ret;
1151 }
1152 
1153 int bch2_check_inodes(struct bch_fs *c)
1154 {
1155 	bool full = c->opts.fsck;
1156 	struct bch_inode_unpacked prev = { 0 };
1157 	struct snapshots_seen s;
1158 
1159 	snapshots_seen_init(&s);
1160 
1161 	int ret = bch2_trans_run(c,
1162 		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
1163 				POS_MIN,
1164 				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
1165 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1166 			check_inode(trans, &iter, k, &prev, &s, full)));
1167 
1168 	snapshots_seen_exit(&s);
1169 	bch_err_fn(c, ret);
1170 	return ret;
1171 }
1172 
1173 static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
1174 {
1175 	struct bch_fs *c = trans->c;
1176 	int ret = 0;
1177 	s64 count2;
1178 
1179 	darray_for_each(w->inodes, i) {
1180 		if (i->inode.bi_sectors == i->count)
1181 			continue;
1182 
1183 		count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);
1184 
1185 		if (w->recalculate_sums)
1186 			i->count = count2;
1187 
1188 		if (i->count != count2) {
1189 			bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
1190 					    w->last_pos.inode, i->snapshot, i->count, count2);
1191 			return -BCH_ERR_internal_fsck_err;
1192 		}
1193 
1194 		if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
1195 				c, inode_i_sectors_wrong,
1196 				"inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
1197 				w->last_pos.inode, i->snapshot,
1198 				i->inode.bi_sectors, i->count)) {
1199 			i->inode.bi_sectors = i->count;
1200 			ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
1201 			if (ret)
1202 				break;
1203 		}
1204 	}
1205 fsck_err:
1206 	bch_err_fn(c, ret);
1207 	return ret;
1208 }
1209 
1210 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
1211 {
1212 	u32 restart_count = trans->restart_count;
1213 	return check_i_sectors_notnested(trans, w) ?:
1214 		trans_was_restarted(trans, restart_count);
1215 }
1216 
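/*
 * extent_ends remembers, for each snapshot, where the last extent seen in the
 * current inode ended (and which snapshots had been seen at that point), so
 * that check_overlapping_extents() can detect a new extent starting before an
 * earlier one ended in some visible snapshot.
 */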
1217 struct extent_end {
1218 	u32			snapshot;
1219 	u64			offset;
1220 	struct snapshots_seen	seen;
1221 };
1222 
1223 struct extent_ends {
1224 	struct bpos			last_pos;
1225 	DARRAY(struct extent_end)	e;
1226 };
1227 
1228 static void extent_ends_reset(struct extent_ends *extent_ends)
1229 {
1230 	darray_for_each(extent_ends->e, i)
1231 		snapshots_seen_exit(&i->seen);
1232 	extent_ends->e.nr = 0;
1233 }
1234 
1235 static void extent_ends_exit(struct extent_ends *extent_ends)
1236 {
1237 	extent_ends_reset(extent_ends);
1238 	darray_exit(&extent_ends->e);
1239 }
1240 
1241 static void extent_ends_init(struct extent_ends *extent_ends)
1242 {
1243 	memset(extent_ends, 0, sizeof(*extent_ends));
1244 }
1245 
1246 static int extent_ends_at(struct bch_fs *c,
1247 			  struct extent_ends *extent_ends,
1248 			  struct snapshots_seen *seen,
1249 			  struct bkey_s_c k)
1250 {
1251 	struct extent_end *i, n = (struct extent_end) {
1252 		.offset		= k.k->p.offset,
1253 		.snapshot	= k.k->p.snapshot,
1254 		.seen		= *seen,
1255 	};
1256 
1257 	n.seen.ids.data = kmemdup(seen->ids.data,
1258 			      sizeof(seen->ids.data[0]) * seen->ids.size,
1259 			      GFP_KERNEL);
1260 	if (!n.seen.ids.data)
1261 		return -BCH_ERR_ENOMEM_fsck_extent_ends_at;
1262 
1263 	__darray_for_each(extent_ends->e, i) {
1264 		if (i->snapshot == k.k->p.snapshot) {
1265 			snapshots_seen_exit(&i->seen);
1266 			*i = n;
1267 			return 0;
1268 		}
1269 
1270 		if (i->snapshot >= k.k->p.snapshot)
1271 			break;
1272 	}
1273 
1274 	return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
1275 }
1276 
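/*
 * Called when an extent is found that starts before a previously seen extent
 * ended in some visible snapshot: re-look both keys up with non-extent
 * iterators to get their true positions, then repair by overwriting the
 * extent in the ancestor (older) snapshot - or the first of the two, if both
 * are in the same snapshot - with the other one.
 */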
1277 static int overlapping_extents_found(struct btree_trans *trans,
1278 				     enum btree_id btree,
1279 				     struct bpos pos1, struct snapshots_seen *pos1_seen,
1280 				     struct bkey pos2,
1281 				     bool *fixed,
1282 				     struct extent_end *extent_end)
1283 {
1284 	struct bch_fs *c = trans->c;
1285 	struct printbuf buf = PRINTBUF;
1286 	struct btree_iter iter1, iter2 = { NULL };
1287 	struct bkey_s_c k1, k2;
1288 	int ret;
1289 
1290 	BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));
1291 
1292 	bch2_trans_iter_init(trans, &iter1, btree, pos1,
1293 			     BTREE_ITER_all_snapshots|
1294 			     BTREE_ITER_not_extents);
1295 	k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX));
1296 	ret = bkey_err(k1);
1297 	if (ret)
1298 		goto err;
1299 
1300 	prt_str(&buf, "\n  ");
1301 	bch2_bkey_val_to_text(&buf, c, k1);
1302 
1303 	if (!bpos_eq(pos1, k1.k->p)) {
1304 		prt_str(&buf, "\n  wanted\n  ");
1305 		bch2_bpos_to_text(&buf, pos1);
1306 		prt_str(&buf, "\n  ");
1307 		bch2_bkey_to_text(&buf, &pos2);
1308 
1309 		bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
1310 			__func__, buf.buf);
1311 		ret = -BCH_ERR_internal_fsck_err;
1312 		goto err;
1313 	}
1314 
1315 	bch2_trans_copy_iter(&iter2, &iter1);
1316 
1317 	while (1) {
1318 		bch2_btree_iter_advance(&iter2);
1319 
1320 		k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX));
1321 		ret = bkey_err(k2);
1322 		if (ret)
1323 			goto err;
1324 
1325 		if (bpos_ge(k2.k->p, pos2.p))
1326 			break;
1327 	}
1328 
1329 	prt_str(&buf, "\n  ");
1330 	bch2_bkey_val_to_text(&buf, c, k2);
1331 
1332 	if (bpos_gt(k2.k->p, pos2.p) ||
1333 	    pos2.size != k2.k->size) {
1334 		bch_err(c, "%s: error finding seconding overlapping extent when repairing%s",
1335 			__func__, buf.buf);
1336 		ret = -BCH_ERR_internal_fsck_err;
1337 		goto err;
1338 	}
1339 
1340 	prt_printf(&buf, "\n  overwriting %s extent",
1341 		   pos1.snapshot >= pos2.p.snapshot ? "first" : "second");
1342 
1343 	if (fsck_err(c, extent_overlapping,
1344 		     "overlapping extents%s", buf.buf)) {
1345 		struct btree_iter *old_iter = &iter1;
1346 		struct disk_reservation res = { 0 };
1347 
1348 		if (pos1.snapshot < pos2.p.snapshot) {
1349 			old_iter = &iter2;
1350 			swap(k1, k2);
1351 		}
1352 
1353 		trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);
1354 
1355 		ret =   bch2_trans_update_extent_overwrite(trans, old_iter,
1356 				BTREE_UPDATE_internal_snapshot_node,
1357 				k1, k2) ?:
1358 			bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
1359 		bch2_disk_reservation_put(c, &res);
1360 
1361 		if (ret)
1362 			goto err;
1363 
1364 		*fixed = true;
1365 
1366 		if (pos1.snapshot == pos2.p.snapshot) {
1367 			/*
1368 			 * We overwrote the first extent, and did the overwrite
1369 			 * in the same snapshot:
1370 			 */
1371 			extent_end->offset = bkey_start_offset(&pos2);
1372 		} else if (pos1.snapshot > pos2.p.snapshot) {
1373 			/*
1374 			 * We overwrote the first extent in pos2's snapshot:
1375 			 */
1376 			ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
1377 		} else {
1378 			/*
1379 			 * We overwrote the second extent - restart
1380 			 * check_extent() from the top:
1381 			 */
1382 			ret = -BCH_ERR_transaction_restart_nested;
1383 		}
1384 	}
1385 fsck_err:
1386 err:
1387 	bch2_trans_iter_exit(trans, &iter2);
1388 	bch2_trans_iter_exit(trans, &iter1);
1389 	printbuf_exit(&buf);
1390 	return ret;
1391 }
1392 
1393 static int check_overlapping_extents(struct btree_trans *trans,
1394 			      struct snapshots_seen *seen,
1395 			      struct extent_ends *extent_ends,
1396 			      struct bkey_s_c k,
1397 			      struct btree_iter *iter,
1398 			      bool *fixed)
1399 {
1400 	struct bch_fs *c = trans->c;
1401 	int ret = 0;
1402 
1403 	/* transaction restart, running again */
1404 	if (bpos_eq(extent_ends->last_pos, k.k->p))
1405 		return 0;
1406 
1407 	if (extent_ends->last_pos.inode != k.k->p.inode)
1408 		extent_ends_reset(extent_ends);
1409 
1410 	darray_for_each(extent_ends->e, i) {
1411 		if (i->offset <= bkey_start_offset(k.k))
1412 			continue;
1413 
1414 		if (!ref_visible2(c,
1415 				  k.k->p.snapshot, seen,
1416 				  i->snapshot, &i->seen))
1417 			continue;
1418 
1419 		ret = overlapping_extents_found(trans, iter->btree_id,
1420 						SPOS(iter->pos.inode,
1421 						     i->offset,
1422 						     i->snapshot),
1423 						&i->seen,
1424 						*k.k, fixed, i);
1425 		if (ret)
1426 			goto err;
1427 	}
1428 
1429 	extent_ends->last_pos = k.k->p;
1430 err:
1431 	return ret;
1432 }
1433 
1434 static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
1435 				struct bkey_s_c k)
1436 {
1437 	struct bch_fs *c = trans->c;
1438 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1439 	struct bch_extent_crc_unpacked crc;
1440 	const union bch_extent_entry *i;
1441 	unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;
1442 
1443 	bkey_for_each_crc(k.k, ptrs, crc, i)
1444 		if (crc_is_encoded(crc) &&
1445 		    crc.uncompressed_size > encoded_extent_max_sectors) {
1446 			struct printbuf buf = PRINTBUF;
1447 
1448 			bch2_bkey_val_to_text(&buf, c, k);
1449 			bch_err(c, "overbig encoded extent, please report this:\n  %s", buf.buf);
1450 			printbuf_exit(&buf);
1451 		}
1452 
1453 	return 0;
1454 }
1455 
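/*
 * Per-extent checks: the extent must belong to a regular file or symlink inode
 * visible in its snapshot (reconstructed if the inodes btree was lost), must
 * not overlap other visible extents, and must not extend past i_size unless
 * i_size_dirty is set or it's a reservation; its size is accumulated into the
 * matching inode versions' i_sectors counts.
 */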
1456 static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
1457 			struct bkey_s_c k,
1458 			struct inode_walker *inode,
1459 			struct snapshots_seen *s,
1460 			struct extent_ends *extent_ends)
1461 {
1462 	struct bch_fs *c = trans->c;
1463 	struct inode_walker_entry *i;
1464 	struct printbuf buf = PRINTBUF;
1465 	int ret = 0;
1466 
1467 	ret = bch2_check_key_has_snapshot(trans, iter, k);
1468 	if (ret) {
1469 		ret = ret < 0 ? ret : 0;
1470 		goto out;
1471 	}
1472 
1473 	if (inode->last_pos.inode != k.k->p.inode) {
1474 		ret = check_i_sectors(trans, inode);
1475 		if (ret)
1476 			goto err;
1477 	}
1478 
1479 	i = walk_inode(trans, inode, k);
1480 	ret = PTR_ERR_OR_ZERO(i);
1481 	if (ret)
1482 		goto err;
1483 
1484 	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
1485 	if (ret)
1486 		goto err;
1487 
1488 	if (k.k->type != KEY_TYPE_whiteout) {
1489 		if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
1490 			ret =   reconstruct_reg_inode(trans, k.k->p.snapshot, k.k->p.inode) ?:
1491 				bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
1492 			if (ret)
1493 				goto err;
1494 
1495 			inode->last_pos.inode--;
1496 			ret = -BCH_ERR_transaction_restart_nested;
1497 			goto err;
1498 		}
1499 
1500 		if (fsck_err_on(!i, c, extent_in_missing_inode,
1501 				"extent in missing inode:\n  %s",
1502 				(printbuf_reset(&buf),
1503 				 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1504 			goto delete;
1505 
1506 		if (fsck_err_on(i &&
1507 				!S_ISREG(i->inode.bi_mode) &&
1508 				!S_ISLNK(i->inode.bi_mode),
1509 				c, extent_in_non_reg_inode,
1510 				"extent in non regular inode mode %o:\n  %s",
1511 				i->inode.bi_mode,
1512 				(printbuf_reset(&buf),
1513 				 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1514 			goto delete;
1515 
1516 		ret = check_overlapping_extents(trans, s, extent_ends, k, iter,
1517 						&inode->recalculate_sums);
1518 		if (ret)
1519 			goto err;
1520 	}
1521 
1522 	/*
1523 	 * Check inodes in reverse order, from oldest snapshots to newest,
1524 	 * starting from the inode that matches this extent's snapshot. If we
1525 	 * didn't have one, iterate over all inodes:
1526 	 */
1527 	if (!i)
1528 		i = inode->inodes.data + inode->inodes.nr - 1;
1529 
1530 	for (;
1531 	     inode->inodes.data && i >= inode->inodes.data;
1532 	     --i) {
1533 		if (i->snapshot > k.k->p.snapshot ||
1534 		    !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
1535 			continue;
1536 
1537 		if (k.k->type != KEY_TYPE_whiteout) {
1538 			if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) &&
1539 					k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
1540 					!bkey_extent_is_reservation(k),
1541 					c, extent_past_end_of_inode,
1542 					"extent type past end of inode %llu:%u, i_size %llu\n  %s",
1543 					i->inode.bi_inum, i->snapshot, i->inode.bi_size,
1544 					(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1545 				struct btree_iter iter2;
1546 
1547 				bch2_trans_copy_iter(&iter2, iter);
1548 				bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
1549 				ret =   bch2_btree_iter_traverse(&iter2) ?:
1550 					bch2_btree_delete_at(trans, &iter2,
1551 						BTREE_UPDATE_internal_snapshot_node);
1552 				bch2_trans_iter_exit(trans, &iter2);
1553 				if (ret)
1554 					goto err;
1555 
1556 				iter->k.type = KEY_TYPE_whiteout;
1557 			}
1558 
1559 			if (bkey_extent_is_allocation(k.k))
1560 				i->count += k.k->size;
1561 		}
1562 
1563 		i->seen_this_pos = true;
1564 	}
1565 
1566 	if (k.k->type != KEY_TYPE_whiteout) {
1567 		ret = extent_ends_at(c, extent_ends, s, k);
1568 		if (ret)
1569 			goto err;
1570 	}
1571 out:
1572 err:
1573 fsck_err:
1574 	printbuf_exit(&buf);
1575 	bch_err_fn(c, ret);
1576 	return ret;
1577 delete:
1578 	ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
1579 	goto out;
1580 }
1581 
1582 /*
1583  * Walk extents: verify that extents have a corresponding S_ISREG inode, and
1584  * that i_size and i_sectors are consistent
1585  */
1586 int bch2_check_extents(struct bch_fs *c)
1587 {
1588 	struct inode_walker w = inode_walker_init();
1589 	struct snapshots_seen s;
1590 	struct extent_ends extent_ends;
1591 	struct disk_reservation res = { 0 };
1592 
1593 	snapshots_seen_init(&s);
1594 	extent_ends_init(&extent_ends);
1595 
1596 	int ret = bch2_trans_run(c,
1597 		for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
1598 				POS(BCACHEFS_ROOT_INO, 0),
1599 				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
1600 				&res, NULL,
1601 				BCH_TRANS_COMMIT_no_enospc, ({
1602 			bch2_disk_reservation_put(c, &res);
1603 			check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
1604 			check_extent_overbig(trans, &iter, k);
1605 		})) ?:
1606 		check_i_sectors_notnested(trans, &w));
1607 
1608 	bch2_disk_reservation_put(c, &res);
1609 	extent_ends_exit(&extent_ends);
1610 	inode_walker_exit(&w);
1611 	snapshots_seen_exit(&s);
1612 
1613 	bch_err_fn(c, ret);
1614 	return ret;
1615 }
1616 
1617 int bch2_check_indirect_extents(struct bch_fs *c)
1618 {
1619 	struct disk_reservation res = { 0 };
1620 
1621 	int ret = bch2_trans_run(c,
1622 		for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
1623 				POS_MIN,
1624 				BTREE_ITER_prefetch, k,
1625 				&res, NULL,
1626 				BCH_TRANS_COMMIT_no_enospc, ({
1627 			bch2_disk_reservation_put(c, &res);
1628 			check_extent_overbig(trans, &iter, k);
1629 		})));
1630 
1631 	bch2_disk_reservation_put(c, &res);
1632 	bch_err_fn(c, ret);
1633 	return ret;
1634 }
1635 
1636 static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w)
1637 {
1638 	struct bch_fs *c = trans->c;
1639 	int ret = 0;
1640 	s64 count2;
1641 
1642 	darray_for_each(w->inodes, i) {
1643 		if (i->inode.bi_nlink == i->count)
1644 			continue;
1645 
1646 		count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot);
1647 		if (count2 < 0)
1648 			return count2;
1649 
1650 		if (i->count != count2) {
1651 			bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
1652 					    w->last_pos.inode, i->snapshot, i->count, count2);
1653 			i->count = count2;
1654 			if (i->inode.bi_nlink == i->count)
1655 				continue;
1656 		}
1657 
1658 		if (fsck_err_on(i->inode.bi_nlink != i->count,
1659 				c, inode_dir_wrong_nlink,
1660 				"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
1661 				w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
1662 			i->inode.bi_nlink = i->count;
1663 			ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
1664 			if (ret)
1665 				break;
1666 		}
1667 	}
1668 fsck_err:
1669 	bch_err_fn(c, ret);
1670 	return ret;
1671 }
1672 
1673 static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
1674 {
1675 	u32 restart_count = trans->restart_count;
1676 	return check_subdir_count_notnested(trans, w) ?:
1677 		trans_was_restarted(trans, restart_count);
1678 }
1679 
1680 static int check_dirent_inode_dirent(struct btree_trans *trans,
1681 				   struct btree_iter *iter,
1682 				   struct bkey_s_c_dirent d,
1683 				   struct bch_inode_unpacked *target,
1684 				   u32 target_snapshot)
1685 {
1686 	struct bch_fs *c = trans->c;
1687 	struct printbuf buf = PRINTBUF;
1688 	int ret = 0;
1689 
1690 	if (inode_points_to_dirent(target, d))
1691 		return 0;
1692 
1693 	if (bch2_inode_should_have_bp(target) &&
1694 	    !fsck_err(c, inode_wrong_backpointer,
1695 		      "dirent points to inode that does not point back:\n  %s",
1696 		      (bch2_bkey_val_to_text(&buf, c, d.s_c),
1697 		       prt_printf(&buf, "\n  "),
1698 		       bch2_inode_unpacked_to_text(&buf, target),
1699 		       buf.buf)))
1700 		goto out_noiter;
1701 
1702 	if (!target->bi_dir &&
1703 	    !target->bi_dir_offset) {
1704 		target->bi_dir		= d.k->p.inode;
1705 		target->bi_dir_offset	= d.k->p.offset;
1706 		return __bch2_fsck_write_inode(trans, target, target_snapshot);
1707 	}
1708 
1709 	struct btree_iter bp_iter = { NULL };
1710 	struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
1711 			      SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
1712 	ret = bkey_err(bp_dirent);
1713 	if (ret && !bch2_err_matches(ret, ENOENT))
1714 		goto err;
1715 
1716 	bool backpointer_exists = !ret;
1717 	ret = 0;
1718 
1719 	if (fsck_err_on(!backpointer_exists,
1720 			c, inode_wrong_backpointer,
1721 			"inode %llu:%u has wrong backpointer:\n"
1722 			"got       %llu:%llu\n"
1723 			"should be %llu:%llu",
1724 			target->bi_inum, target_snapshot,
1725 			target->bi_dir,
1726 			target->bi_dir_offset,
1727 			d.k->p.inode,
1728 			d.k->p.offset)) {
1729 		target->bi_dir		= d.k->p.inode;
1730 		target->bi_dir_offset	= d.k->p.offset;
1731 		ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
1732 		goto out;
1733 	}
1734 
1735 	bch2_bkey_val_to_text(&buf, c, d.s_c);
1736 	prt_newline(&buf);
1737 	if (backpointer_exists)
1738 		bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
1739 
1740 	if (fsck_err_on(backpointer_exists &&
1741 			(S_ISDIR(target->bi_mode) ||
1742 			 target->bi_subvol),
1743 			c, inode_dir_multiple_links,
1744 			"%s %llu:%u with multiple links\n%s",
1745 			S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
1746 			target->bi_inum, target_snapshot, buf.buf)) {
1747 		ret = __remove_dirent(trans, d.k->p);
1748 		goto out;
1749 	}
1750 
1751 	/*
1752 	 * hardlinked file with nlink 0:
1753 	 * We're just adjusting nlink here so check_nlinks() will pick
1754 	 * it up; it ignores inodes with nlink 0
1755 	 */
1756 	if (fsck_err_on(backpointer_exists && !target->bi_nlink,
1757 			c, inode_multiple_links_but_nlink_0,
1758 			"inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
1759 			target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
1760 		target->bi_nlink++;
1761 		target->bi_flags &= ~BCH_INODE_unlinked;
1762 		ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
1763 		if (ret)
1764 			goto err;
1765 	}
1766 out:
1767 err:
1768 fsck_err:
1769 	bch2_trans_iter_exit(trans, &bp_iter);
1770 out_noiter:
1771 	printbuf_exit(&buf);
1772 	bch_err_fn(c, ret);
1773 	return ret;
1774 }
1775 
1776 static int check_dirent_target(struct btree_trans *trans,
1777 			       struct btree_iter *iter,
1778 			       struct bkey_s_c_dirent d,
1779 			       struct bch_inode_unpacked *target,
1780 			       u32 target_snapshot)
1781 {
1782 	struct bch_fs *c = trans->c;
1783 	struct bkey_i_dirent *n;
1784 	struct printbuf buf = PRINTBUF;
1785 	int ret = 0;
1786 
1787 	ret = check_dirent_inode_dirent(trans, iter, d, target, target_snapshot);
1788 	if (ret)
1789 		goto err;
1790 
1791 	if (fsck_err_on(d.v->d_type != inode_d_type(target),
1792 			c, dirent_d_type_wrong,
1793 			"incorrect d_type: got %s, should be %s:\n%s",
1794 			bch2_d_type_str(d.v->d_type),
1795 			bch2_d_type_str(inode_d_type(target)),
1796 			(printbuf_reset(&buf),
1797 			 bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
1798 		n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
1799 		ret = PTR_ERR_OR_ZERO(n);
1800 		if (ret)
1801 			goto err;
1802 
1803 		bkey_reassemble(&n->k_i, d.s_c);
1804 		n->v.d_type = inode_d_type(target);
1805 		if (n->v.d_type == DT_SUBVOL) {
1806 			n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
1807 			n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
1808 		} else {
1809 			n->v.d_inum = cpu_to_le64(target->bi_inum);
1810 		}
1811 
1812 		ret = bch2_trans_update(trans, iter, &n->k_i, 0);
1813 		if (ret)
1814 			goto err;
1815 
1816 		d = dirent_i_to_s_c(n);
1817 	}
1818 err:
1819 fsck_err:
1820 	printbuf_exit(&buf);
1821 	bch_err_fn(c, ret);
1822 	return ret;
1823 }
1824 
1825 /* find a subvolume that's a descendant of @snapshot: */
1826 static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
1827 {
1828 	struct btree_iter iter;
1829 	struct bkey_s_c k;
1830 	int ret;
1831 
1832 	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) {
1833 		if (k.k->type != KEY_TYPE_subvolume)
1834 			continue;
1835 
1836 		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
		if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) {
			*subvolid = k.k->p.offset;
			goto found;
		}
1842 	}
1843 	if (!ret)
1844 		ret = -ENOENT;
1845 found:
1846 	bch2_trans_iter_exit(trans, &iter);
1847 	return ret;
1848 }
1849 
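/*
 * A DT_SUBVOL dirent names both a parent and a child subvolume: check that the
 * parent subvolume exists and is visible from the dirent's snapshot (repairing
 * d_parent_subvol if not), that the child subvolume exists, that its
 * fs_path_parent points back at the parent, and that the subvolume root
 * inode's bi_parent_subvol agrees:
 */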
1850 static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
1851 				  struct bkey_s_c_dirent d)
1852 {
1853 	struct bch_fs *c = trans->c;
1854 	struct btree_iter subvol_iter = {};
1855 	struct bch_inode_unpacked subvol_root;
1856 	u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
1857 	u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
1858 	u32 parent_snapshot;
1859 	u32 new_parent_subvol = 0;
1860 	u64 parent_inum;
1861 	struct printbuf buf = PRINTBUF;
1862 	int ret = 0;
1863 
1864 	ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum);
1865 	if (ret && !bch2_err_matches(ret, ENOENT))
1866 		return ret;
1867 
	if (ret ||
	    !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot)) {
		int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
		if (ret2 && !bch2_err_matches(ret2, ENOENT))
			return ret2;
1873 	}
1874 
1875 	if (ret &&
1876 	    !new_parent_subvol &&
1877 	    (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
1878 		/*
1879 		 * Couldn't find a subvol for dirent's snapshot - but we lost
1880 		 * subvols, so we need to reconstruct:
1881 		 */
1882 		ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0);
1883 		if (ret)
1884 			return ret;
1885 
1886 		parent_snapshot = d.k->p.snapshot;
1887 	}
1888 
1889 	if (fsck_err_on(ret, c, dirent_to_missing_parent_subvol,
1890 			"dirent parent_subvol points to missing subvolume\n%s",
1891 			(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
1892 	    fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
1893 			c, dirent_not_visible_in_parent_subvol,
1894 			"dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
1895 			parent_snapshot,
1896 			(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
		if (!new_parent_subvol) {
			bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
			ret = -BCH_ERR_fsck_repair_unimplemented;
			goto err;
		}
1901 
1902 		struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
1903 		ret = PTR_ERR_OR_ZERO(new_dirent);
1904 		if (ret)
1905 			goto err;
1906 
1907 		new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol);
1908 	}
1909 
1910 	struct bkey_s_c_subvolume s =
1911 		bch2_bkey_get_iter_typed(trans, &subvol_iter,
1912 					 BTREE_ID_subvolumes, POS(0, target_subvol),
1913 					 0, subvolume);
1914 	ret = bkey_err(s.s_c);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;
1917 
	if (ret) {
		if (fsck_err(c, dirent_to_missing_subvol,
			     "dirent points to missing subvolume\n%s",
			     (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
			ret = __remove_dirent(trans, d.k->p);
			goto err;
		}
		ret = 0;
		goto out;
	}
1926 
1927 	if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol,
1928 			c, subvol_fs_path_parent_wrong,
			"subvol with wrong fs_path_parent, should be %u\n%s",
1930 			parent_subvol,
1931 			(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
1932 		struct bkey_i_subvolume *n =
1933 			bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
1934 		ret = PTR_ERR_OR_ZERO(n);
1935 		if (ret)
1936 			goto err;
1937 
1938 		n->v.fs_path_parent = cpu_to_le32(parent_subvol);
1939 	}
1940 
1941 	u64 target_inum = le64_to_cpu(s.v->inode);
1942 	u32 target_snapshot = le32_to_cpu(s.v->snapshot);
1943 
1944 	ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot);
1945 	if (ret && !bch2_err_matches(ret, ENOENT))
1946 		goto err;
1947 
	if (ret) {
		/* repair (recreating the subvolume root inode) is unimplemented */
		bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
		ret = 0;
		goto err;
	}
1954 
1955 	if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol,
1956 			c, inode_bi_parent_wrong,
1957 			"subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
1958 			target_inum,
1959 			subvol_root.bi_parent_subvol, parent_subvol)) {
1960 		subvol_root.bi_parent_subvol = parent_subvol;
1961 		ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot);
1962 		if (ret)
1963 			goto err;
1964 	}
1965 
1966 	ret = check_dirent_target(trans, iter, d, &subvol_root,
1967 				  target_snapshot);
1968 	if (ret)
1969 		goto err;
1970 out:
1971 err:
1972 fsck_err:
1973 	bch2_trans_iter_exit(trans, &subvol_iter);
1974 	printbuf_exit(&buf);
1975 	return ret;
1976 }
1977 
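/*
 * Check a single dirent: the key's snapshot must exist, the parent directory
 * inode must exist and be a directory, the hash must be correct, and the
 * target inode or subvolume must exist; DT_DIR dirents are also counted
 * towards the parent directory's subdirectory count:
 */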
1978 static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
1979 			struct bkey_s_c k,
1980 			struct bch_hash_info *hash_info,
1981 			struct inode_walker *dir,
1982 			struct inode_walker *target,
1983 			struct snapshots_seen *s)
1984 {
1985 	struct bch_fs *c = trans->c;
1986 	struct inode_walker_entry *i;
1987 	struct printbuf buf = PRINTBUF;
1988 	int ret = 0;
1989 
1990 	ret = bch2_check_key_has_snapshot(trans, iter, k);
1991 	if (ret) {
1992 		ret = ret < 0 ? ret : 0;
1993 		goto out;
1994 	}
1995 
1996 	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
1997 	if (ret)
1998 		goto err;
1999 
2000 	if (k.k->type == KEY_TYPE_whiteout)
2001 		goto out;
2002 
2003 	if (dir->last_pos.inode != k.k->p.inode) {
2004 		ret = check_subdir_count(trans, dir);
2005 		if (ret)
2006 			goto err;
2007 	}
2008 
2009 	BUG_ON(!btree_iter_path(trans, iter)->should_be_locked);
2010 
2011 	i = walk_inode(trans, dir, k);
2012 	ret = PTR_ERR_OR_ZERO(i);
2013 	if (ret < 0)
2014 		goto err;
2015 
2016 	if (dir->first_this_inode && dir->inodes.nr)
2017 		*hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
2018 	dir->first_this_inode = false;
2019 
2020 	if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
2021 		ret =   reconstruct_inode(trans, k.k->p.snapshot, k.k->p.inode, 0, S_IFDIR) ?:
2022 			bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
2023 		if (ret)
2024 			goto err;
2025 
2026 		dir->last_pos.inode--;
2027 		ret = -BCH_ERR_transaction_restart_nested;
2028 		goto err;
2029 	}
2030 
2031 	if (fsck_err_on(!i, c, dirent_in_missing_dir_inode,
			"dirent in nonexistent directory:\n%s",
2033 			(printbuf_reset(&buf),
2034 			 bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
2035 		ret = bch2_btree_delete_at(trans, iter,
2036 				BTREE_UPDATE_internal_snapshot_node);
2037 		goto out;
2038 	}
2039 
2040 	if (!i)
2041 		goto out;
2042 
2043 	if (fsck_err_on(!S_ISDIR(i->inode.bi_mode),
2044 			c, dirent_in_non_dir_inode,
			"dirent in non-directory inode type %s:\n%s",
2046 			bch2_d_type_str(inode_d_type(&i->inode)),
2047 			(printbuf_reset(&buf),
2048 			 bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
2049 		ret = bch2_btree_delete_at(trans, iter, 0);
2050 		goto out;
2051 	}
2052 
2053 	ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k);
2054 	if (ret < 0)
2055 		goto err;
2056 	if (ret) {
2057 		/* dirent has been deleted */
2058 		ret = 0;
2059 		goto out;
2060 	}
2061 
2062 	if (k.k->type != KEY_TYPE_dirent)
2063 		goto out;
2064 
2065 	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
2066 
2067 	if (d.v->d_type == DT_SUBVOL) {
2068 		ret = check_dirent_to_subvol(trans, iter, d);
2069 		if (ret)
2070 			goto err;
2071 	} else {
2072 		ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
2073 		if (ret)
2074 			goto err;
2075 
2076 		if (fsck_err_on(!target->inodes.nr,
2077 				c, dirent_to_missing_inode,
2078 				"dirent points to missing inode:\n%s",
2079 				(printbuf_reset(&buf),
2080 				 bch2_bkey_val_to_text(&buf, c, k),
2081 				 buf.buf))) {
2082 			ret = __remove_dirent(trans, d.k->p);
2083 			if (ret)
2084 				goto err;
2085 		}
2086 
2087 		darray_for_each(target->inodes, i) {
2088 			ret = check_dirent_target(trans, iter, d,
2089 						  &i->inode, i->snapshot);
2090 			if (ret)
2091 				goto err;
2092 		}
2093 
2094 		if (d.v->d_type == DT_DIR)
2095 			for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
2096 				i->count++;
2097 	}
2098 out:
2099 err:
2100 fsck_err:
2101 	printbuf_exit(&buf);
2102 	bch_err_fn(c, ret);
2103 	return ret;
2104 }
2105 
/*
 * Walk dirents: verify that each one lives in an existing S_ISDIR inode and
 * points to an existing inode or subvolume, and validate d_type:
 */
2110 int bch2_check_dirents(struct bch_fs *c)
2111 {
2112 	struct inode_walker dir = inode_walker_init();
2113 	struct inode_walker target = inode_walker_init();
2114 	struct snapshots_seen s;
2115 	struct bch_hash_info hash_info;
2116 
2117 	snapshots_seen_init(&s);
2118 
2119 	int ret = bch2_trans_run(c,
2120 		for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
2121 				POS(BCACHEFS_ROOT_INO, 0),
2122 				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
2123 				k,
2124 				NULL, NULL,
2125 				BCH_TRANS_COMMIT_no_enospc,
2126 			check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
2127 		check_subdir_count_notnested(trans, &dir));
2128 
2129 	snapshots_seen_exit(&s);
2130 	inode_walker_exit(&dir);
2131 	inode_walker_exit(&target);
2132 	bch_err_fn(c, ret);
2133 	return ret;
2134 }
2135 
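/*
 * Check a single xattr: the key's snapshot must exist, the inode it belongs to
 * must exist, and the hash must be correct:
 */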
2136 static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
2137 		       struct bkey_s_c k,
2138 		       struct bch_hash_info *hash_info,
2139 		       struct inode_walker *inode)
2140 {
2141 	struct bch_fs *c = trans->c;
2142 	struct inode_walker_entry *i;
2143 	int ret;
2144 
2145 	ret = bch2_check_key_has_snapshot(trans, iter, k);
2146 	if (ret < 0)
2147 		return ret;
2148 	if (ret)
2149 		return 0;
2150 
2151 	i = walk_inode(trans, inode, k);
2152 	ret = PTR_ERR_OR_ZERO(i);
2153 	if (ret)
2154 		return ret;
2155 
2156 	if (inode->first_this_inode && inode->inodes.nr)
2157 		*hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode);
2158 	inode->first_this_inode = false;
2159 
2160 	if (fsck_err_on(!i, c, xattr_in_missing_inode,
2161 			"xattr for missing inode %llu",
2162 			k.k->p.inode))
2163 		return bch2_btree_delete_at(trans, iter, 0);
2164 
2165 	if (!i)
2166 		return 0;
2167 
2168 	ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
2169 fsck_err:
2170 	bch_err_fn(c, ret);
2171 	return ret;
2172 }
2173 
2174 /*
2175  * Walk xattrs: verify that they all have a corresponding inode
2176  */
2177 int bch2_check_xattrs(struct bch_fs *c)
2178 {
2179 	struct inode_walker inode = inode_walker_init();
2180 	struct bch_hash_info hash_info;
2181 	int ret = 0;
2182 
2183 	ret = bch2_trans_run(c,
2184 		for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
2185 			POS(BCACHEFS_ROOT_INO, 0),
2186 			BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
2187 			k,
2188 			NULL, NULL,
2189 			BCH_TRANS_COMMIT_no_enospc,
2190 		check_xattr(trans, &iter, k, &hash_info, &inode)));
2191 	bch_err_fn(c, ret);
2192 	return ret;
2193 }
2194 
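/*
 * Check that the root subvolume and the root directory inode exist, creating
 * them if they're missing or invalid:
 */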
2195 static int check_root_trans(struct btree_trans *trans)
2196 {
2197 	struct bch_fs *c = trans->c;
2198 	struct bch_inode_unpacked root_inode;
2199 	u32 snapshot;
2200 	u64 inum;
2201 	int ret;
2202 
2203 	ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
2204 	if (ret && !bch2_err_matches(ret, ENOENT))
2205 		return ret;
2206 
2207 	if (mustfix_fsck_err_on(ret, c, root_subvol_missing,
2208 				"root subvol missing")) {
2209 		struct bkey_i_subvolume *root_subvol =
2210 			bch2_trans_kmalloc(trans, sizeof(*root_subvol));
2211 		ret = PTR_ERR_OR_ZERO(root_subvol);
2212 		if (ret)
2213 			goto err;
2214 
2215 		snapshot	= U32_MAX;
2216 		inum		= BCACHEFS_ROOT_INO;
2217 
2218 		bkey_subvolume_init(&root_subvol->k_i);
2219 		root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL;
2220 		root_subvol->v.flags	= 0;
2221 		root_subvol->v.snapshot	= cpu_to_le32(snapshot);
2222 		root_subvol->v.inode	= cpu_to_le64(inum);
2223 		ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0);
2224 		bch_err_msg(c, ret, "writing root subvol");
2225 		if (ret)
2226 			goto err;
2227 	}
2228 
2229 	ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
2230 	if (ret && !bch2_err_matches(ret, ENOENT))
2231 		return ret;
2232 
2233 	if (mustfix_fsck_err_on(ret, c, root_dir_missing,
2234 				"root directory missing") ||
2235 	    mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
2236 				c, root_inode_not_dir,
2237 				"root inode not a directory")) {
2238 		bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
2239 				0, NULL);
2240 		root_inode.bi_inum = inum;
2241 
2242 		ret = __bch2_fsck_write_inode(trans, &root_inode, snapshot);
2243 		bch_err_msg(c, ret, "writing root inode");
2244 	}
2245 err:
2246 fsck_err:
2247 	return ret;
2248 }
2249 
2250 /* Get root directory, create if it doesn't exist: */
2251 int bch2_check_root(struct bch_fs *c)
2252 {
2253 	int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2254 		check_root_trans(trans));
2255 	bch_err_fn(c, ret);
2256 	return ret;
2257 }
2258 
2259 typedef DARRAY(u32) darray_u32;
2260 
2261 static bool darray_u32_has(darray_u32 *d, u32 v)
2262 {
2263 	darray_for_each(*d, i)
2264 		if (*i == v)
2265 			return true;
2266 	return false;
2267 }
2268 
2269 /*
2270  * We've checked that inode backpointers point to valid dirents; here, it's
2271  * sufficient to check that the subvolume root has a dirent:
2272  */
2273 static int subvol_has_dirent(struct btree_trans *trans, struct bkey_s_c_subvolume s)
2274 {
2275 	struct bch_inode_unpacked inode;
2276 	int ret = bch2_inode_find_by_inum_trans(trans,
2277 				(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
2278 				&inode);
2279 	if (ret)
2280 		return ret;
2281 
2282 	return inode.bi_dir != 0;
2283 }
2284 
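/*
 * Walk from @k up to the root subvolume via fs_path_parent, checking that each
 * subvolume on the way is reachable (i.e. its root inode has a dirent) and
 * that the chain doesn't loop, reattaching subvolumes that fail these checks:
 */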
2285 static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
2286 {
2287 	struct bch_fs *c = trans->c;
2288 	struct btree_iter parent_iter = {};
2289 	darray_u32 subvol_path = {};
2290 	struct printbuf buf = PRINTBUF;
2291 	int ret = 0;
2292 
2293 	if (k.k->type != KEY_TYPE_subvolume)
2294 		return 0;
2295 
2296 	while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) {
2297 		ret = darray_push(&subvol_path, k.k->p.offset);
2298 		if (ret)
2299 			goto err;
2300 
2301 		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
2302 
2303 		ret = subvol_has_dirent(trans, s);
2304 		if (ret < 0)
2305 			break;
2306 
2307 		if (fsck_err_on(!ret,
2308 				c, subvol_unreachable,
2309 				"unreachable subvolume %s",
2310 				(bch2_bkey_val_to_text(&buf, c, s.s_c),
2311 				 buf.buf))) {
2312 			ret = reattach_subvol(trans, s);
2313 			break;
2314 		}
2315 
2316 		u32 parent = le32_to_cpu(s.v->fs_path_parent);
2317 
2318 		if (darray_u32_has(&subvol_path, parent)) {
2319 			if (fsck_err(c, subvol_loop, "subvolume loop"))
2320 				ret = reattach_subvol(trans, s);
2321 			break;
2322 		}
2323 
2324 		bch2_trans_iter_exit(trans, &parent_iter);
2325 		bch2_trans_iter_init(trans, &parent_iter,
2326 				     BTREE_ID_subvolumes, POS(0, parent), 0);
2327 		k = bch2_btree_iter_peek_slot(&parent_iter);
2328 		ret = bkey_err(k);
2329 		if (ret)
2330 			goto err;
2331 
2332 		if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
2333 				c, subvol_unreachable,
2334 				"unreachable subvolume %s",
2335 				(bch2_bkey_val_to_text(&buf, c, s.s_c),
2336 				 buf.buf))) {
2337 			ret = reattach_subvol(trans, s);
2338 			break;
2339 		}
2340 	}
2341 fsck_err:
2342 err:
2343 	printbuf_exit(&buf);
2344 	darray_exit(&subvol_path);
2345 	bch2_trans_iter_exit(trans, &parent_iter);
2346 	return ret;
2347 }
2348 
2349 int bch2_check_subvolume_structure(struct bch_fs *c)
2350 {
2351 	int ret = bch2_trans_run(c,
2352 		for_each_btree_key_commit(trans, iter,
2353 				BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
2354 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2355 			check_subvol_path(trans, &iter, k)));
2356 	bch_err_fn(c, ret);
2357 	return ret;
2358 }
2359 
2360 struct pathbuf_entry {
2361 	u64	inum;
2362 	u32	snapshot;
2363 };
2364 
2365 typedef DARRAY(struct pathbuf_entry) pathbuf;
2366 
2367 static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
2368 {
2369 	darray_for_each(*p, i)
2370 		if (i->inum	== inum &&
2371 		    i->snapshot	== snapshot)
2372 			return true;
2373 	return false;
2374 }
2375 
2376 /*
2377  * Check that a given inode is reachable from its subvolume root - we already
2378  * verified subvolume connectivity:
2379  *
2380  * XXX: we should also be verifying that inodes are in the right subvolumes
2381  */
2382 static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k)
2383 {
2384 	struct bch_fs *c = trans->c;
2385 	struct btree_iter inode_iter = {};
2386 	struct bch_inode_unpacked inode;
2387 	struct printbuf buf = PRINTBUF;
2388 	u32 snapshot = inode_k.k->p.snapshot;
2389 	int ret = 0;
2390 
2391 	p->nr = 0;
2392 
2393 	BUG_ON(bch2_inode_unpack(inode_k, &inode));
2394 
2395 	while (!inode.bi_subvol) {
2396 		struct btree_iter dirent_iter;
2397 		struct bkey_s_c_dirent d;
2398 		u32 parent_snapshot = snapshot;
2399 
2400 		d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot);
2401 		ret = bkey_err(d.s_c);
2402 		if (ret && !bch2_err_matches(ret, ENOENT))
2403 			break;
2404 
2405 		if (!ret && !dirent_points_to_inode(d, &inode)) {
2406 			bch2_trans_iter_exit(trans, &dirent_iter);
2407 			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
2408 		}
2409 
2410 		if (bch2_err_matches(ret, ENOENT)) {
2411 			ret = 0;
2412 			if (fsck_err(c, inode_unreachable,
2413 				     "unreachable inode\n%s",
2414 				     (printbuf_reset(&buf),
2415 				      bch2_bkey_val_to_text(&buf, c, inode_k),
2416 				      buf.buf)))
2417 				ret = reattach_inode(trans, &inode, snapshot);
2418 			goto out;
2419 		}
2420 
2421 		bch2_trans_iter_exit(trans, &dirent_iter);
2422 
2423 		if (!S_ISDIR(inode.bi_mode))
2424 			break;
2425 
2426 		ret = darray_push(p, ((struct pathbuf_entry) {
2427 			.inum		= inode.bi_inum,
2428 			.snapshot	= snapshot,
2429 		}));
2430 		if (ret)
2431 			return ret;
2432 
2433 		snapshot = parent_snapshot;
2434 
2435 		bch2_trans_iter_exit(trans, &inode_iter);
2436 		inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
2437 					     SPOS(0, inode.bi_dir, snapshot), 0);
2438 		ret = bkey_err(inode_k) ?:
2439 			!bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
2440 			: bch2_inode_unpack(inode_k, &inode);
2441 		if (ret) {
2442 			/* Should have been caught in dirents pass */
2443 			if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
				bch_err(c, "error looking up parent directory: %s", bch2_err_str(ret));
2445 			break;
2446 		}
2447 
2448 		snapshot = inode_k.k->p.snapshot;
2449 
2450 		if (path_is_dup(p, inode.bi_inum, snapshot)) {
2451 			/* XXX print path */
2452 			bch_err(c, "directory structure loop");
2453 
2454 			darray_for_each(*p, i)
2455 				pr_err("%llu:%u", i->inum, i->snapshot);
2456 			pr_err("%llu:%u", inode.bi_inum, snapshot);
2457 
2458 			if (fsck_err(c, dir_loop, "directory structure loop")) {
2459 				ret = remove_backpointer(trans, &inode);
2460 				bch_err_msg(c, ret, "removing dirent");
2461 				if (ret)
2462 					break;
2463 
2464 				ret = reattach_inode(trans, &inode, snapshot);
2465 				bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
2466 			}
2467 			break;
2468 		}
2469 	}
2470 out:
2471 fsck_err:
2472 	bch2_trans_iter_exit(trans, &inode_iter);
2473 	printbuf_exit(&buf);
2474 	bch_err_fn(c, ret);
2475 	return ret;
2476 }
2477 
/*
 * Check for unreachable inodes, as well as loops in the directory structure:
 * after bch2_check_dirents(), an inode whose backpointer doesn't point to a
 * valid dirent is unreachable:
 */
2483 int bch2_check_directory_structure(struct bch_fs *c)
2484 {
2485 	pathbuf path = { 0, };
2486 	int ret;
2487 
2488 	ret = bch2_trans_run(c,
2489 		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
2490 					  BTREE_ITER_intent|
2491 					  BTREE_ITER_prefetch|
2492 					  BTREE_ITER_all_snapshots, k,
2493 					  NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
2494 			if (!bkey_is_inode(k.k))
2495 				continue;
2496 
2497 			if (bch2_inode_flags(k) & BCH_INODE_unlinked)
2498 				continue;
2499 
2500 			check_path(trans, &path, k);
2501 		})));
2502 	darray_exit(&path);
2503 
2504 	bch_err_fn(c, ret);
2505 	return ret;
2506 }
2507 
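/*
 * Hardlink counts are accumulated in this in-memory table of (inode number,
 * snapshot, count) entries; if the table can't be grown, the nlink check runs
 * in multiple passes over ranges of inode numbers:
 */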
2508 struct nlink_table {
2509 	size_t		nr;
2510 	size_t		size;
2511 
2512 	struct nlink {
2513 		u64	inum;
2514 		u32	snapshot;
2515 		u32	count;
2516 	}		*d;
2517 };
2518 
2519 static int add_nlink(struct bch_fs *c, struct nlink_table *t,
2520 		     u64 inum, u32 snapshot)
2521 {
2522 	if (t->nr == t->size) {
2523 		size_t new_size = max_t(size_t, 128UL, t->size * 2);
2524 		void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);
2525 
2526 		if (!d) {
2527 			bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
2528 				new_size);
2529 			return -BCH_ERR_ENOMEM_fsck_add_nlink;
2530 		}
2531 
2532 		if (t->d)
2533 			memcpy(d, t->d, t->size * sizeof(t->d[0]));
2534 		kvfree(t->d);
2535 
2536 		t->d = d;
2537 		t->size = new_size;
	}

2541 	t->d[t->nr++] = (struct nlink) {
2542 		.inum		= inum,
2543 		.snapshot	= snapshot,
2544 	};
2545 
2546 	return 0;
2547 }
2548 
2549 static int nlink_cmp(const void *_l, const void *_r)
2550 {
2551 	const struct nlink *l = _l;
2552 	const struct nlink *r = _r;
2553 
2554 	return cmp_int(l->inum, r->inum);
2555 }
2556 
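/*
 * Account one dirent reference to @inum, if it falls in the current range:
 * binary search for the inode's entries, then bump the counts of the versions
 * this reference is visible in:
 */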
2557 static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
2558 		     struct nlink_table *links,
2559 		     u64 range_start, u64 range_end, u64 inum, u32 snapshot)
2560 {
2561 	struct nlink *link, key = {
2562 		.inum = inum, .snapshot = U32_MAX,
2563 	};
2564 
2565 	if (inum < range_start || inum >= range_end)
2566 		return;
2567 
2568 	link = __inline_bsearch(&key, links->d, links->nr,
2569 				sizeof(links->d[0]), nlink_cmp);
2570 	if (!link)
2571 		return;
2572 
2573 	while (link > links->d && link[0].inum == link[-1].inum)
2574 		--link;
2575 
2576 	for (; link < links->d + links->nr && link->inum == inum; link++)
2577 		if (ref_visible(c, s, snapshot, link->snapshot)) {
2578 			link->count++;
2579 			if (link->snapshot >= snapshot)
2580 				break;
2581 		}
2582 }
2583 
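/*
 * First pass: collect all non-directory inodes with nonzero i_nlink, starting
 * from @start; if we run out of memory, record in @end where the next pass
 * over the inodes btree should resume:
 */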
2584 noinline_for_stack
2585 static int check_nlinks_find_hardlinks(struct bch_fs *c,
2586 				       struct nlink_table *t,
2587 				       u64 start, u64 *end)
2588 {
2589 	int ret = bch2_trans_run(c,
2590 		for_each_btree_key(trans, iter, BTREE_ID_inodes,
2591 				   POS(0, start),
2592 				   BTREE_ITER_intent|
2593 				   BTREE_ITER_prefetch|
2594 				   BTREE_ITER_all_snapshots, k, ({
2595 			if (!bkey_is_inode(k.k))
2596 				continue;
2597 
2598 			/* Should never fail, checked by bch2_inode_invalid: */
2599 			struct bch_inode_unpacked u;
2600 			BUG_ON(bch2_inode_unpack(k, &u));
2601 
2602 			/*
2603 			 * Backpointer and directory structure checks are sufficient for
2604 			 * directories, since they can't have hardlinks:
2605 			 */
2606 			if (S_ISDIR(u.bi_mode))
2607 				continue;
2608 
2609 			if (!u.bi_nlink)
2610 				continue;
2611 
2612 			ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
2613 			if (ret) {
2614 				*end = k.k->p.offset;
2615 				ret = 0;
2616 				break;
2617 			}
2618 			0;
2619 		})));
2620 
2621 	bch_err_fn(c, ret);
2622 	return ret;
2623 }
2624 
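/*
 * Second pass: walk dirents and count references to the inodes collected in
 * the first pass (directory and subvolume dirents are skipped):
 */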
2625 noinline_for_stack
2626 static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
2627 				     u64 range_start, u64 range_end)
2628 {
2629 	struct snapshots_seen s;
2630 
2631 	snapshots_seen_init(&s);
2632 
2633 	int ret = bch2_trans_run(c,
2634 		for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
2635 				   BTREE_ITER_intent|
2636 				   BTREE_ITER_prefetch|
2637 				   BTREE_ITER_all_snapshots, k, ({
2638 			ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
2639 			if (ret)
2640 				break;
2641 
2642 			if (k.k->type == KEY_TYPE_dirent) {
2643 				struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
2644 
2645 				if (d.v->d_type != DT_DIR &&
2646 				    d.v->d_type != DT_SUBVOL)
2647 					inc_link(c, &s, links, range_start, range_end,
2648 						 le64_to_cpu(d.v->d_inum), d.k->p.snapshot);
2649 			}
2650 			0;
2651 		})));
2652 
2653 	snapshots_seen_exit(&s);
2654 
2655 	bch_err_fn(c, ret);
2656 	return ret;
2657 }
2658 
2659 static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
2660 				     struct bkey_s_c k,
2661 				     struct nlink_table *links,
2662 				     size_t *idx, u64 range_end)
2663 {
2664 	struct bch_fs *c = trans->c;
2665 	struct bch_inode_unpacked u;
2666 	struct nlink *link = &links->d[*idx];
2667 	int ret = 0;
2668 
2669 	if (k.k->p.offset >= range_end)
2670 		return 1;
2671 
2672 	if (!bkey_is_inode(k.k))
2673 		return 0;
2674 
2675 	BUG_ON(bch2_inode_unpack(k, &u));
2676 
2677 	if (S_ISDIR(u.bi_mode))
2678 		return 0;
2679 
2680 	if (!u.bi_nlink)
2681 		return 0;
2682 
2683 	while ((cmp_int(link->inum, k.k->p.offset) ?:
2684 		cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
2685 		BUG_ON(*idx == links->nr);
2686 		link = &links->d[++*idx];
2687 	}
2688 
2689 	if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
2690 			c, inode_wrong_nlink,
2691 			"inode %llu type %s has wrong i_nlink (%u, should be %u)",
2692 			u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
2693 			bch2_inode_nlink_get(&u), link->count)) {
2694 		bch2_inode_nlink_set(&u, link->count);
2695 		ret = __bch2_fsck_write_inode(trans, &u, k.k->p.snapshot);
2696 	}
2697 fsck_err:
2698 	return ret;
2699 }
2700 
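/*
 * Third pass: walk inodes in the range again and rewrite any whose i_nlink
 * disagrees with the reference count we accumulated:
 */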
2701 noinline_for_stack
2702 static int check_nlinks_update_hardlinks(struct bch_fs *c,
2703 			       struct nlink_table *links,
2704 			       u64 range_start, u64 range_end)
2705 {
2706 	size_t idx = 0;
2707 
2708 	int ret = bch2_trans_run(c,
2709 		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
2710 				POS(0, range_start),
2711 				BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
2712 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2713 			check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
2714 	if (ret < 0) {
2715 		bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
2716 		return ret;
2717 	}
2718 
2719 	return 0;
2720 }
2721 
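/*
 * Check i_nlink for non-directory inodes; directories are covered by the
 * backpointer and directory structure checks, since they can't be hardlinked:
 */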
2722 int bch2_check_nlinks(struct bch_fs *c)
2723 {
2724 	struct nlink_table links = { 0 };
2725 	u64 this_iter_range_start, next_iter_range_start = 0;
2726 	int ret = 0;
2727 
2728 	do {
2729 		this_iter_range_start = next_iter_range_start;
2730 		next_iter_range_start = U64_MAX;
2731 
		ret = check_nlinks_find_hardlinks(c, &links,
						  this_iter_range_start,
						  &next_iter_range_start);
		if (ret)
			break;
2735 
2736 		ret = check_nlinks_walk_dirents(c, &links,
2737 					  this_iter_range_start,
2738 					  next_iter_range_start);
2739 		if (ret)
2740 			break;
2741 
2742 		ret = check_nlinks_update_hardlinks(c, &links,
2743 					 this_iter_range_start,
2744 					 next_iter_range_start);
2745 		if (ret)
2746 			break;
2747 
2748 		links.nr = 0;
2749 	} while (next_iter_range_start != U64_MAX);
2750 
2751 	kvfree(links.d);
2752 	bch_err_fn(c, ret);
2753 	return ret;
2754 }
2755 
2756 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
2757 			     struct bkey_s_c k)
2758 {
2759 	struct bkey_s_c_reflink_p p;
2760 	struct bkey_i_reflink_p *u;
2761 
2762 	if (k.k->type != KEY_TYPE_reflink_p)
2763 		return 0;
2764 
2765 	p = bkey_s_c_to_reflink_p(k);
2766 
2767 	if (!p.v->front_pad && !p.v->back_pad)
2768 		return 0;
2769 
2770 	u = bch2_trans_kmalloc(trans, sizeof(*u));
2771 	int ret = PTR_ERR_OR_ZERO(u);
2772 	if (ret)
2773 		return ret;
2774 
2775 	bkey_reassemble(&u->k_i, k);
2776 	u->v.front_pad	= 0;
2777 	u->v.back_pad	= 0;
2778 
2779 	return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun);
2780 }
2781 
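/*
 * For filesystems from before the reflink_p_fix metadata version: walk extents
 * and clear front_pad/back_pad on any reflink pointers:
 */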
2782 int bch2_fix_reflink_p(struct bch_fs *c)
2783 {
2784 	if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
2785 		return 0;
2786 
2787 	int ret = bch2_trans_run(c,
2788 		for_each_btree_key_commit(trans, iter,
2789 				BTREE_ID_extents, POS_MIN,
2790 				BTREE_ITER_intent|BTREE_ITER_prefetch|
2791 				BTREE_ITER_all_snapshots, k,
2792 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2793 			fix_reflink_p_key(trans, &iter, k)));
2794 	bch_err_fn(c, ret);
2795 	return ret;
2796 }
2797