xref: /linux/fs/bcachefs/fsck.c (revision e814f3fd16acfb7f9966773953de8f740a1e3202)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "bcachefs.h"
4 #include "bcachefs_ioctl.h"
5 #include "bkey_buf.h"
6 #include "btree_cache.h"
7 #include "btree_update.h"
8 #include "buckets.h"
9 #include "darray.h"
10 #include "dirent.h"
11 #include "error.h"
12 #include "fs.h"
13 #include "fs-common.h"
14 #include "fsck.h"
15 #include "inode.h"
16 #include "keylist.h"
17 #include "recovery_passes.h"
18 #include "snapshot.h"
19 #include "super.h"
20 #include "thread_with_file.h"
21 #include "xattr.h"
22 
23 #include <linux/bsearch.h>
24 #include <linux/dcache.h> /* struct qstr */
25 
26 static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
27 				   struct bkey_s_c_dirent d)
28 {
29 	return  inode->bi_dir		== d.k->p.inode &&
30 		inode->bi_dir_offset	== d.k->p.offset;
31 }
32 
33 static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d,
34 					 struct bch_inode_unpacked *inode)
35 {
36 	if (d.v->d_type == DT_SUBVOL
37 	    ? le32_to_cpu(d.v->d_child_subvol)	== inode->bi_subvol
38 	    : le64_to_cpu(d.v->d_inum)		== inode->bi_inum)
39 		return 0;
40 	return -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
41 }
42 
/*
 * Format a diagnostic message showing both sides of a dirent <-> inode
 * backpointer mismatch: the dirent key and the unpacked inode.
 */
static void dirent_inode_mismatch_msg(struct printbuf *out,
				      struct bch_fs *c,
				      struct bkey_s_c_dirent dirent,
				      struct bch_inode_unpacked *inode)
{
	prt_str(out, "inode points to dirent that does not point back:");
	prt_newline(out);
	bch2_bkey_val_to_text(out, c, dirent.s_c);
	prt_newline(out);
	bch2_inode_unpacked_to_text(out, inode);
}
54 
55 static int dirent_points_to_inode(struct bch_fs *c,
56 				  struct bkey_s_c_dirent dirent,
57 				  struct bch_inode_unpacked *inode)
58 {
59 	int ret = dirent_points_to_inode_nowarn(dirent, inode);
60 	if (ret) {
61 		struct printbuf buf = PRINTBUF;
62 		dirent_inode_mismatch_msg(&buf, c, dirent, inode);
63 		bch_warn(c, "%s", buf.buf);
64 		printbuf_exit(&buf);
65 	}
66 	return ret;
67 }
68 
/*
 * Sum the sizes of all allocated extents belonging to @inum in @snapshot;
 * returns the total sector count, or a negative error code.
 *
 * XXX: this is handling transaction restarts without returning
 * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
 */
static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
				    u32 snapshot)
{
	u64 sectors = 0;

	int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
				SPOS(inum, 0, snapshot),
				POS(inum, U64_MAX),
				0, k, ({
		/* only extents that occupy space count towards bi_sectors */
		if (bkey_extent_is_allocation(k.k))
			sectors += k.k->size;
		0;
	}));

	return ret ?: sectors;
}
89 
/*
 * Count the subdirectory dirents in directory @inum in @snapshot (used to
 * validate a directory's link count); returns the count or a negative error.
 */
static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
				    u32 snapshot)
{
	u64 subdirs = 0;

	int ret = for_each_btree_key_max(trans, iter, BTREE_ID_dirents,
				    SPOS(inum, 0, snapshot),
				    POS(inum, U64_MAX),
				    0, k, ({
		if (k.k->type == KEY_TYPE_dirent &&
		    bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
			subdirs++;
		0;
	}));

	return ret ?: subdirs;
}
107 
108 static int subvol_lookup(struct btree_trans *trans, u32 subvol,
109 			 u32 *snapshot, u64 *inum)
110 {
111 	struct bch_subvolume s;
112 	int ret = bch2_subvolume_get(trans, subvol, false, &s);
113 
114 	*snapshot = le32_to_cpu(s.snapshot);
115 	*inum = le64_to_cpu(s.inode);
116 	return ret;
117 }
118 
119 static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
120 			      struct bch_inode_unpacked *inode)
121 {
122 	struct btree_iter iter;
123 	struct bkey_s_c k;
124 	int ret;
125 
126 	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr),
127 				     BTREE_ITER_all_snapshots, k, ret) {
128 		if (k.k->p.offset != inode_nr)
129 			break;
130 		if (!bkey_is_inode(k.k))
131 			continue;
132 		ret = bch2_inode_unpack(k, inode);
133 		goto found;
134 	}
135 	ret = -BCH_ERR_ENOENT_inode;
136 found:
137 	bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
138 	bch2_trans_iter_exit(trans, &iter);
139 	return ret;
140 }
141 
142 static int lookup_inode(struct btree_trans *trans, u64 inode_nr, u32 snapshot,
143 			struct bch_inode_unpacked *inode)
144 {
145 	struct btree_iter iter;
146 	struct bkey_s_c k;
147 	int ret;
148 
149 	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
150 			       SPOS(0, inode_nr, snapshot), 0);
151 	ret = bkey_err(k);
152 	if (ret)
153 		goto err;
154 
155 	ret = bkey_is_inode(k.k)
156 		? bch2_inode_unpack(k, inode)
157 		: -BCH_ERR_ENOENT_inode;
158 err:
159 	bch2_trans_iter_exit(trans, &iter);
160 	return ret;
161 }
162 
/*
 * Look up dirent @name in directory @dir in snapshot @snapshot via the hash
 * table; on success return the target inode number in @target and the dirent
 * type in @type.
 *
 * NOTE(review): on lookup error we return without exiting @iter here -
 * presumably bch2_hash_lookup_in_snapshot() exits its iter on failure;
 * confirm. Also assumes the key found is KEY_TYPE_dirent (d_inum is read
 * without a type check).
 */
static int lookup_dirent_in_snapshot(struct btree_trans *trans,
			   struct bch_hash_info hash_info,
			   subvol_inum dir, struct qstr *name,
			   u64 *target, unsigned *type, u32 snapshot)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
							 &hash_info, dir, name, 0, snapshot);
	int ret = bkey_err(k);
	if (ret)
		return ret;

	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
	*target = le64_to_cpu(d.v->d_inum);
	*type = d.v->d_type;
	bch2_trans_iter_exit(trans, &iter);
	return 0;
}
181 
/*
 * Delete the dirent at @pos: look up the containing directory's inode (any
 * snapshot version) so we can recompute the hash info needed by the hash
 * delete path.
 */
static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bch_inode_unpacked dir_inode;
	struct bch_hash_info dir_hash_info;
	int ret;

	ret = lookup_first_inode(trans, pos.inode, &dir_inode);
	if (ret)
		goto err;

	dir_hash_info = bch2_hash_info_init(c, &dir_inode);

	bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent);

	/* traverse first: hash_delete_at() requires a positioned iterator */
	ret =   bch2_btree_iter_traverse(&iter) ?:
		bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
				    &dir_hash_info, &iter,
				    BTREE_UPDATE_internal_snapshot_node);
	bch2_trans_iter_exit(trans, &iter);
err:
	bch_err_fn(c, ret);
	return ret;
}
207 
208 /*
209  * Find any subvolume associated with a tree of snapshots
210  * We can't rely on master_subvol - it might have been deleted.
211  */
212 static int find_snapshot_tree_subvol(struct btree_trans *trans,
213 				     u32 tree_id, u32 *subvol)
214 {
215 	struct btree_iter iter;
216 	struct bkey_s_c k;
217 	int ret;
218 
219 	for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) {
220 		if (k.k->type != KEY_TYPE_snapshot)
221 			continue;
222 
223 		struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
224 		if (le32_to_cpu(s.v->tree) != tree_id)
225 			continue;
226 
227 		if (s.v->subvol) {
228 			*subvol = le32_to_cpu(s.v->subvol);
229 			goto found;
230 		}
231 	}
232 	ret = -BCH_ERR_ENOENT_no_snapshot_tree_subvol;
233 found:
234 	bch2_trans_iter_exit(trans, &iter);
235 	return ret;
236 }
237 
/* Get lost+found, create if it doesn't exist: */
static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
			    struct bch_inode_unpacked *lostfound,
			    u64 reattaching_inum)
{
	struct bch_fs *c = trans->c;
	struct qstr lostfound_str = QSTR("lost+found");
	struct btree_iter lostfound_iter = { NULL };
	u64 inum = 0;
	unsigned d_type = 0;
	int ret;

	/* Find the snapshot tree @snapshot belongs to: */
	struct bch_snapshot_tree st;
	ret = bch2_snapshot_tree_lookup(trans,
			bch2_snapshot_tree(c, snapshot), &st);
	if (ret)
		return ret;

	/* Find any live subvolume in that tree (master_subvol may be gone): */
	u32 subvolid;
	ret = find_snapshot_tree_subvol(trans,
				bch2_snapshot_tree(c, snapshot), &subvolid);
	bch_err_msg(c, ret, "finding subvol associated with snapshot tree %u",
		    bch2_snapshot_tree(c, snapshot));
	if (ret)
		return ret;

	struct bch_subvolume subvol;
	ret = bch2_subvolume_get(trans, subvolid, false, &subvol);
	bch_err_msg(c, ret, "looking up subvol %u for snapshot %u", subvolid, snapshot);
	if (ret)
		return ret;

	/*
	 * Subvolume has no root inode: repair by pointing it at the inode
	 * being reattached:
	 */
	if (!subvol.inode) {
		struct btree_iter iter;
		struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter,
				BTREE_ID_subvolumes, POS(0, subvolid),
				0, subvolume);
		ret = PTR_ERR_OR_ZERO(subvol);
		if (ret)
			return ret;

		subvol->v.inode = cpu_to_le64(reattaching_inum);
		bch2_trans_iter_exit(trans, &iter);
	}

	subvol_inum root_inum = {
		.subvol = subvolid,
		.inum = le64_to_cpu(subvol.inode)
	};

	struct bch_inode_unpacked root_inode;
	struct bch_hash_info root_hash_info;
	ret = lookup_inode(trans, root_inum.inum, snapshot, &root_inode);
	bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
		    root_inum.inum, subvolid);
	if (ret)
		return ret;

	root_hash_info = bch2_hash_info_init(c, &root_inode);

	/* Does lost+found already exist in the subvolume root directory? */
	ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
			      &lostfound_str, &inum, &d_type, snapshot);
	if (bch2_err_matches(ret, ENOENT))
		goto create_lostfound;

	bch_err_fn(c, ret);
	if (ret)
		return ret;

	if (d_type != DT_DIR) {
		bch_err(c, "error looking up lost+found: not a directory");
		return -BCH_ERR_ENOENT_not_directory;
	}

	/*
	 * The bch2_check_dirents pass has already run, dangling dirents
	 * shouldn't exist here:
	 */
	ret = lookup_inode(trans, inum, snapshot, lostfound);
	bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
		    inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
	return ret;

create_lostfound:
	/*
	 * we always create lost+found in the root snapshot; we don't want
	 * different branches of the snapshot tree to have different lost+found
	 */
	snapshot = le32_to_cpu(st.root_snapshot);
	/*
	 * XXX: we could have a nicer log message here if we had a nice way to
	 * walk backpointers to print a path
	 */
	struct printbuf path = PRINTBUF;
	ret = bch2_inum_to_path(trans, root_inum, &path);
	if (ret)
		goto err;

	bch_notice(c, "creating %s/lost+found in subvol %llu snapshot %u",
		   path.buf, root_inum.subvol, snapshot);
	printbuf_exit(&path);

	u64 now = bch2_current_time(c);
	u64 cpu = raw_smp_processor_id();

	bch2_inode_init_early(c, lostfound);
	bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
	lostfound->bi_dir = root_inode.bi_inum;
	lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot);

	/* new subdirectory bumps the parent's link count ("..") */
	root_inode.bi_nlink++;

	ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
	if (ret)
		goto err;

	bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot);
	ret = bch2_btree_iter_traverse(&lostfound_iter);
	if (ret)
		goto err;

	/* create the dirent, then write the new inode: */
	ret =   bch2_dirent_create_snapshot(trans,
				0, root_inode.bi_inum, snapshot, &root_hash_info,
				mode_to_type(lostfound->bi_mode),
				&lostfound_str,
				lostfound->bi_inum,
				&lostfound->bi_dir_offset,
				STR_HASH_must_create) ?:
		bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
				       BTREE_UPDATE_internal_snapshot_node);
err:
	bch_err_msg(c, ret, "creating lost+found");
	bch2_trans_iter_exit(trans, &lostfound_iter);
	return ret;
}
373 
374 static inline bool inode_should_reattach(struct bch_inode_unpacked *inode)
375 {
376 	if (inode->bi_inum == BCACHEFS_ROOT_INO &&
377 	    inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)
378 		return false;
379 
380 	return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked);
381 }
382 
/*
 * If a dirent exists at @d_pos in snapshot @snapshot (or would be visible
 * there), emit a whiteout so it no longer appears in that snapshot.
 */
static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents,
					SPOS(d_pos.inode, d_pos.offset, snapshot),
					BTREE_ITER_intent|
					BTREE_ITER_with_updates);
	int ret = bkey_err(k);
	if (ret)
		return ret;

	if (bpos_eq(k.k->p, d_pos)) {
		/*
		 * bch2_btree_delete_at() doesn't work because the update path
		 * doesn't internally use BTREE_ITER_with_updates yet - write
		 * the whiteout by hand:
		 */
		struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
		ret = PTR_ERR_OR_ZERO(k);
		if (ret)
			goto err;

		bkey_init(&k->k);
		k->k.type = KEY_TYPE_whiteout;
		k->k.p = iter.pos;
		ret = bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node);
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
413 
/*
 * Reattach an orphaned inode to lost+found: create a dirent for it, update
 * its backpointer fields, then fix up versions of the inode in child
 * snapshots.
 */
static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
{
	struct bch_fs *c = trans->c;
	struct bch_inode_unpacked lostfound;
	char name_buf[20];
	int ret;

	/*
	 * Subvolume roots are reattached in the root subvolume's snapshot,
	 * named "subvol-N"; ordinary inodes go in their own snapshot, named
	 * by inode number:
	 */
	u32 dirent_snapshot = inode->bi_snapshot;
	if (inode->bi_subvol) {
		inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;

		u64 root_inum;
		ret = subvol_lookup(trans, inode->bi_parent_subvol,
				    &dirent_snapshot, &root_inum);
		if (ret)
			return ret;

		snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
	} else {
		snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
	}

	ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
	if (ret)
		return ret;

	/* reattaching a directory adds a ".." link to lost+found: */
	lostfound.bi_nlink += S_ISDIR(inode->bi_mode);

	/* ensure lost+found inode is also present in inode snapshot */
	if (!inode->bi_subvol) {
		BUG_ON(!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, lostfound.bi_snapshot));
		lostfound.bi_snapshot = inode->bi_snapshot;
	}

	ret = __bch2_fsck_write_inode(trans, &lostfound);
	if (ret)
		return ret;

	struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound);
	struct qstr name = (struct qstr) QSTR(name_buf);

	inode->bi_dir = lostfound.bi_inum;

	ret = bch2_dirent_create_snapshot(trans,
				inode->bi_parent_subvol, lostfound.bi_inum,
				dirent_snapshot,
				&dir_hash,
				inode_d_type(inode),
				&name,
				inode->bi_subvol ?: inode->bi_inum,
				&inode->bi_dir_offset,
				STR_HASH_must_create);
	if (ret) {
		bch_err_msg(c, ret, "error creating dirent");
		return ret;
	}

	ret = __bch2_fsck_write_inode(trans, inode);
	if (ret)
		return ret;

	/*
	 * Fix up inodes in child snapshots: if they should also be reattached
	 * update the backpointer field, if they should not be we need to emit
	 * whiteouts for the dirent we just created.
	 */
	if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) {
		snapshot_id_list whiteouts_done;
		struct btree_iter iter;
		struct bkey_s_c k;

		darray_init(&whiteouts_done);

		/* walk descendants: snapshot ids of children are numerically lower */
		for_each_btree_key_reverse_norestart(trans, iter,
				BTREE_ID_inodes, SPOS(0, inode->bi_inum, inode->bi_snapshot - 1),
				BTREE_ITER_all_snapshots|BTREE_ITER_intent, k, ret) {
			if (k.k->p.offset != inode->bi_inum)
				break;

			if (!bkey_is_inode(k.k) ||
			    !bch2_snapshot_is_ancestor(c, k.k->p.snapshot, inode->bi_snapshot) ||
			    snapshot_list_has_ancestor(c, &whiteouts_done, k.k->p.snapshot))
				continue;

			struct bch_inode_unpacked child_inode;
			ret = bch2_inode_unpack(k, &child_inode);
			if (ret)
				break;

			if (!inode_should_reattach(&child_inode)) {
				/* hide the new dirent from this child snapshot: */
				ret = maybe_delete_dirent(trans,
							  SPOS(lostfound.bi_inum, inode->bi_dir_offset,
							       dirent_snapshot),
							  k.k->p.snapshot);
				if (ret)
					break;

				ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot);
				if (ret)
					break;
			} else {
				/* propagate the new backpointer to the child: */
				iter.snapshot = k.k->p.snapshot;
				child_inode.bi_dir = inode->bi_dir;
				child_inode.bi_dir_offset = inode->bi_dir_offset;

				ret = bch2_inode_write_flags(trans, &iter, &child_inode,
							     BTREE_UPDATE_internal_snapshot_node);
				if (ret)
					break;
			}
		}
		darray_exit(&whiteouts_done);
		bch2_trans_iter_exit(trans, &iter);
	}

	return ret;
}
531 
/*
 * Fetch the dirent at @pos; the caller owns @iter and must exit it,
 * including on error.
 */
static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
						struct btree_iter *iter,
						struct bpos pos)
{
	return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
}
538 
/*
 * Delete the dirent that @inode's backpointer fields (bi_dir/bi_dir_offset)
 * refer to; errors if the dirent is missing or doesn't point back at @inode.
 * No-op (returns 0) if the inode has no backpointer.
 */
static int remove_backpointer(struct btree_trans *trans,
			      struct bch_inode_unpacked *inode)
{
	if (!inode->bi_dir)
		return 0;

	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c_dirent d = dirent_get_by_pos(trans, &iter,
				     SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot));
	int ret = bkey_err(d) ?:
		  dirent_points_to_inode(c, d, inode) ?:
		  __remove_dirent(trans, d.k->p);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
555 
/*
 * Reattach a subvolume's root inode to lost+found: remove its old dirent
 * backpointer (a missing dirent is tolerated) and create a new dirent via
 * reattach_inode().
 */
static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s)
{
	struct bch_fs *c = trans->c;

	struct bch_inode_unpacked inode;
	int ret = bch2_inode_find_by_inum_trans(trans,
				(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
				&inode);
	if (ret)
		return ret;

	ret = remove_backpointer(trans, &inode);
	if (!bch2_err_matches(ret, ENOENT))
		bch_err_msg(c, ret, "removing dirent");
	if (ret)
		return ret;

	ret = reattach_inode(trans, &inode);
	bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
	return ret;
}
577 
/*
 * Recreate a missing subvolume key for @subvolid with snapshot @snapshotid
 * and root inode @inum (creating a root inode first if @inum is 0), and fix
 * up the snapshot and snapshot_tree keys to reference it.
 */
static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
{
	struct bch_fs *c = trans->c;

	if (!bch2_snapshot_is_leaf(c, snapshotid)) {
		bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
		return -BCH_ERR_fsck_repair_unimplemented;
	}

	/*
	 * If inum isn't set, that means we're being called from check_dirents,
	 * not check_inodes - the root of this subvolume doesn't exist or we
	 * would have found it there:
	 */
	if (!inum) {
		struct btree_iter inode_iter = {};
		struct bch_inode_unpacked new_inode;
		u64 cpu = raw_smp_processor_id();

		bch2_inode_init_early(c, &new_inode);
		bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);

		new_inode.bi_subvol = subvolid;

		int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
			  bch2_btree_iter_traverse(&inode_iter) ?:
			  bch2_inode_write(trans, &inode_iter, &new_inode);
		bch2_trans_iter_exit(trans, &inode_iter);
		if (ret)
			return ret;

		inum = new_inode.bi_inum;
	}

	bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);

	/* create the subvolume key itself: */
	struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
	int ret = PTR_ERR_OR_ZERO(new_subvol);
	if (ret)
		return ret;

	bkey_subvolume_init(&new_subvol->k_i);
	new_subvol->k.p.offset	= subvolid;
	new_subvol->v.snapshot	= cpu_to_le32(snapshotid);
	new_subvol->v.inode	= cpu_to_le64(inum);
	ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
	if (ret)
		return ret;

	/* point the snapshot back at the reconstructed subvolume: */
	struct btree_iter iter;
	struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_snapshots, POS(0, snapshotid),
			0, snapshot);
	ret = PTR_ERR_OR_ZERO(s);
	bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
	if (ret)
		return ret;

	u32 snapshot_tree = le32_to_cpu(s->v.tree);

	s->v.subvol = cpu_to_le32(subvolid);
	SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
	bch2_trans_iter_exit(trans, &iter);

	/* if the snapshot tree has no master subvolume, use this one: */
	struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
			0, snapshot_tree);
	ret = PTR_ERR_OR_ZERO(st);
	bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
	if (ret)
		return ret;

	if (!st->v.master_subvol)
		st->v.master_subvol = cpu_to_le32(subvolid);

	bch2_trans_iter_exit(trans, &iter);
	return 0;
}
656 
/*
 * Recreate a missing inode @inum:@snapshot, inferring its mode and size from
 * the btree where keys belonging to it were found.
 */
static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum)
{
	struct bch_fs *c = trans->c;
	unsigned i_mode = S_IFREG;
	u64 i_size = 0;

	switch (btree) {
	case BTREE_ID_extents: {
		/* regular file: size is the end of the last extent */
		struct btree_iter iter = {};

		bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
		struct bkey_s_c k = bch2_btree_iter_peek_prev_min(&iter, POS(inum, 0));
		bch2_trans_iter_exit(trans, &iter);
		int ret = bkey_err(k);
		if (ret)
			return ret;

		/*
		 * NOTE(review): assumes at least one extent exists for this
		 * inode (k.k would be NULL otherwise) - confirm callers
		 * guarantee this.  Shift converts sectors to bytes.
		 */
		i_size = k.k->p.offset << 9;
		break;
	}
	case BTREE_ID_dirents:
		i_mode = S_IFDIR;
		break;
	case BTREE_ID_xattrs:
		break;
	default:
		BUG();
	}

	struct bch_inode_unpacked new_inode;
	bch2_inode_init_early(c, &new_inode);
	bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL);
	new_inode.bi_size = i_size;
	new_inode.bi_inum = inum;
	new_inode.bi_snapshot = snapshot;

	return __bch2_fsck_write_inode(trans, &new_inode);
}
695 
/*
 * Tracks which snapshot ids we've seen keys in at the current btree
 * position; used for key-visibility checks across snapshots.
 */
struct snapshots_seen {
	struct bpos			pos;	/* position @ids applies to */
	snapshot_id_list		ids;	/* snapshot ids seen at @pos, kept sorted */
};
700 
/* Free the memory backing the seen-snapshots list. */
static inline void snapshots_seen_exit(struct snapshots_seen *s)
{
	darray_exit(&s->ids);
}
705 
/* Zero-initialize: empty id list, position all-zeroes. */
static inline void snapshots_seen_init(struct snapshots_seen *s)
{
	memset(s, 0, sizeof(*s));
}
710 
/*
 * Insert @id into @s->ids keeping the list sorted ascending; duplicates are
 * ignored. Returns 0 or a memory allocation error.
 */
static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
{
	u32 *i;
	__darray_for_each(s->ids, i) {
		if (*i == id)
			return 0;	/* already present */
		if (*i > id)
			break;		/* found insertion point */
	}

	int ret = darray_insert_item(&s->ids, i - s->ids.data, id);
	if (ret)
		bch_err(c, "error reallocating snapshots_seen table (size %zu)",
			s->ids.size);
	return ret;
}
727 
728 static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
729 				 enum btree_id btree_id, struct bpos pos)
730 {
731 	if (!bkey_eq(s->pos, pos))
732 		s->ids.nr = 0;
733 	s->pos = pos;
734 
735 	return snapshot_list_add_nodup(c, &s->ids, pos.snapshot);
736 }
737 
/**
 * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
 * and @ancestor hasn't been overwritten in @seen
 *
 * @c:		filesystem handle
 * @seen:	list of snapshot ids already seen at current position
 * @id:		descendent snapshot id
 * @ancestor:	ancestor snapshot id
 *
 * Returns:	whether key in @ancestor snapshot is visible in @id snapshot
 */
static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
				    u32 id, u32 ancestor)
{
	ssize_t i;

	EBUG_ON(id > ancestor);

	/* @ancestor should be the snapshot most recently added to @seen */
	EBUG_ON(ancestor != seen->pos.snapshot);
	EBUG_ON(ancestor != darray_last(seen->ids));

	if (id == ancestor)
		return true;

	if (!bch2_snapshot_is_ancestor(c, id, ancestor))
		return false;

	/*
	 * We know that @id is a descendant of @ancestor, we're checking if
	 * we've seen a key that overwrote @ancestor - i.e. also a descendant of
	 * @ancestor and with @id as a descendant.
	 *
	 * But we already know that we're scanning IDs between @id and @ancestor
	 * numerically, since snapshot ID lists are kept sorted, so if we find
	 * an id that's an ancestor of @id we're done:
	 */

	for (i = seen->ids.nr - 2;
	     i >= 0 && seen->ids.data[i] >= id;
	     --i)
		if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]))
			return false;

	return true;
}
784 
785 /**
786  * ref_visible - given a key with snapshot id @src that points to a key with
787  * snapshot id @dst, test whether there is some snapshot in which @dst is
788  * visible.
789  *
790  * @c:		filesystem handle
791  * @s:		list of snapshot IDs already seen at @src
792  * @src:	snapshot ID of src key
793  * @dst:	snapshot ID of dst key
794  * Returns:	true if there is some snapshot in which @dst is visible
795  *
796  * Assumes we're visiting @src keys in natural key order
797  */
798 static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
799 			u32 src, u32 dst)
800 {
801 	return dst <= src
802 		? key_visible_in_snapshot(c, s, dst, src)
803 		: bch2_snapshot_is_ancestor(c, src, dst);
804 }
805 
/*
 * As ref_visible(), for two keys that each have their own seen-snapshots
 * list; normalizes so dst <= src as key_visible_in_snapshot() requires.
 */
static int ref_visible2(struct bch_fs *c,
			u32 src, struct snapshots_seen *src_seen,
			u32 dst, struct snapshots_seen *dst_seen)
{
	if (dst > src) {
		swap(dst, src);
		swap(dst_seen, src_seen);
	}
	return key_visible_in_snapshot(c, src_seen, dst, src);
}
816 
/*
 * Iterate over inode walker entries, stopping once the entry snapshot
 * exceeds @_snapshot, and visiting only those visible in @_snapshot:
 */
#define for_each_visible_inode(_c, _s, _w, _snapshot, _i)				\
	for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr &&	\
	     (_i)->snapshot <= (_snapshot); _i++)					\
		if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
821 
/* One snapshot version of the inode currently being walked: */
struct inode_walker_entry {
	struct bch_inode_unpacked inode;
	u32			snapshot;	/* snapshot this version lives in */
	u64			count;		/* per-version accumulator (e.g. sectors/subdirs) */
};
827 
/*
 * Walks all snapshot versions of each inode while scanning another btree
 * (extents/dirents/...) in key order:
 */
struct inode_walker {
	bool				first_this_inode;
	bool				have_inodes;	/* inodes[] valid for last_pos.inode */
	bool				recalculate_sums;
	struct bpos			last_pos;	/* last key position walked */

	DARRAY(struct inode_walker_entry) inodes;
	snapshot_id_list		deletes;
};
837 
/* Free the walker's entry and deletion lists. */
static void inode_walker_exit(struct inode_walker *w)
{
	darray_exit(&w->inodes);
	darray_exit(&w->deletes);
}
843 
/* Return a zero-initialized walker. */
static struct inode_walker inode_walker_init(void)
{
	return (struct inode_walker) { 0, };
}
848 
/*
 * Unpack @inode and append it to the walker's list of snapshot versions.
 * Returns an unpack or allocation error.
 */
static int add_inode(struct bch_fs *c, struct inode_walker *w,
		     struct bkey_s_c inode)
{
	struct bch_inode_unpacked u;

	return bch2_inode_unpack(inode, &u) ?:
		darray_push(&w->inodes, ((struct inode_walker_entry) {
		.inode		= u,
		.snapshot	= inode.k->p.snapshot,
	}));
}
860 
861 static int get_inodes_all_snapshots(struct btree_trans *trans,
862 				    struct inode_walker *w, u64 inum)
863 {
864 	struct bch_fs *c = trans->c;
865 	struct btree_iter iter;
866 	struct bkey_s_c k;
867 	int ret;
868 
869 	/*
870 	 * We no longer have inodes for w->last_pos; clear this to avoid
871 	 * screwing up check_i_sectors/check_subdir_count if we take a
872 	 * transaction restart here:
873 	 */
874 	w->have_inodes = false;
875 	w->recalculate_sums = false;
876 	w->inodes.nr = 0;
877 
878 	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
879 				     BTREE_ITER_all_snapshots, k, ret) {
880 		if (k.k->p.offset != inum)
881 			break;
882 
883 		if (bkey_is_inode(k.k))
884 			add_inode(c, w, k);
885 	}
886 	bch2_trans_iter_exit(trans, &iter);
887 
888 	if (ret)
889 		return ret;
890 
891 	w->first_this_inode = true;
892 	w->have_inodes = true;
893 	return 0;
894 }
895 
/*
 * Find the inode walker entry visible from @k's snapshot (the nearest
 * ancestor version). If the key is in a strictly newer snapshot than the
 * inode version we found - which shouldn't happen, since updating a key
 * should update the inode too - synthesize a new entry for that snapshot.
 *
 * Returns NULL if no version of the inode is visible, or an ERR_PTR on
 * allocation failure.
 */
static struct inode_walker_entry *
lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k)
{
	bool is_whiteout = k.k->type == KEY_TYPE_whiteout;

	struct inode_walker_entry *i;
	__darray_for_each(w->inodes, i)
		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot))
			goto found;

	return NULL;
found:
	BUG_ON(k.k->p.snapshot > i->snapshot);

	if (k.k->p.snapshot != i->snapshot && !is_whiteout) {
		struct inode_walker_entry new = *i;

		new.snapshot = k.k->p.snapshot;
		new.count = 0;

		struct printbuf buf = PRINTBUF;
		bch2_bkey_val_to_text(&buf, c, k);

		bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
			 "unexpected because we should always update the inode when we update a key in that inode\n"
			 "%s",
			 w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf);
		printbuf_exit(&buf);

		/* keep w->inodes sorted by snapshot id, ascending: */
		while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot)
			--i;

		size_t pos = i - w->inodes.data;
		int ret = darray_insert_item(&w->inodes, pos, new);
		if (ret)
			return ERR_PTR(ret);

		/* darray_insert_item() may reallocate; recompute the pointer */
		i = w->inodes.data + pos;
	}

	return i;
}
938 
939 static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
940 					     struct inode_walker *w,
941 					     struct bkey_s_c k)
942 {
943 	if (w->last_pos.inode != k.k->p.inode) {
944 		int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
945 		if (ret)
946 			return ERR_PTR(ret);
947 	}
948 
949 	w->last_pos = k.k->p;
950 
951 	return lookup_inode_for_snapshot(trans->c, w, k);
952 }
953 
/*
 * Collect every version of inode @inum visible from snapshot s->pos.snapshot
 * into @w->inodes, and record in @w->deletes the snapshots in which the
 * inode has been deleted (non-inode key present).
 */
static int get_visible_inodes(struct btree_trans *trans,
			      struct inode_walker *w,
			      struct snapshots_seen *s,
			      u64 inum)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	w->inodes.nr = 0;
	w->deletes.nr = 0;

	for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot),
			   BTREE_ITER_all_snapshots, k, ret) {
		if (k.k->p.offset != inum)
			break;

		if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot))
			continue;

		/* already deleted in a closer snapshot - skip */
		if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot))
			continue;

		ret = bkey_is_inode(k.k)
			? add_inode(c, w, k)
			: snapshot_list_add(c, &w->deletes, k.k->p.snapshot);
		if (ret)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}
988 
/*
 * NOTE(review): the comment below appears to describe a duplicate-key
 * resolution helper ("delete k1 vs k2"), not bch2_fsck_update_backpointers()
 * which follows it - confirm which function it belongs with:
 *
 * Prefer to delete the first one, since that will be the one at the wrong
 * offset:
 * return value: 0 -> delete k1, 1 -> delete k2
 */
/*
 * After a dirent has been rewritten (e.g. moved to a new hash offset),
 * update bi_dir_offset in every visible version of the target inode so the
 * backpointers match the new dirent position. Non-dirent keys are a no-op;
 * subvolume dirents are not expected here.
 */
int bch2_fsck_update_backpointers(struct btree_trans *trans,
				  struct snapshots_seen *s,
				  const struct bch_hash_desc desc,
				  struct bch_hash_info *hash_info,
				  struct bkey_i *new)
{
	if (new->k.type != KEY_TYPE_dirent)
		return 0;

	struct bkey_i_dirent *d = bkey_i_to_dirent(new);
	struct inode_walker target = inode_walker_init();
	int ret = 0;

	if (d->v.d_type == DT_SUBVOL) {
		BUG();
	} else {
		ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum));
		if (ret)
			goto err;

		darray_for_each(target.inodes, i) {
			i->inode.bi_dir_offset = d->k.p.offset;
			ret = __bch2_fsck_write_inode(trans, &i->inode);
			if (ret)
				goto err;
		}
	}
err:
	inode_walker_exit(&target);
	return ret;
}
1025 
/*
 * Fetch the dirent that @inode's backpointer refers to. For a subvolume root
 * the parent subvolume is resolved first, updating *snapshot to the parent's
 * snapshot. The caller owns @iter.
 */
static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
					       struct btree_iter *iter,
					       struct bch_inode_unpacked *inode,
					       u32 *snapshot)
{
	if (inode->bi_subvol) {
		u64 inum;
		int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
		if (ret)
			return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) });
	}

	return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
}
1040 
1041 static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
1042 {
1043 	struct btree_iter iter;
1044 	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
1045 	int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
1046 	bch2_trans_iter_exit(trans, &iter);
1047 	return ret;
1048 }
1049 
/*
 * Verify that @inode's dirent backpointer refers to a dirent that points
 * back at it. If the dirent is missing or mismatched (with user/ratelimit
 * approval via fsck_err), clear the backpointer fields and set *write_inode
 * so the caller persists the change.
 */
static int check_inode_dirent_inode(struct btree_trans *trans,
				    struct bch_inode_unpacked *inode,
				    bool *write_inode)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;

	u32 inode_snapshot = inode->bi_snapshot;
	struct btree_iter dirent_iter = {};
	struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
	int ret = bkey_err(d);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (fsck_err_on(ret,
			trans, inode_points_to_missing_dirent,
			"inode points to missing dirent\n%s",
			(bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) ||
	    fsck_err_on(!ret && dirent_points_to_inode_nowarn(d, inode),
			trans, inode_points_to_wrong_dirent,
			"%s",
			(printbuf_reset(&buf),
			 dirent_inode_mismatch_msg(&buf, c, d, inode),
			 buf.buf))) {
		/*
		 * We just clear the backpointer fields for now. If we find a
		 * dirent that points to this inode in check_dirents(), we'll
		 * update it then; then when we get to check_path() if the
		 * backpointer is still 0 we'll reattach it.
		 */
		inode->bi_dir = 0;
		inode->bi_dir_offset = 0;
		*write_inode = true;
	}

	ret = 0;
fsck_err:
	bch2_trans_iter_exit(trans, &dirent_iter);
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}
1092 
/*
 * Find the "snapshot root" version of inode @inum: scanning in reverse
 * from snapshot U32_MAX, unpack the first inode key found into @root.
 * Other versions of the inode are checked against this one (hash seed,
 * hash type) in check_inode().
 *
 * BUGs if no inode exists at @inum - callers only pass inums they just
 * unpacked an inode key for.
 */
static int get_snapshot_root_inode(struct btree_trans *trans,
				   struct bch_inode_unpacked *root,
				   u64 inum)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes,
					     SPOS(0, inum, U32_MAX),
					     BTREE_ITER_all_snapshots, k, ret) {
		/* Ran off the end of this inode number without finding one: */
		if (k.k->p.offset != inum)
			break;
		if (bkey_is_inode(k.k))
			goto found_root;
	}
	if (ret)
		goto err;
	BUG();
found_root:
	ret = bch2_inode_unpack(k, root);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
1118 
/*
 * Recompute a directory's bi_size as the sum of dirent_occupied_size()
 * over its dirents in this snapshot; if it disagrees with the inode's
 * current bi_size, fix it in @inode_u and set *write_inode so the
 * caller writes it back.
 */
static int check_directory_size(struct btree_trans *trans,
				struct bch_inode_unpacked *inode_u,
				struct bkey_s_c inode_k, bool *write_inode)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	u64 new_size = 0;
	int ret;

	/* Walk every key in this directory (inode_k's offset is the inum): */
	for_each_btree_key_max_norestart(trans, iter, BTREE_ID_dirents,
			SPOS(inode_k.k->p.offset, 0, inode_k.k->p.snapshot),
			POS(inode_k.k->p.offset, U64_MAX),
			0, k, ret) {
		if (k.k->type != KEY_TYPE_dirent)
			continue;

		struct bkey_s_c_dirent dirent = bkey_s_c_to_dirent(k);
		struct qstr name = bch2_dirent_get_name(dirent);

		new_size += dirent_occupied_size(&name);
	}
	bch2_trans_iter_exit(trans, &iter);

	if (!ret && inode_u->bi_size != new_size) {
		inode_u->bi_size = new_size;
		*write_inode = true;
	}

	return ret;
}
1149 
/*
 * Check and repair a single inode key.  Verifies, in order:
 *  - the key's snapshot is valid
 *  - hash info matches the snapshot root version of this inode
 *  - the dirent backpointer (bi_dir/bi_dir_offset) is consistent
 *  - the unlinked flag agrees with having a dirent
 *  - unlinked directories are empty
 *  - the has_child_snapshot flag is correct
 *  - unlinked inodes are on the deleted list (offline) or get deleted
 *    if not open (online)
 *  - subvolume cross-links (bi_subvol/bi_parent_subvol) are sane
 *  - bi_journal_seq is not in the future
 *  - directory bi_size matches the dirents
 * Accumulates fixes in @u/do_update and writes the inode once at the end.
 */
static int check_inode(struct btree_trans *trans,
		       struct btree_iter *iter,
		       struct bkey_s_c k,
		       struct bch_inode_unpacked *snapshot_root,
		       struct snapshots_seen *s)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	struct bch_inode_unpacked u;
	bool do_update = false;
	int ret;

	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret < 0)
		goto err;
	if (ret)
		return 0;

	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
	if (ret)
		goto err;

	if (!bkey_is_inode(k.k))
		return 0;

	ret = bch2_inode_unpack(k, &u);
	if (ret)
		goto err;

	/* Cached snapshot root is for a different inum - refresh it: */
	if (snapshot_root->bi_inum != u.bi_inum) {
		ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum);
		if (ret)
			goto err;
	}

	/* All versions of an inode must share hash seed and hash type: */
	if (fsck_err_on(u.bi_hash_seed		!= snapshot_root->bi_hash_seed ||
			INODE_STR_HASH(&u)	!= INODE_STR_HASH(snapshot_root),
			trans, inode_snapshot_mismatch,
			"inode hash info in different snapshots don't match")) {
		u.bi_hash_seed = snapshot_root->bi_hash_seed;
		SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root));
		do_update = true;
	}

	if (u.bi_dir || u.bi_dir_offset) {
		ret = check_inode_dirent_inode(trans, &u, &do_update);
		if (ret)
			goto err;
	}

	if (fsck_err_on(u.bi_dir && (u.bi_flags & BCH_INODE_unlinked),
			trans, inode_unlinked_but_has_dirent,
			"inode unlinked but has dirent\n%s",
			(printbuf_reset(&buf),
			 bch2_inode_unpacked_to_text(&buf, &u),
			 buf.buf))) {
		u.bi_flags &= ~BCH_INODE_unlinked;
		do_update = true;
	}

	if (S_ISDIR(u.bi_mode) && (u.bi_flags & BCH_INODE_unlinked)) {
		/* Check for this early so that check_unreachable_inode() will reattach it */

		ret = bch2_empty_dir_snapshot(trans, k.k->p.offset, 0, k.k->p.snapshot);
		if (ret && ret != -BCH_ERR_ENOTEMPTY_dir_not_empty)
			goto err;

		/*
		 * NOTE(review): the fsck_err_on() answer is ignored and the
		 * unlinked flag is cleared unconditionally - presumably so the
		 * reattach machinery can handle the directory either way;
		 * confirm intent.
		 */
		fsck_err_on(ret, trans, inode_dir_unlinked_but_not_empty,
			    "dir unlinked but not empty\n%s",
			    (printbuf_reset(&buf),
			     bch2_inode_unpacked_to_text(&buf, &u),
			     buf.buf));
		u.bi_flags &= ~BCH_INODE_unlinked;
		do_update = true;
		ret = 0;
	}

	ret = bch2_inode_has_child_snapshots(trans, k.k->p);
	if (ret < 0)
		goto err;

	/* ret here is 0/1: whether child snapshot versions actually exist: */
	if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot),
			trans, inode_has_child_snapshots_wrong,
			"inode has_child_snapshots flag wrong (should be %u)\n%s",
			ret,
			(printbuf_reset(&buf),
			 bch2_inode_unpacked_to_text(&buf, &u),
			 buf.buf))) {
		if (ret)
			u.bi_flags |= BCH_INODE_has_child_snapshot;
		else
			u.bi_flags &= ~BCH_INODE_has_child_snapshot;
		do_update = true;
	}
	ret = 0;

	if ((u.bi_flags & BCH_INODE_unlinked) &&
	    !(u.bi_flags & BCH_INODE_has_child_snapshot)) {
		if (!test_bit(BCH_FS_started, &c->flags)) {
			/*
			 * If we're not in online fsck, don't delete unlinked
			 * inodes, just make sure they're on the deleted list.
			 *
			 * They might be referred to by a logged operation -
			 * i.e. we might have crashed in the middle of a
			 * truncate on an unlinked but open file - so we want to
			 * let the delete_dead_inodes kill it after resuming
			 * logged ops.
			 */
			ret = check_inode_deleted_list(trans, k.k->p);
			if (ret < 0)
				goto err_noprint;

			fsck_err_on(!ret,
				    trans, unlinked_inode_not_on_deleted_list,
				    "inode %llu:%u unlinked, but not on deleted list",
				    u.bi_inum, k.k->p.snapshot);

			ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, k.k->p, 1);
			if (ret)
				goto err;
		} else {
			/* Online fsck: safe to delete if nothing has it open: */
			ret = bch2_inode_or_descendents_is_open(trans, k.k->p);
			if (ret < 0)
				goto err;

			if (fsck_err_on(!ret,
					trans, inode_unlinked_and_not_open,
				      "inode %llu:%u unlinked and not open",
				      u.bi_inum, u.bi_snapshot)) {
				ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
				bch_err_msg(c, ret, "in fsck deleting inode");
				goto err_noprint;
			}
			ret = 0;
		}
	}

	/* Only subvolume root inodes may carry a parent subvol: */
	if (fsck_err_on(u.bi_parent_subvol &&
			(u.bi_subvol == 0 ||
			 u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
			trans, inode_bi_parent_nonzero,
			"inode %llu:%u has subvol %u but nonzero parent subvol %u",
			u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
		u.bi_parent_subvol = 0;
		do_update = true;
	}

	if (u.bi_subvol) {
		struct bch_subvolume s;

		ret = bch2_subvolume_get(trans, u.bi_subvol, false, &s);
		if (ret && !bch2_err_matches(ret, ENOENT))
			goto err;

		/* Subvolume btree was lost - rebuild the missing subvolume: */
		if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
			ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum);
			goto do_update;
		}

		if (fsck_err_on(ret,
				trans, inode_bi_subvol_missing,
				"inode %llu:%u bi_subvol points to missing subvolume %u",
				u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
		    fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
				!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
							   k.k->p.snapshot),
				trans, inode_bi_subvol_wrong,
				"inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
				u.bi_inum, k.k->p.snapshot, u.bi_subvol,
				le64_to_cpu(s.inode),
				le32_to_cpu(s.snapshot))) {
			u.bi_subvol = 0;
			u.bi_parent_subvol = 0;
			do_update = true;
		}
	}

	if (fsck_err_on(u.bi_journal_seq > journal_cur_seq(&c->journal),
			trans, inode_journal_seq_in_future,
			"inode journal seq in future (currently at %llu)\n%s",
			journal_cur_seq(&c->journal),
			(printbuf_reset(&buf),
			 bch2_inode_unpacked_to_text(&buf, &u),
			buf.buf))) {
		u.bi_journal_seq = journal_cur_seq(&c->journal);
		do_update = true;
	}

	if (S_ISDIR(u.bi_mode)) {
		ret = check_directory_size(trans, &u, k, &do_update);

		fsck_err_on(ret,
			    trans, directory_size_mismatch,
			    "directory inode %llu:%u with the mismatch directory size",
			    u.bi_inum, k.k->p.snapshot);
		ret = 0;
	}
do_update:
	if (do_update) {
		ret = __bch2_fsck_write_inode(trans, &u);
		bch_err_msg(c, ret, "in fsck updating inode");
		if (ret)
			goto err_noprint;
	}
err:
fsck_err:
	bch_err_fn(c, ret);
err_noprint:
	printbuf_exit(&buf);
	return ret;
}
1362 
/*
 * Walk every inode key in every snapshot and run check_inode() on it,
 * committing repairs as we go.  The snapshot-root inode and
 * snapshots-seen state are threaded through all iterations.
 */
int bch2_check_inodes(struct bch_fs *c)
{
	struct bch_inode_unpacked snapshot_root = {};
	struct snapshots_seen s;

	snapshots_seen_init(&s);

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
				POS_MIN,
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_inode(trans, &iter, k, &snapshot_root, &s)));

	snapshots_seen_exit(&s);
	bch_err_fn(c, ret);
	return ret;
}
1381 
/*
 * Starting just past @inode's snapshot, walk other versions of the same
 * inum and advance *inode to the oldest version that still needs
 * reattaching; stops at the first version that is unrelated, missing,
 * or no longer needs reattach.
 */
static int find_oldest_inode_needs_reattach(struct btree_trans *trans,
					    struct bch_inode_unpacked *inode)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	/*
	 * We look for inodes to reattach in natural key order, leaves first,
	 * but we should do the reattach at the oldest version that needs to be
	 * reattached:
	 */
	for_each_btree_key_norestart(trans, iter,
				     BTREE_ID_inodes,
				     SPOS(0, inode->bi_inum, inode->bi_snapshot + 1),
				     BTREE_ITER_all_snapshots, k, ret) {
		if (k.k->p.offset != inode->bi_inum)
			break;

		/* Skip snapshots unrelated to the one we started from: */
		if (!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, k.k->p.snapshot))
			continue;

		if (!bkey_is_inode(k.k))
			break;

		struct bch_inode_unpacked parent_inode;
		ret = bch2_inode_unpack(k, &parent_inode);
		if (ret)
			break;

		if (!inode_should_reattach(&parent_inode))
			break;

		/* This version is older and also needs reattach - use it: */
		*inode = parent_inode;
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}
1422 
/*
 * If the inode at @k is unreachable (needs reattaching), find the
 * oldest version that needs it and, with the user's confirmation,
 * reattach it via reattach_inode().
 */
static int check_unreachable_inode(struct btree_trans *trans,
				   struct btree_iter *iter,
				   struct bkey_s_c k)
{
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (!bkey_is_inode(k.k))
		return 0;

	struct bch_inode_unpacked inode;
	ret = bch2_inode_unpack(k, &inode);
	if (ret)
		return ret;

	if (!inode_should_reattach(&inode))
		return 0;

	/* Reattach at the oldest version needing it, not necessarily this one: */
	ret = find_oldest_inode_needs_reattach(trans, &inode);
	if (ret)
		return ret;

	if (fsck_err(trans, inode_unreachable,
		     "unreachable inode:\n%s",
		     (bch2_inode_unpacked_to_text(&buf, &inode),
		      buf.buf)))
		ret = reattach_inode(trans, &inode);
fsck_err:
	printbuf_exit(&buf);
	return ret;
}
1454 
1455 /*
1456  * Reattach unreachable (but not unlinked) inodes
1457  *
 * Run after check_inodes() and check_dirents(), so we know that inode
1459  * backpointer fields point to valid dirents, and every inode that has a dirent
1460  * that points to it has its backpointer field set - so we're just looking for
1461  * non-unlinked inodes without backpointers:
1462  *
1463  * XXX: this is racy w.r.t. hardlink removal in online fsck
1464  */
int bch2_check_unreachable_inodes(struct bch_fs *c)
{
	/* Walk all inodes in all snapshots, reattaching any that need it: */
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
				POS_MIN,
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_unreachable_inode(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}
1476 
1477 static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode)
1478 {
1479 	switch (btree) {
1480 	case BTREE_ID_extents:
1481 		return S_ISREG(mode) || S_ISLNK(mode);
1482 	case BTREE_ID_dirents:
1483 		return S_ISDIR(mode);
1484 	case BTREE_ID_xattrs:
1485 		return true;
1486 	default:
1487 		BUG();
1488 	}
1489 }
1490 
/*
 * Verify that key @k (an extent/dirent/xattr) belongs to an existing
 * inode of a compatible file type; @i is the inode walker entry for
 * this key's snapshot, or NULL (or an ERR_PTR) if none was found.
 *
 * Missing inodes are either reconstructed (if the inode btree was
 * lost) or the key is deleted; keys under an inode of the wrong type
 * are deleted too.
 */
static int check_key_has_inode(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct inode_walker *inode,
			       struct inode_walker_entry *i,
			       struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = PTR_ERR_OR_ZERO(i);
	if (ret)
		return ret;

	if (k.k->type == KEY_TYPE_whiteout)
		goto out;

	/* Inode btree was lost: recreate the inode from this key: */
	if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
		ret =   reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?:
			bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
		if (ret)
			goto err;

		/* Force the walker to re-read this inum on restart: */
		inode->last_pos.inode--;
		ret = -BCH_ERR_transaction_restart_nested;
		goto err;
	}

	if (fsck_err_on(!i,
			trans, key_in_missing_inode,
			"key in missing inode:\n  %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
		goto delete;

	if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode),
			trans, key_in_wrong_inode_type,
			"key for wrong inode mode %o:\n  %s",
			i->inode.bi_mode,
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
		goto delete;
out:
err:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
delete:
	ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
	goto out;
}
1541 
/*
 * After walking all of an inode's extents, compare the i_sectors count
 * we accumulated (i->count) against each inode version's bi_sectors,
 * fixing the inode when they disagree.  The accumulated count is
 * cross-checked against a fresh bch2_count_inode_sectors() pass.
 */
static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
{
	struct bch_fs *c = trans->c;
	int ret = 0;
	s64 count2;

	darray_for_each(w->inodes, i) {
		if (i->inode.bi_sectors == i->count)
			continue;

		count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);

		/* Overlapping-extent repair invalidated our running count: */
		if (w->recalculate_sums)
			i->count = count2;

		if (i->count != count2) {
			bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
					    w->last_pos.inode, i->snapshot, i->count, count2);
			i->count = count2;
		}

		if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
				trans, inode_i_sectors_wrong,
				"inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
				w->last_pos.inode, i->snapshot,
				i->inode.bi_sectors, i->count)) {
			i->inode.bi_sectors = i->count;
			ret = bch2_fsck_write_inode(trans, &i->inode);
			if (ret)
				break;
		}
	}
fsck_err:
	bch_err_fn(c, ret);
	return ret;
}
1578 
1579 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
1580 {
1581 	u32 restart_count = trans->restart_count;
1582 	return check_i_sectors_notnested(trans, w) ?:
1583 		trans_was_restarted(trans, restart_count);
1584 }
1585 
/*
 * Where an extent ends in one snapshot, plus a private copy of the
 * snapshots-seen state at that point, used for overlap detection:
 */
struct extent_end {
	u32			snapshot;
	u64			offset;
	struct snapshots_seen	seen;
};
1591 
/*
 * Per-inode set of extent end positions, one entry per snapshot, kept
 * sorted by snapshot ID (see extent_ends_at()):
 */
struct extent_ends {
	struct bpos			last_pos;
	DARRAY(struct extent_end)	e;
};
1596 
/* Free each entry's seen list and empty the array, keeping its backing storage: */
static void extent_ends_reset(struct extent_ends *extent_ends)
{
	darray_for_each(extent_ends->e, i)
		snapshots_seen_exit(&i->seen);
	extent_ends->e.nr = 0;
}
1603 
/* Fully tear down @extent_ends, including the darray's backing storage: */
static void extent_ends_exit(struct extent_ends *extent_ends)
{
	extent_ends_reset(extent_ends);
	darray_exit(&extent_ends->e);
}
1609 
/* Zero-initialize; nothing is allocated until extent_ends_at() is called: */
static void extent_ends_init(struct extent_ends *extent_ends)
{
	memset(extent_ends, 0, sizeof(*extent_ends));
}
1614 
1615 static int extent_ends_at(struct bch_fs *c,
1616 			  struct extent_ends *extent_ends,
1617 			  struct snapshots_seen *seen,
1618 			  struct bkey_s_c k)
1619 {
1620 	struct extent_end *i, n = (struct extent_end) {
1621 		.offset		= k.k->p.offset,
1622 		.snapshot	= k.k->p.snapshot,
1623 		.seen		= *seen,
1624 	};
1625 
1626 	n.seen.ids.data = kmemdup(seen->ids.data,
1627 			      sizeof(seen->ids.data[0]) * seen->ids.size,
1628 			      GFP_KERNEL);
1629 	if (!n.seen.ids.data)
1630 		return -BCH_ERR_ENOMEM_fsck_extent_ends_at;
1631 
1632 	__darray_for_each(extent_ends->e, i) {
1633 		if (i->snapshot == k.k->p.snapshot) {
1634 			snapshots_seen_exit(&i->seen);
1635 			*i = n;
1636 			return 0;
1637 		}
1638 
1639 		if (i->snapshot >= k.k->p.snapshot)
1640 			break;
1641 	}
1642 
1643 	return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
1644 }
1645 
/*
 * A pair of overlapping extents was detected: the extent ending at
 * @pos1 (recorded earlier) overlaps the extent with key @pos2 (the one
 * we just walked onto).  Re-look both up with non-extent iterators,
 * show them to the user, and on confirmation resolve the overlap with
 * bch2_trans_update_extent_overwrite(); the extent in the newer
 * snapshot wins.  @fixed is set when a repair happened so the caller
 * knows i_sectors counts must be recalculated.
 */
static int overlapping_extents_found(struct btree_trans *trans,
				     enum btree_id btree,
				     struct bpos pos1, struct snapshots_seen *pos1_seen,
				     struct bkey pos2,
				     bool *fixed,
				     struct extent_end *extent_end)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	struct btree_iter iter1, iter2 = { NULL };
	struct bkey_s_c k1, k2;
	int ret;

	/* By construction, pos1 must end after pos2 starts: */
	BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));

	bch2_trans_iter_init(trans, &iter1, btree, pos1,
			     BTREE_ITER_all_snapshots|
			     BTREE_ITER_not_extents);
	k1 = bch2_btree_iter_peek_max(&iter1, POS(pos1.inode, U64_MAX));
	ret = bkey_err(k1);
	if (ret)
		goto err;

	prt_str(&buf, "\n  ");
	bch2_bkey_val_to_text(&buf, c, k1);

	/* The first extent must still be where we recorded it: */
	if (!bpos_eq(pos1, k1.k->p)) {
		prt_str(&buf, "\n  wanted\n  ");
		bch2_bpos_to_text(&buf, pos1);
		prt_str(&buf, "\n  ");
		bch2_bkey_to_text(&buf, &pos2);

		bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
			__func__, buf.buf);
		ret = -BCH_ERR_internal_fsck_err;
		goto err;
	}

	bch2_trans_copy_iter(&iter2, &iter1);

	/* Advance a second iterator to the second extent of the pair: */
	while (1) {
		bch2_btree_iter_advance(&iter2);

		k2 = bch2_btree_iter_peek_max(&iter2, POS(pos1.inode, U64_MAX));
		ret = bkey_err(k2);
		if (ret)
			goto err;

		if (bpos_ge(k2.k->p, pos2.p))
			break;
	}

	prt_str(&buf, "\n  ");
	bch2_bkey_val_to_text(&buf, c, k2);

	if (bpos_gt(k2.k->p, pos2.p) ||
	    pos2.size != k2.k->size) {
		bch_err(c, "%s: error finding seconding overlapping extent when repairing%s",
			__func__, buf.buf);
		ret = -BCH_ERR_internal_fsck_err;
		goto err;
	}

	prt_printf(&buf, "\n  overwriting %s extent",
		   pos1.snapshot >= pos2.p.snapshot ? "first" : "second");

	if (fsck_err(trans, extent_overlapping,
		     "overlapping extents%s", buf.buf)) {
		/* Overwrite whichever extent is in the older snapshot: */
		struct btree_iter *old_iter = &iter1;
		struct disk_reservation res = { 0 };

		if (pos1.snapshot < pos2.p.snapshot) {
			old_iter = &iter2;
			swap(k1, k2);
		}

		trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);

		ret =   bch2_trans_update_extent_overwrite(trans, old_iter,
				BTREE_UPDATE_internal_snapshot_node,
				k1, k2) ?:
			bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
		bch2_disk_reservation_put(c, &res);

		if (ret)
			goto err;

		*fixed = true;

		if (pos1.snapshot == pos2.p.snapshot) {
			/*
			 * We overwrote the first extent, and did the overwrite
			 * in the same snapshot:
			 */
			extent_end->offset = bkey_start_offset(&pos2);
		} else if (pos1.snapshot > pos2.p.snapshot) {
			/*
			 * We overwrote the first extent in pos2's snapshot:
			 */
			ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
		} else {
			/*
			 * We overwrote the second extent - restart
			 * check_extent() from the top:
			 */
			ret = -BCH_ERR_transaction_restart_nested;
		}
	}
fsck_err:
err:
	bch2_trans_iter_exit(trans, &iter2);
	bch2_trans_iter_exit(trans, &iter1);
	printbuf_exit(&buf);
	return ret;
}
1761 
/*
 * Check extent @k against all previously recorded extent ends for this
 * inode: any recorded end past k's start, in a snapshot visible from
 * k's snapshot, is an overlap and gets repaired via
 * overlapping_extents_found().
 */
static int check_overlapping_extents(struct btree_trans *trans,
			      struct snapshots_seen *seen,
			      struct extent_ends *extent_ends,
			      struct bkey_s_c k,
			      struct btree_iter *iter,
			      bool *fixed)
{
	struct bch_fs *c = trans->c;
	int ret = 0;

	/* transaction restart, running again */
	if (bpos_eq(extent_ends->last_pos, k.k->p))
		return 0;

	/* New inode: previously recorded ends no longer apply: */
	if (extent_ends->last_pos.inode != k.k->p.inode)
		extent_ends_reset(extent_ends);

	darray_for_each(extent_ends->e, i) {
		if (i->offset <= bkey_start_offset(k.k))
			continue;

		if (!ref_visible2(c,
				  k.k->p.snapshot, seen,
				  i->snapshot, &i->seen))
			continue;

		ret = overlapping_extents_found(trans, iter->btree_id,
						SPOS(iter->pos.inode,
						     i->offset,
						     i->snapshot),
						&i->seen,
						*k.k, fixed, i);
		if (ret)
			goto err;
	}

	extent_ends->last_pos = k.k->p;
err:
	return ret;
}
1802 
1803 static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
1804 				struct bkey_s_c k)
1805 {
1806 	struct bch_fs *c = trans->c;
1807 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1808 	struct bch_extent_crc_unpacked crc;
1809 	const union bch_extent_entry *i;
1810 	unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;
1811 
1812 	bkey_for_each_crc(k.k, ptrs, crc, i)
1813 		if (crc_is_encoded(crc) &&
1814 		    crc.uncompressed_size > encoded_extent_max_sectors) {
1815 			struct printbuf buf = PRINTBUF;
1816 
1817 			bch2_bkey_val_to_text(&buf, c, k);
1818 			bch_err(c, "overbig encoded extent, please report this:\n  %s", buf.buf);
1819 			printbuf_exit(&buf);
1820 		}
1821 
1822 	return 0;
1823 }
1824 
/*
 * Check a single extent key: snapshot validity, owning inode existence
 * and type, overlaps with previously seen extents, extents past
 * i_size, and accumulate i_sectors counts into the inode walker.
 * Commits its own updates.
 */
static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
			struct bkey_s_c k,
			struct inode_walker *inode,
			struct snapshots_seen *s,
			struct extent_ends *extent_ends,
			struct disk_reservation *res)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret) {
		ret = ret < 0 ? ret : 0;
		goto out;
	}

	/* Moved on to a new inode: finish i_sectors checks for the last one: */
	if (inode->last_pos.inode != k.k->p.inode && inode->have_inodes) {
		ret = check_i_sectors(trans, inode);
		if (ret)
			goto err;
	}

	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
	if (ret)
		goto err;

	struct inode_walker_entry *extent_i = walk_inode(trans, inode, k);
	ret = PTR_ERR_OR_ZERO(extent_i);
	if (ret)
		goto err;

	ret = check_key_has_inode(trans, iter, inode, extent_i, k);
	if (ret)
		goto err;

	if (k.k->type != KEY_TYPE_whiteout) {
		ret = check_overlapping_extents(trans, s, extent_ends, k, iter,
						&inode->recalculate_sums);
		if (ret)
			goto err;

		/*
		 * Check inodes in reverse order, from oldest snapshots to
		 * newest, starting from the inode that matches this extent's
		 * snapshot. If we didn't have one, iterate over all inodes:
		 */
		for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
		     inode->inodes.data && i >= inode->inodes.data;
		     --i) {
			if (i->snapshot > k.k->p.snapshot ||
			    !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
				continue;

			/* Delete (in i's snapshot) extents past end of file: */
			if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
					!bkey_extent_is_reservation(k),
					trans, extent_past_end_of_inode,
					"extent type past end of inode %llu:%u, i_size %llu\n  %s",
					i->inode.bi_inum, i->snapshot, i->inode.bi_size,
					(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
				struct btree_iter iter2;

				bch2_trans_copy_iter(&iter2, iter);
				bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
				ret =   bch2_btree_iter_traverse(&iter2) ?:
					bch2_btree_delete_at(trans, &iter2,
						BTREE_UPDATE_internal_snapshot_node);
				bch2_trans_iter_exit(trans, &iter2);
				if (ret)
					goto err;

				/* Treat the deleted extent as a whiteout below: */
				iter->k.type = KEY_TYPE_whiteout;
				break;
			}
		}
	}

	ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc);
	if (ret)
		goto err;

	/* Accumulate sector counts for every inode version that sees this extent: */
	if (bkey_extent_is_allocation(k.k)) {
		for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
		     inode->inodes.data && i >= inode->inodes.data;
		     --i) {
			if (i->snapshot > k.k->p.snapshot ||
			    !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
				continue;

			i->count += k.k->size;
		}
	}

	/* Remember where this extent ended, for future overlap checks: */
	if (k.k->type != KEY_TYPE_whiteout) {
		ret = extent_ends_at(c, extent_ends, s, k);
		if (ret)
			goto err;
	}
out:
err:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}
1930 
1931 /*
1932  * Walk extents: verify that extents have a corresponding S_ISREG inode, and
 * that i_size and i_sectors are consistent
1934  */
int bch2_check_extents(struct bch_fs *c)
{
	struct inode_walker w = inode_walker_init();
	struct snapshots_seen s;
	struct extent_ends extent_ends;
	struct disk_reservation res = { 0 };

	snapshots_seen_init(&s);
	extent_ends_init(&extent_ends);

	/*
	 * Walk all extents in all snapshots; each key gets the full
	 * check_extent() treatment plus an overbig-extent check.  A final
	 * check_i_sectors_notnested() pass covers the last inode walked.
	 */
	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_extents,
				POS(BCACHEFS_ROOT_INO, 0),
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
			bch2_disk_reservation_put(c, &res);
			check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?:
			check_extent_overbig(trans, &iter, k);
		})) ?:
		check_i_sectors_notnested(trans, &w));

	bch2_disk_reservation_put(c, &res);
	extent_ends_exit(&extent_ends);
	inode_walker_exit(&w);
	snapshots_seen_exit(&s);

	bch_err_fn(c, ret);
	return ret;
}
1963 
/* Walk the reflink btree, warning about overbig encoded extents: */
int bch2_check_indirect_extents(struct bch_fs *c)
{
	struct disk_reservation res = { 0 };

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
				POS_MIN,
				BTREE_ITER_prefetch, k,
				&res, NULL,
				BCH_TRANS_COMMIT_no_enospc, ({
			bch2_disk_reservation_put(c, &res);
			check_extent_overbig(trans, &iter, k);
		})));

	bch2_disk_reservation_put(c, &res);
	bch_err_fn(c, ret);
	return ret;
}
1982 
/*
 * After walking all of a directory's dirents, compare the subdirectory
 * count we accumulated (i->count) against each inode version's
 * bi_nlink, fixing the inode when they disagree.  The accumulated
 * count is cross-checked against a fresh bch2_count_subdirs() pass.
 */
static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w)
{
	struct bch_fs *c = trans->c;
	int ret = 0;
	s64 count2;

	darray_for_each(w->inodes, i) {
		if (i->inode.bi_nlink == i->count)
			continue;

		count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot);
		if (count2 < 0)
			return count2;

		if (i->count != count2) {
			bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
					    w->last_pos.inode, i->snapshot, i->count, count2);
			i->count = count2;
			if (i->inode.bi_nlink == i->count)
				continue;
		}

		if (fsck_err_on(i->inode.bi_nlink != i->count,
				trans, inode_dir_wrong_nlink,
				"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
				w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
			i->inode.bi_nlink = i->count;
			ret = bch2_fsck_write_inode(trans, &i->inode);
			if (ret)
				break;
		}
	}
fsck_err:
	bch_err_fn(c, ret);
	return ret;
}
2019 
2020 static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
2021 {
2022 	u32 restart_count = trans->restart_count;
2023 	return check_subdir_count_notnested(trans, w) ?:
2024 		trans_was_restarted(trans, restart_count);
2025 }
2026 
/*
 * Verify that the inode @target, which dirent @d points to, has a backpointer
 * (bi_dir/bi_dir_offset) pointing back at a dirent that points to it;
 * create, repoint, or delete as needed.
 *
 * Handles: inodes with no backpointer at all, backpointers pointing at a
 * nonexistent dirent, multiple links to directories/subvolumes (illegal),
 * and hardlinked files with bi_nlink == 0.
 */
noinline_for_stack
static int check_dirent_inode_dirent(struct btree_trans *trans,
				   struct btree_iter *iter,
				   struct bkey_s_c_dirent d,
				   struct bch_inode_unpacked *target)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	struct btree_iter bp_iter = { NULL };
	int ret = 0;

	/* Nothing to do if the inode already points back at this dirent: */
	if (inode_points_to_dirent(target, d))
		return 0;

	if (!target->bi_dir &&
	    !target->bi_dir_offset) {
		/*
		 * Inode has no backpointer at all: adopt this dirent as its
		 * backpointer.  Directories are always expected to have one,
		 * and an inode that has a dirent can't be unlinked:
		 */
		fsck_err_on(S_ISDIR(target->bi_mode),
			    trans, inode_dir_missing_backpointer,
			    "directory with missing backpointer\n%s",
			    (printbuf_reset(&buf),
			     bch2_bkey_val_to_text(&buf, c, d.s_c),
			     prt_printf(&buf, "\n"),
			     bch2_inode_unpacked_to_text(&buf, target),
			     buf.buf));

		fsck_err_on(target->bi_flags & BCH_INODE_unlinked,
			    trans, inode_unlinked_but_has_dirent,
			    "inode unlinked but has dirent\n%s",
			    (printbuf_reset(&buf),
			     bch2_bkey_val_to_text(&buf, c, d.s_c),
			     prt_printf(&buf, "\n"),
			     bch2_inode_unpacked_to_text(&buf, target),
			     buf.buf));

		target->bi_flags &= ~BCH_INODE_unlinked;
		target->bi_dir		= d.k->p.inode;
		target->bi_dir_offset	= d.k->p.offset;
		return __bch2_fsck_write_inode(trans, target);
	}

	/*
	 * Inode has a backpointer but it doesn't point back at us; if it's
	 * the kind of inode that should only have a single backpointer, ask
	 * before repairing (bail out if the user declines):
	 */
	if (bch2_inode_should_have_single_bp(target) &&
	    !fsck_err(trans, inode_wrong_backpointer,
		      "dirent points to inode that does not point back:\n  %s",
		      (bch2_bkey_val_to_text(&buf, c, d.s_c),
		       prt_printf(&buf, "\n  "),
		       bch2_inode_unpacked_to_text(&buf, target),
		       buf.buf)))
		goto err;

	/* Look up the dirent the inode's backpointer claims to point at: */
	struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
			      SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot));
	ret = bkey_err(bp_dirent);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	bool backpointer_exists = !ret;
	ret = 0;

	/* Backpointer points at nothing: repoint it at this dirent: */
	if (fsck_err_on(!backpointer_exists,
			trans, inode_wrong_backpointer,
			"inode %llu:%u has wrong backpointer:\n"
			"got       %llu:%llu\n"
			"should be %llu:%llu",
			target->bi_inum, target->bi_snapshot,
			target->bi_dir,
			target->bi_dir_offset,
			d.k->p.inode,
			d.k->p.offset)) {
		target->bi_dir		= d.k->p.inode;
		target->bi_dir_offset	= d.k->p.offset;
		ret = __bch2_fsck_write_inode(trans, target);
		goto out;
	}

	bch2_bkey_val_to_text(&buf, c, d.s_c);
	prt_newline(&buf);
	if (backpointer_exists)
		bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);

	/*
	 * Directories and subvolumes can't be hardlinked: the dirent the
	 * backpointer points at wins, delete this one:
	 */
	if (fsck_err_on(backpointer_exists &&
			(S_ISDIR(target->bi_mode) ||
			 target->bi_subvol),
			trans, inode_dir_multiple_links,
			"%s %llu:%u with multiple links\n%s",
			S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
			target->bi_inum, target->bi_snapshot, buf.buf)) {
		ret = __remove_dirent(trans, d.k->p);
		goto out;
	}

	/*
	 * hardlinked file with nlink 0:
	 * We're just adjusting nlink here so check_nlinks() will pick
	 * it up, it ignores inodes with nlink 0
	 */
	if (fsck_err_on(backpointer_exists && !target->bi_nlink,
			trans, inode_multiple_links_but_nlink_0,
			"inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
			target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
		target->bi_nlink++;
		target->bi_flags &= ~BCH_INODE_unlinked;
		ret = __bch2_fsck_write_inode(trans, target);
		if (ret)
			goto err;
	}
out:
err:
fsck_err:
	bch2_trans_iter_exit(trans, &bp_iter);
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}
2140 
/*
 * Check dirent @d against the inode @target it points to: first verify the
 * inode <-> dirent backpointer relationship, then fix the dirent's d_type
 * (and, for subvolumes, the parent/child subvol fields) if they disagree
 * with the inode.
 */
noinline_for_stack
static int check_dirent_target(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct bkey_s_c_dirent d,
			       struct bch_inode_unpacked *target)
{
	struct bch_fs *c = trans->c;
	struct bkey_i_dirent *n;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	ret = check_dirent_inode_dirent(trans, iter, d, target);
	if (ret)
		goto err;

	if (fsck_err_on(d.v->d_type != inode_d_type(target),
			trans, dirent_d_type_wrong,
			"incorrect d_type: got %s, should be %s:\n%s",
			bch2_d_type_str(d.v->d_type),
			bch2_d_type_str(inode_d_type(target)),
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
		/* Rewrite the dirent with d_type taken from the inode: */
		n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
		ret = PTR_ERR_OR_ZERO(n);
		if (ret)
			goto err;

		bkey_reassemble(&n->k_i, d.s_c);
		n->v.d_type = inode_d_type(target);
		if (n->v.d_type == DT_SUBVOL) {
			n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
			n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
		} else {
			n->v.d_inum = cpu_to_le64(target->bi_inum);
		}

		ret = bch2_trans_update(trans, iter, &n->k_i, 0);
		if (ret)
			goto err;

		/* Hand the updated dirent back to the caller via @d: */
		d = dirent_i_to_s_c(n);
	}
err:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}
2189 
2190 /* find a subvolume that's a descendent of @snapshot: */
2191 static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
2192 {
2193 	struct btree_iter iter;
2194 	struct bkey_s_c k;
2195 	int ret;
2196 
2197 	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) {
2198 		if (k.k->type != KEY_TYPE_subvolume)
2199 			continue;
2200 
2201 		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
2202 		if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) {
2203 			bch2_trans_iter_exit(trans, &iter);
2204 			*subvolid = k.k->p.offset;
2205 			goto found;
2206 		}
2207 	}
2208 	if (!ret)
2209 		ret = -ENOENT;
2210 found:
2211 	bch2_trans_iter_exit(trans, &iter);
2212 	return ret;
2213 }
2214 
2215 noinline_for_stack
2216 static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
2217 				  struct bkey_s_c_dirent d)
2218 {
2219 	struct bch_fs *c = trans->c;
2220 	struct btree_iter subvol_iter = {};
2221 	struct bch_inode_unpacked subvol_root;
2222 	u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
2223 	u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
2224 	u32 parent_snapshot;
2225 	u32 new_parent_subvol = 0;
2226 	u64 parent_inum;
2227 	struct printbuf buf = PRINTBUF;
2228 	int ret = 0;
2229 
2230 	ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum);
2231 	if (ret && !bch2_err_matches(ret, ENOENT))
2232 		return ret;
2233 
2234 	if (ret ||
2235 	    (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) {
2236 		int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
2237 		if (ret2 && !bch2_err_matches(ret, ENOENT))
2238 			return ret2;
2239 	}
2240 
2241 	if (ret &&
2242 	    !new_parent_subvol &&
2243 	    (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
2244 		/*
2245 		 * Couldn't find a subvol for dirent's snapshot - but we lost
2246 		 * subvols, so we need to reconstruct:
2247 		 */
2248 		ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0);
2249 		if (ret)
2250 			return ret;
2251 
2252 		parent_snapshot = d.k->p.snapshot;
2253 	}
2254 
2255 	if (fsck_err_on(ret,
2256 			trans, dirent_to_missing_parent_subvol,
2257 			"dirent parent_subvol points to missing subvolume\n%s",
2258 			(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
2259 	    fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
2260 			trans, dirent_not_visible_in_parent_subvol,
2261 			"dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
2262 			parent_snapshot,
2263 			(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
2264 		if (!new_parent_subvol) {
2265 			bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
2266 			return -BCH_ERR_fsck_repair_unimplemented;
2267 		}
2268 
2269 		struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
2270 		ret = PTR_ERR_OR_ZERO(new_dirent);
2271 		if (ret)
2272 			goto err;
2273 
2274 		new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol);
2275 	}
2276 
2277 	struct bkey_s_c_subvolume s =
2278 		bch2_bkey_get_iter_typed(trans, &subvol_iter,
2279 					 BTREE_ID_subvolumes, POS(0, target_subvol),
2280 					 0, subvolume);
2281 	ret = bkey_err(s.s_c);
2282 	if (ret && !bch2_err_matches(ret, ENOENT))
2283 		return ret;
2284 
2285 	if (ret) {
2286 		if (fsck_err(trans, dirent_to_missing_subvol,
2287 			     "dirent points to missing subvolume\n%s",
2288 			     (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
2289 			return __remove_dirent(trans, d.k->p);
2290 		ret = 0;
2291 		goto out;
2292 	}
2293 
2294 	if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol,
2295 			trans, subvol_fs_path_parent_wrong,
2296 			"subvol with wrong fs_path_parent, should be be %u\n%s",
2297 			parent_subvol,
2298 			(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
2299 		struct bkey_i_subvolume *n =
2300 			bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
2301 		ret = PTR_ERR_OR_ZERO(n);
2302 		if (ret)
2303 			goto err;
2304 
2305 		n->v.fs_path_parent = cpu_to_le32(parent_subvol);
2306 	}
2307 
2308 	u64 target_inum = le64_to_cpu(s.v->inode);
2309 	u32 target_snapshot = le32_to_cpu(s.v->snapshot);
2310 
2311 	ret = lookup_inode(trans, target_inum, target_snapshot, &subvol_root);
2312 	if (ret && !bch2_err_matches(ret, ENOENT))
2313 		goto err;
2314 
2315 	if (ret) {
2316 		bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
2317 		ret = -BCH_ERR_fsck_repair_unimplemented;
2318 		goto err;
2319 	}
2320 
2321 	if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol,
2322 			trans, inode_bi_parent_wrong,
2323 			"subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
2324 			target_inum,
2325 			subvol_root.bi_parent_subvol, parent_subvol)) {
2326 		subvol_root.bi_parent_subvol = parent_subvol;
2327 		subvol_root.bi_snapshot = le32_to_cpu(s.v->snapshot);
2328 		ret = __bch2_fsck_write_inode(trans, &subvol_root);
2329 		if (ret)
2330 			goto err;
2331 	}
2332 
2333 	ret = check_dirent_target(trans, iter, d, &subvol_root);
2334 	if (ret)
2335 		goto err;
2336 out:
2337 err:
2338 fsck_err:
2339 	bch2_trans_iter_exit(trans, &subvol_iter);
2340 	printbuf_exit(&buf);
2341 	return ret;
2342 }
2343 
/*
 * Check a single key in the dirents btree: snapshot validity, hash, that the
 * containing directory inode exists, and that the target (inode or subvol)
 * exists and points back correctly.  Commits its own transaction.
 */
static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
			struct bkey_s_c k,
			struct bch_hash_info *hash_info,
			struct inode_walker *dir,
			struct inode_walker *target,
			struct snapshots_seen *s)
{
	struct bch_fs *c = trans->c;
	struct inode_walker_entry *i;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret) {
		/* positive return means the key was deleted; not an error */
		ret = ret < 0 ? ret : 0;
		goto out;
	}

	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
	if (ret)
		goto err;

	if (k.k->type == KEY_TYPE_whiteout)
		goto out;

	/* Moving to a new directory: verify the previous one's subdir count */
	if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) {
		ret = check_subdir_count(trans, dir);
		if (ret)
			goto err;
	}

	i = walk_inode(trans, dir, k);
	ret = PTR_ERR_OR_ZERO(i);
	if (ret < 0)
		goto err;

	ret = check_key_has_inode(trans, iter, dir, i, k);
	if (ret)
		goto err;

	if (!i)
		goto out;

	/* Hash info comes from the directory inode, refresh per directory: */
	if (dir->first_this_inode)
		*hash_info = bch2_hash_info_init(c, &i->inode);
	dir->first_this_inode = false;

	ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, iter, k);
	if (ret < 0)
		goto err;
	if (ret) {
		/* dirent has been deleted */
		ret = 0;
		goto out;
	}

	if (k.k->type != KEY_TYPE_dirent)
		goto out;

	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);

	if (d.v->d_type == DT_SUBVOL) {
		ret = check_dirent_to_subvol(trans, iter, d);
		if (ret)
			goto err;
	} else {
		/* Collect all snapshot versions of the target inode: */
		ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
		if (ret)
			goto err;

		if (fsck_err_on(!target->inodes.nr,
				trans, dirent_to_missing_inode,
				"dirent points to missing inode:\n%s",
				(printbuf_reset(&buf),
				 bch2_bkey_val_to_text(&buf, c, k),
				 buf.buf))) {
			ret = __remove_dirent(trans, d.k->p);
			if (ret)
				goto err;
		}

		/* Check the dirent against every visible snapshot version: */
		darray_for_each(target->inodes, i) {
			ret = check_dirent_target(trans, iter, d, &i->inode);
			if (ret)
				goto err;
		}

		/*
		 * Snapshots where the target inode was overwritten: delete the
		 * corresponding dirent unless a newer dirent exists there:
		 */
		darray_for_each(target->deletes, i)
			if (fsck_err_on(!snapshot_list_has_id(&s->ids, *i),
					trans, dirent_to_overwritten_inode,
					"dirent points to inode overwritten in snapshot %u:\n%s",
					*i,
					(printbuf_reset(&buf),
					 bch2_bkey_val_to_text(&buf, c, k),
					 buf.buf))) {
				struct btree_iter delete_iter;
				bch2_trans_iter_init(trans, &delete_iter,
						     BTREE_ID_dirents,
						     SPOS(k.k->p.inode, k.k->p.offset, *i),
						     BTREE_ITER_intent);
				ret =   bch2_btree_iter_traverse(&delete_iter) ?:
					bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
							  hash_info,
							  &delete_iter,
							  BTREE_UPDATE_internal_snapshot_node);
				bch2_trans_iter_exit(trans, &delete_iter);
				if (ret)
					goto err;

			}
	}

	ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
	if (ret)
		goto err;

	/* Count subdirectories for the later subdir-count check: */
	if (d.v->d_type == DT_DIR)
		for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
			i->count++;
out:
err:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}
2470 
2471 /*
2472  * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
2473  * validate d_type
2474  */
int bch2_check_dirents(struct bch_fs *c)
{
	/* Walker state is carried across keys by check_dirent(): */
	struct inode_walker dir = inode_walker_init();
	struct inode_walker target = inode_walker_init();
	struct snapshots_seen s;
	struct bch_hash_info hash_info;

	snapshots_seen_init(&s);

	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_dirents,
				POS(BCACHEFS_ROOT_INO, 0),
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
			check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
		/* flush the subdir count check for the final directory: */
		check_subdir_count_notnested(trans, &dir));

	snapshots_seen_exit(&s);
	inode_walker_exit(&dir);
	inode_walker_exit(&target);
	bch_err_fn(c, ret);
	return ret;
}
2497 
/*
 * Check a single key in the xattrs btree: snapshot validity, that the owning
 * inode exists, and that the key is correctly hashed.
 */
static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
		       struct bkey_s_c k,
		       struct bch_hash_info *hash_info,
		       struct inode_walker *inode)
{
	struct bch_fs *c = trans->c;
	struct inode_walker_entry *i;
	int ret;

	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret < 0)
		return ret;
	if (ret)
		/* positive return: key was deleted, nothing more to check */
		return 0;

	i = walk_inode(trans, inode, k);
	ret = PTR_ERR_OR_ZERO(i);
	if (ret)
		return ret;

	ret = check_key_has_inode(trans, iter, inode, i, k);
	if (ret)
		return ret;

	if (!i)
		return 0;

	/* Hash seed comes from the owning inode, refresh per inode: */
	if (inode->first_this_inode)
		*hash_info = bch2_hash_info_init(c, &i->inode);
	inode->first_this_inode = false;

	ret = bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, iter, k);
	bch_err_fn(c, ret);
	return ret;
}
2533 
2534 /*
2535  * Walk xattrs: verify that they all have a corresponding inode
2536  */
2537 int bch2_check_xattrs(struct bch_fs *c)
2538 {
2539 	struct inode_walker inode = inode_walker_init();
2540 	struct bch_hash_info hash_info;
2541 	int ret = 0;
2542 
2543 	ret = bch2_trans_run(c,
2544 		for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
2545 			POS(BCACHEFS_ROOT_INO, 0),
2546 			BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
2547 			k,
2548 			NULL, NULL,
2549 			BCH_TRANS_COMMIT_no_enospc,
2550 		check_xattr(trans, &iter, k, &hash_info, &inode)));
2551 
2552 	inode_walker_exit(&inode);
2553 	bch_err_fn(c, ret);
2554 	return ret;
2555 }
2556 
/*
 * Verify that the root subvolume and root directory inode exist, recreating
 * either from scratch if missing.
 */
static int check_root_trans(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct bch_inode_unpacked root_inode;
	u32 snapshot;
	u64 inum;
	int ret;

	ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (mustfix_fsck_err_on(ret, trans, root_subvol_missing,
				"root subvol missing")) {
		struct bkey_i_subvolume *root_subvol =
			bch2_trans_kmalloc(trans, sizeof(*root_subvol));
		ret = PTR_ERR_OR_ZERO(root_subvol);
		if (ret)
			goto err;

		/* Recreate the root subvol with its well-known values: */
		snapshot	= U32_MAX;
		inum		= BCACHEFS_ROOT_INO;

		bkey_subvolume_init(&root_subvol->k_i);
		root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL;
		root_subvol->v.flags	= 0;
		root_subvol->v.snapshot	= cpu_to_le32(snapshot);
		root_subvol->v.inode	= cpu_to_le64(inum);
		ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0);
		bch_err_msg(c, ret, "writing root subvol");
		if (ret)
			goto err;
	}

	ret = lookup_inode(trans, BCACHEFS_ROOT_INO, snapshot, &root_inode);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (mustfix_fsck_err_on(ret,
				trans, root_dir_missing,
				"root directory missing") ||
	    mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
				trans, root_inode_not_dir,
				"root inode not a directory")) {
		/* Recreate the root directory inode with default perms: */
		bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
				0, NULL);
		root_inode.bi_inum = inum;
		root_inode.bi_snapshot = snapshot;

		ret = __bch2_fsck_write_inode(trans, &root_inode);
		bch_err_msg(c, ret, "writing root inode");
	}
err:
fsck_err:
	return ret;
}
2613 
2614 /* Get root directory, create if it doesn't exist: */
2615 int bch2_check_root(struct bch_fs *c)
2616 {
2617 	int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2618 		check_root_trans(trans));
2619 	bch_err_fn(c, ret);
2620 	return ret;
2621 }
2622 
2623 typedef DARRAY(u32) darray_u32;
2624 
2625 static bool darray_u32_has(darray_u32 *d, u32 v)
2626 {
2627 	darray_for_each(*d, i)
2628 		if (*i == v)
2629 			return true;
2630 	return false;
2631 }
2632 
/*
 * Walk a subvolume's fs_path_parent chain up towards the root subvolume,
 * reattaching the subvolume if the chain contains a loop or hits a missing
 * parent.
 */
static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct btree_iter parent_iter = {};
	darray_u32 subvol_path = {};
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (k.k->type != KEY_TYPE_subvolume)
		return 0;

	while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) {
		/* Record visited subvol ids so we can detect loops: */
		ret = darray_push(&subvol_path, k.k->p.offset);
		if (ret)
			goto err;

		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);

		struct bch_inode_unpacked subvol_root;
		ret = bch2_inode_find_by_inum_trans(trans,
					(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
					&subvol_root);
		if (ret)
			break;

		u32 parent = le32_to_cpu(s.v->fs_path_parent);

		/* Parent already on the path: we have a loop: */
		if (darray_u32_has(&subvol_path, parent)) {
			if (fsck_err(c, subvol_loop, "subvolume loop"))
				ret = reattach_subvol(trans, s);
			break;
		}

		/* Step up to the parent subvolume: */
		bch2_trans_iter_exit(trans, &parent_iter);
		bch2_trans_iter_init(trans, &parent_iter,
				     BTREE_ID_subvolumes, POS(0, parent), 0);
		k = bch2_btree_iter_peek_slot(&parent_iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
				trans, subvol_unreachable,
				"unreachable subvolume %s",
				(bch2_bkey_val_to_text(&buf, c, s.s_c),
				 buf.buf))) {
			ret = reattach_subvol(trans, s);
			break;
		}
	}
fsck_err:
err:
	printbuf_exit(&buf);
	darray_exit(&subvol_path);
	bch2_trans_iter_exit(trans, &parent_iter);
	return ret;
}
2690 
2691 int bch2_check_subvolume_structure(struct bch_fs *c)
2692 {
2693 	int ret = bch2_trans_run(c,
2694 		for_each_btree_key_commit(trans, iter,
2695 				BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
2696 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2697 			check_subvol_path(trans, &iter, k)));
2698 	bch_err_fn(c, ret);
2699 	return ret;
2700 }
2701 
/* One step on a path from an inode up towards the root: */
struct pathbuf_entry {
	u64	inum;
	u32	snapshot;
};

typedef DARRAY(struct pathbuf_entry) pathbuf;
2708 
/*
 * Set bi_depth on the inode identified by @p to @new_depth, committing the
 * update if the value actually changed.
 */
static int bch2_bi_depth_renumber_one(struct btree_trans *trans, struct pathbuf_entry *p,
				      u32 new_depth)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
					       SPOS(0, p->inum, p->snapshot), 0);

	struct bch_inode_unpacked inode;
	int ret = bkey_err(k) ?:
		!bkey_is_inode(k.k) ? -BCH_ERR_ENOENT_inode
		: bch2_inode_unpack(k, &inode);
	if (ret)
		goto err;

	/* Only write if the depth actually changed: */
	if (inode.bi_depth != new_depth) {
		inode.bi_depth = new_depth;
		ret = __bch2_fsck_write_inode(trans, &inode) ?:
			bch2_trans_commit(trans, NULL, NULL, 0);
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
2732 
/*
 * Renumber bi_depth along @path: walking from the topmost entry (end of the
 * array) downwards, each inode gets a depth one greater than its parent,
 * starting from @new_bi_depth.
 */
static int bch2_bi_depth_renumber(struct btree_trans *trans, pathbuf *path, u32 new_bi_depth)
{
	u32 restart_count = trans->restart_count;
	int ret = 0;

	darray_for_each_reverse(*path, i) {
		ret = nested_lockrestart_do(trans,
				bch2_bi_depth_renumber_one(trans, i, new_bi_depth));
		bch_err_fn(trans->c, ret);
		if (ret)
			break;

		new_bi_depth++;
	}

	/* signal a transaction restart to the caller if one occurred: */
	return ret ?: trans_was_restarted(trans, restart_count);
}
2750 
2751 static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
2752 {
2753 	darray_for_each(*p, i)
2754 		if (i->inum	== inum &&
2755 		    i->snapshot	== snapshot)
2756 			return true;
2757 	return false;
2758 }
2759 
2760 static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k)
2761 {
2762 	struct bch_fs *c = trans->c;
2763 	struct btree_iter inode_iter = {};
2764 	pathbuf path = {};
2765 	struct printbuf buf = PRINTBUF;
2766 	u32 snapshot = inode_k.k->p.snapshot;
2767 	bool redo_bi_depth = false;
2768 	u32 min_bi_depth = U32_MAX;
2769 	int ret = 0;
2770 
2771 	struct bch_inode_unpacked inode;
2772 	ret = bch2_inode_unpack(inode_k, &inode);
2773 	if (ret)
2774 		return ret;
2775 
2776 	while (!inode.bi_subvol) {
2777 		struct btree_iter dirent_iter;
2778 		struct bkey_s_c_dirent d;
2779 		u32 parent_snapshot = snapshot;
2780 
2781 		d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot);
2782 		ret = bkey_err(d.s_c);
2783 		if (ret && !bch2_err_matches(ret, ENOENT))
2784 			goto out;
2785 
2786 		if (!ret && (ret = dirent_points_to_inode(c, d, &inode)))
2787 			bch2_trans_iter_exit(trans, &dirent_iter);
2788 
2789 		if (bch2_err_matches(ret, ENOENT)) {
2790 			printbuf_reset(&buf);
2791 			bch2_bkey_val_to_text(&buf, c, inode_k);
2792 			bch_err(c, "unreachable inode in check_directory_structure: %s\n%s",
2793 				bch2_err_str(ret), buf.buf);
2794 			goto out;
2795 		}
2796 
2797 		bch2_trans_iter_exit(trans, &dirent_iter);
2798 
2799 		ret = darray_push(&path, ((struct pathbuf_entry) {
2800 			.inum		= inode.bi_inum,
2801 			.snapshot	= snapshot,
2802 		}));
2803 		if (ret)
2804 			return ret;
2805 
2806 		snapshot = parent_snapshot;
2807 
2808 		bch2_trans_iter_exit(trans, &inode_iter);
2809 		inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
2810 					     SPOS(0, inode.bi_dir, snapshot), 0);
2811 
2812 		struct bch_inode_unpacked parent_inode;
2813 		ret = bkey_err(inode_k) ?:
2814 			!bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
2815 			: bch2_inode_unpack(inode_k, &parent_inode);
2816 		if (ret) {
2817 			/* Should have been caught in dirents pass */
2818 			bch_err_msg(c, ret, "error looking up parent directory");
2819 			goto out;
2820 		}
2821 
2822 		min_bi_depth = parent_inode.bi_depth;
2823 
2824 		if (parent_inode.bi_depth < inode.bi_depth &&
2825 		    min_bi_depth < U16_MAX)
2826 			break;
2827 
2828 		inode = parent_inode;
2829 		snapshot = inode_k.k->p.snapshot;
2830 		redo_bi_depth = true;
2831 
2832 		if (path_is_dup(&path, inode.bi_inum, snapshot)) {
2833 			/* XXX print path */
2834 			bch_err(c, "directory structure loop");
2835 
2836 			darray_for_each(path, i)
2837 				pr_err("%llu:%u", i->inum, i->snapshot);
2838 			pr_err("%llu:%u", inode.bi_inum, snapshot);
2839 
2840 			if (fsck_err(trans, dir_loop, "directory structure loop")) {
2841 				ret = remove_backpointer(trans, &inode);
2842 				bch_err_msg(c, ret, "removing dirent");
2843 				if (ret)
2844 					break;
2845 
2846 				ret = reattach_inode(trans, &inode);
2847 				bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
2848 			}
2849 
2850 			goto out;
2851 		}
2852 	}
2853 
2854 	if (inode.bi_subvol)
2855 		min_bi_depth = 0;
2856 
2857 	if (redo_bi_depth)
2858 		ret = bch2_bi_depth_renumber(trans, &path, min_bi_depth);
2859 out:
2860 fsck_err:
2861 	bch2_trans_iter_exit(trans, &inode_iter);
2862 	darray_exit(&path);
2863 	printbuf_exit(&buf);
2864 	bch_err_fn(c, ret);
2865 	return ret;
2866 }
2867 
2868 /*
2869  * Check for loops in the directory structure: all other connectivity issues
2870  * have been fixed by prior passes
2871  */
int bch2_check_directory_structure(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
					  BTREE_ITER_intent|
					  BTREE_ITER_prefetch|
					  BTREE_ITER_all_snapshots, k,
					  NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
			/* only directories can participate in loops: */
			if (!S_ISDIR(bkey_inode_mode(k)))
				continue;

			/* unlinked inodes are not part of the tree: */
			if (bch2_inode_flags(k) & BCH_INODE_unlinked)
				continue;

			check_path_loop(trans, k);
		})));

	bch_err_fn(c, ret);
	return ret;
}
2892 
/*
 * Table of (inum, snapshot) -> observed link count, sorted by inode number,
 * used by the nlinks pass; grows dynamically via add_nlink().
 */
struct nlink_table {
	size_t		nr;
	size_t		size;

	struct nlink {
		u64	inum;
		u32	snapshot;
		u32	count;	/* dirents seen pointing at this inode */
	}		*d;
};
2903 
2904 static int add_nlink(struct bch_fs *c, struct nlink_table *t,
2905 		     u64 inum, u32 snapshot)
2906 {
2907 	if (t->nr == t->size) {
2908 		size_t new_size = max_t(size_t, 128UL, t->size * 2);
2909 		void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);
2910 
2911 		if (!d) {
2912 			bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
2913 				new_size);
2914 			return -BCH_ERR_ENOMEM_fsck_add_nlink;
2915 		}
2916 
2917 		if (t->d)
2918 			memcpy(d, t->d, t->size * sizeof(t->d[0]));
2919 		kvfree(t->d);
2920 
2921 		t->d = d;
2922 		t->size = new_size;
2923 	}
2924 
2925 
2926 	t->d[t->nr++] = (struct nlink) {
2927 		.inum		= inum,
2928 		.snapshot	= snapshot,
2929 	};
2930 
2931 	return 0;
2932 }
2933 
2934 static int nlink_cmp(const void *_l, const void *_r)
2935 {
2936 	const struct nlink *l = _l;
2937 	const struct nlink *r = _r;
2938 
2939 	return cmp_int(l->inum, r->inum);
2940 }
2941 
/*
 * Bump the link count for @inum as seen from dirent snapshot @snapshot, if
 * @inum falls within the current table range [range_start, range_end).
 */
static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
		     struct nlink_table *links,
		     u64 range_start, u64 range_end, u64 inum, u32 snapshot)
{
	struct nlink *link, key = {
		.inum = inum, .snapshot = U32_MAX,
	};

	if (inum < range_start || inum >= range_end)
		return;

	/* bsearch may land on any entry with a matching inum... */
	link = __inline_bsearch(&key, links->d, links->nr,
				sizeof(links->d[0]), nlink_cmp);
	if (!link)
		return;

	/* ...so rewind to the first entry for this inum: */
	while (link > links->d && link[0].inum == link[-1].inum)
		--link;

	/* Credit the link to every snapshot version it's visible in: */
	for (; link < links->d + links->nr && link->inum == inum; link++)
		if (ref_visible(c, s, snapshot, link->snapshot)) {
			link->count++;
			if (link->snapshot >= snapshot)
				break;
		}
}
2968 
/*
 * First nlinks pass: collect (inum, snapshot) for every non-directory inode
 * with nonzero nlink, starting at inode @start.  If the table can't be grown
 * further, stop early and store the first uncovered inode in *end so the
 * caller can run another iteration.
 */
noinline_for_stack
static int check_nlinks_find_hardlinks(struct bch_fs *c,
				       struct nlink_table *t,
				       u64 start, u64 *end)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_inodes,
				   POS(0, start),
				   BTREE_ITER_intent|
				   BTREE_ITER_prefetch|
				   BTREE_ITER_all_snapshots, k, ({
			if (!bkey_is_inode(k.k))
				continue;

			/* Should never fail, checked by bch2_inode_invalid: */
			struct bch_inode_unpacked u;
			_ret3 = bch2_inode_unpack(k, &u);
			if (_ret3)
				break;

			/*
			 * Backpointer and directory structure checks are sufficient for
			 * directories, since they can't have hardlinks:
			 */
			if (S_ISDIR(u.bi_mode))
				continue;

			/*
			 * Previous passes ensured that bi_nlink is nonzero if
			 * it had multiple hardlinks:
			 */
			if (!u.bi_nlink)
				continue;

			ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
			if (ret) {
				/* table full: remember where to resume */
				*end = k.k->p.offset;
				ret = 0;
				break;
			}
			0;
		})));

	bch_err_fn(c, ret);
	return ret;
}
3015 
/*
 * Second nlinks pass: walk all dirents, counting links to each inode in the
 * table's [range_start, range_end) window.  Directory and subvolume dirents
 * are skipped - they were handled by earlier passes.
 */
noinline_for_stack
static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
				     u64 range_start, u64 range_end)
{
	struct snapshots_seen s;

	snapshots_seen_init(&s);

	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
				   BTREE_ITER_intent|
				   BTREE_ITER_prefetch|
				   BTREE_ITER_all_snapshots, k, ({
			ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
			if (ret)
				break;

			if (k.k->type == KEY_TYPE_dirent) {
				struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);

				if (d.v->d_type != DT_DIR &&
				    d.v->d_type != DT_SUBVOL)
					inc_link(c, &s, links, range_start, range_end,
						 le64_to_cpu(d.v->d_inum), d.k->p.snapshot);
			}
			0;
		})));

	snapshots_seen_exit(&s);

	bch_err_fn(c, ret);
	return ret;
}
3049 
/*
 * Third nlinks pass, per inode: compare the inode's i_nlink against the
 * count observed in @links and repair if they differ.  Returns 1 to signal
 * the caller that we've walked past @range_end.
 */
static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
				     struct bkey_s_c k,
				     struct nlink_table *links,
				     size_t *idx, u64 range_end)
{
	struct bch_inode_unpacked u;
	struct nlink *link = &links->d[*idx];
	int ret = 0;

	if (k.k->p.offset >= range_end)
		return 1;

	if (!bkey_is_inode(k.k))
		return 0;

	ret = bch2_inode_unpack(k, &u);
	if (ret)
		return ret;

	/* directories and nlink-0 inodes weren't added to the table: */
	if (S_ISDIR(u.bi_mode))
		return 0;

	if (!u.bi_nlink)
		return 0;

	/* advance to this inode's entry (table and btree are both sorted): */
	while ((cmp_int(link->inum, k.k->p.offset) ?:
		cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
		BUG_ON(*idx == links->nr);
		link = &links->d[++*idx];
	}

	if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
			trans, inode_wrong_nlink,
			"inode %llu type %s has wrong i_nlink (%u, should be %u)",
			u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
			bch2_inode_nlink_get(&u), link->count)) {
		bch2_inode_nlink_set(&u, link->count);
		ret = __bch2_fsck_write_inode(trans, &u);
	}
fsck_err:
	return ret;
}
3092 
/*
 * Third nlinks pass: walk inodes in [range_start, range_end) and fix up any
 * whose i_nlink disagrees with the counts collected in @links.
 */
noinline_for_stack
static int check_nlinks_update_hardlinks(struct bch_fs *c,
			       struct nlink_table *links,
			       u64 range_start, u64 range_end)
{
	size_t idx = 0;

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
				POS(0, range_start),
				BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
	/* ret == 1 just means we hit range_end; not an error */
	if (ret < 0) {
		bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
		return ret;
	}

	return 0;
}
3113 
3114 int bch2_check_nlinks(struct bch_fs *c)
3115 {
3116 	struct nlink_table links = { 0 };
3117 	u64 this_iter_range_start, next_iter_range_start = 0;
3118 	int ret = 0;
3119 
3120 	do {
3121 		this_iter_range_start = next_iter_range_start;
3122 		next_iter_range_start = U64_MAX;
3123 
3124 		ret = check_nlinks_find_hardlinks(c, &links,
3125 						  this_iter_range_start,
3126 						  &next_iter_range_start);
3127 
3128 		ret = check_nlinks_walk_dirents(c, &links,
3129 					  this_iter_range_start,
3130 					  next_iter_range_start);
3131 		if (ret)
3132 			break;
3133 
3134 		ret = check_nlinks_update_hardlinks(c, &links,
3135 					 this_iter_range_start,
3136 					 next_iter_range_start);
3137 		if (ret)
3138 			break;
3139 
3140 		links.nr = 0;
3141 	} while (next_iter_range_start != U64_MAX);
3142 
3143 	kvfree(links.d);
3144 	bch_err_fn(c, ret);
3145 	return ret;
3146 }
3147 
3148 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
3149 			     struct bkey_s_c k)
3150 {
3151 	struct bkey_s_c_reflink_p p;
3152 	struct bkey_i_reflink_p *u;
3153 
3154 	if (k.k->type != KEY_TYPE_reflink_p)
3155 		return 0;
3156 
3157 	p = bkey_s_c_to_reflink_p(k);
3158 
3159 	if (!p.v->front_pad && !p.v->back_pad)
3160 		return 0;
3161 
3162 	u = bch2_trans_kmalloc(trans, sizeof(*u));
3163 	int ret = PTR_ERR_OR_ZERO(u);
3164 	if (ret)
3165 		return ret;
3166 
3167 	bkey_reassemble(&u->k_i, k);
3168 	u->v.front_pad	= 0;
3169 	u->v.back_pad	= 0;
3170 
3171 	return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun);
3172 }
3173 
3174 int bch2_fix_reflink_p(struct bch_fs *c)
3175 {
3176 	if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
3177 		return 0;
3178 
3179 	int ret = bch2_trans_run(c,
3180 		for_each_btree_key_commit(trans, iter,
3181 				BTREE_ID_extents, POS_MIN,
3182 				BTREE_ITER_intent|BTREE_ITER_prefetch|
3183 				BTREE_ITER_all_snapshots, k,
3184 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
3185 			fix_reflink_p_key(trans, &iter, k)));
3186 	bch_err_fn(c, ret);
3187 	return ret;
3188 }
3189 
3190 #ifndef NO_BCACHEFS_CHARDEV
3191 
/* Per-invocation state for an offline or online fsck worker thread. */
struct fsck_thread {
	struct thread_with_stdio thr;	/* kthread + stdio redirection; embedded, container_of'd back */
	struct bch_fs		*c;	/* fs under fsck; offline path may store an ERR_PTR from bch2_fs_open() */
	struct bch_opts		opts;	/* mount options parsed from the ioctl's opts string */
};
3197 
3198 static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
3199 {
3200 	struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr);
3201 	kfree(thr);
3202 }
3203 
3204 static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio)
3205 {
3206 	struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
3207 	struct bch_fs *c = thr->c;
3208 
3209 	int ret = PTR_ERR_OR_ZERO(c);
3210 	if (ret)
3211 		return ret;
3212 
3213 	ret = bch2_fs_start(thr->c);
3214 	if (ret)
3215 		goto err;
3216 
3217 	if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
3218 		bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name);
3219 		ret |= 1;
3220 	}
3221 	if (test_bit(BCH_FS_error, &c->flags)) {
3222 		bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name);
3223 		ret |= 4;
3224 	}
3225 err:
3226 	bch2_fs_stop(c);
3227 	return ret;
3228 }
3229 
/* Thread ops for the offline-fsck worker spawned by bch2_ioctl_fsck_offline(). */
static const struct thread_with_stdio_ops bch2_offline_fsck_ops = {
	.exit		= bch2_fsck_thread_exit,
	.fn		= bch2_fsck_offline_thread_fn,
};
3234 
/*
 * Ioctl: run fsck on a set of not-yet-mounted devices named by userspace.
 *
 * Copies the device path strings and option string in from userspace, opens
 * the filesystem (without starting it), and hands it to a worker thread that
 * performs the actual fsck with stdio redirected back to the caller.
 *
 * On success, ownership of @thr passes to the thread machinery, which frees
 * it via bch2_fsck_thread_exit().  Returns a fd/thread handle from
 * __bch2_run_thread_with_stdio(), or a negative error code.
 */
long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
{
	struct bch_ioctl_fsck_offline arg;
	struct fsck_thread *thr = NULL;
	darray_str(devs) = {};
	long ret = 0;

	if (copy_from_user(&arg, user_arg, sizeof(arg)))
		return -EFAULT;

	if (arg.flags)
		return -EINVAL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* Each devs[] entry is a userspace pointer (as u64) to a path string: */
	for (size_t i = 0; i < arg.nr_devs; i++) {
		u64 dev_u64;
		ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64));
		if (ret)
			goto err;

		char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX);
		ret = PTR_ERR_OR_ZERO(dev_str);
		if (ret)
			goto err;

		ret = darray_push(&devs, dev_str);
		if (ret) {
			kfree(dev_str);
			goto err;
		}
	}

	thr = kzalloc(sizeof(*thr), GFP_KERNEL);
	if (!thr) {
		ret = -ENOMEM;
		goto err;
	}

	thr->opts = bch2_opts_empty();

	if (arg.opts) {
		char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
		ret =   PTR_ERR_OR_ZERO(optstr) ?:
			bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr);
		if (!IS_ERR(optstr))
			kfree(optstr);

		if (ret)
			goto err;
	}

	opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
	opt_set(thr->opts, read_only, 1);
	opt_set(thr->opts, ratelimit_errors, 0);

	/* We need request_key() to be called before we punt to kthread: */
	opt_set(thr->opts, nostart, true);

	bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops);

	/* May store an ERR_PTR; the thread fn handles that case itself: */
	thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts);

	/* Don't let fsck panic the machine on errors it finds: */
	if (!IS_ERR(thr->c) &&
	    thr->c->opts.errors == BCH_ON_ERROR_panic)
		thr->c->opts.errors = BCH_ON_ERROR_ro;

	ret = __bch2_run_thread_with_stdio(&thr->thr);
out:
	darray_for_each(devs, i)
		kfree(*i);
	darray_exit(&devs);
	return ret;
err:
	if (thr)
		bch2_fsck_thread_exit(&thr->thr);
	pr_err("ret %s", bch2_err_str(ret));
	goto out;
}
3315 
3316 static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio)
3317 {
3318 	struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
3319 	struct bch_fs *c = thr->c;
3320 
3321 	c->stdio_filter = current;
3322 	c->stdio = &thr->thr.stdio;
3323 
3324 	/*
3325 	 * XXX: can we figure out a way to do this without mucking with c->opts?
3326 	 */
3327 	unsigned old_fix_errors = c->opts.fix_errors;
3328 	if (opt_defined(thr->opts, fix_errors))
3329 		c->opts.fix_errors = thr->opts.fix_errors;
3330 	else
3331 		c->opts.fix_errors = FSCK_FIX_ask;
3332 
3333 	c->opts.fsck = true;
3334 	set_bit(BCH_FS_fsck_running, &c->flags);
3335 
3336 	c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
3337 	int ret = bch2_run_online_recovery_passes(c);
3338 
3339 	clear_bit(BCH_FS_fsck_running, &c->flags);
3340 	bch_err_fn(c, ret);
3341 
3342 	c->stdio = NULL;
3343 	c->stdio_filter = NULL;
3344 	c->opts.fix_errors = old_fix_errors;
3345 
3346 	up(&c->online_fsck_mutex);
3347 	bch2_ro_ref_put(c);
3348 	return ret;
3349 }
3350 
/* Thread ops for the online-fsck worker spawned by bch2_ioctl_fsck_online(). */
static const struct thread_with_stdio_ops bch2_online_fsck_ops = {
	.exit		= bch2_fsck_thread_exit,
	.fn		= bch2_fsck_online_thread_fn,
};
3355 
/*
 * Ioctl: run fsck on a mounted filesystem.
 *
 * Takes a read-only ref and the online_fsck_mutex (only one online fsck at a
 * time), then spawns a worker thread; on success the thread owns @thr and is
 * responsible for releasing the mutex and the ro ref.  On failure all of
 * that is unwound here.
 */
long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg)
{
	struct fsck_thread *thr = NULL;
	long ret = 0;

	if (arg.flags)
		return -EINVAL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (!bch2_ro_ref_tryget(c))
		return -EROFS;

	/* Serialize: only one online fsck per fs at a time: */
	if (down_trylock(&c->online_fsck_mutex)) {
		bch2_ro_ref_put(c);
		return -EAGAIN;
	}

	thr = kzalloc(sizeof(*thr), GFP_KERNEL);
	if (!thr) {
		ret = -ENOMEM;
		goto err;
	}

	thr->c = c;
	thr->opts = bch2_opts_empty();

	if (arg.opts) {
		char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);

		ret =   PTR_ERR_OR_ZERO(optstr) ?:
			bch2_parse_mount_opts(c, &thr->opts, NULL, optstr);
		if (!IS_ERR(optstr))
			kfree(optstr);

		if (ret)
			goto err;
	}

	ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops);
err:
	if (ret < 0) {
		bch_err_fn(c, ret);
		if (thr)
			bch2_fsck_thread_exit(&thr->thr);
		up(&c->online_fsck_mutex);
		bch2_ro_ref_put(c);
	}
	return ret;
}
3407 
3408 #endif /* NO_BCACHEFS_CHARDEV */
3409