xref: /linux/fs/bcachefs/fsck.c (revision fcab107abe1ab5be9dbe874baa722372da8f4f73)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "bcachefs.h"
4 #include "bcachefs_ioctl.h"
5 #include "bkey_buf.h"
6 #include "btree_cache.h"
7 #include "btree_update.h"
8 #include "buckets.h"
9 #include "darray.h"
10 #include "dirent.h"
11 #include "error.h"
12 #include "fs.h"
13 #include "fsck.h"
14 #include "inode.h"
15 #include "keylist.h"
16 #include "namei.h"
17 #include "recovery_passes.h"
18 #include "snapshot.h"
19 #include "super.h"
20 #include "thread_with_file.h"
21 #include "xattr.h"
22 
23 #include <linux/bsearch.h>
24 #include <linux/dcache.h> /* struct qstr */
25 
26 static int dirent_points_to_inode_nowarn(struct bch_fs *c,
27 					 struct bkey_s_c_dirent d,
28 					 struct bch_inode_unpacked *inode)
29 {
30 	if (d.v->d_type == DT_SUBVOL
31 	    ? le32_to_cpu(d.v->d_child_subvol)	== inode->bi_subvol
32 	    : le64_to_cpu(d.v->d_inum)		== inode->bi_inum)
33 		return 0;
34 	return bch_err_throw(c, ENOENT_dirent_doesnt_match_inode);
35 }
36 
/*
 * Format a "dirent doesn't point back to inode" message into @out: a header
 * line, the dirent key, then the unpacked inode, each on its own line.
 */
static void dirent_inode_mismatch_msg(struct printbuf *out,
				      struct bch_fs *c,
				      struct bkey_s_c_dirent dirent,
				      struct bch_inode_unpacked *inode)
{
	prt_str(out, "inode points to dirent that does not point back:");
	prt_newline(out);
	bch2_bkey_val_to_text(out, c, dirent.s_c);
	prt_newline(out);
	bch2_inode_unpacked_to_text(out, inode);
}
48 
49 static int dirent_points_to_inode(struct bch_fs *c,
50 				  struct bkey_s_c_dirent dirent,
51 				  struct bch_inode_unpacked *inode)
52 {
53 	int ret = dirent_points_to_inode_nowarn(c, dirent, inode);
54 	if (ret) {
55 		struct printbuf buf = PRINTBUF;
56 		dirent_inode_mismatch_msg(&buf, c, dirent, inode);
57 		bch_warn(c, "%s", buf.buf);
58 		printbuf_exit(&buf);
59 	}
60 	return ret;
61 }
62 
/*
 * XXX: this is handling transaction restarts without returning
 * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
 */
/*
 * Sum the sectors of all allocated extents belonging to @inum in @snapshot.
 *
 * Returns the sector count on success (as a non-negative s64), or a negative
 * error from the btree iteration.
 */
static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
				    u32 snapshot)
{
	u64 sectors = 0;

	int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
				SPOS(inum, 0, snapshot),
				POS(inum, U64_MAX),
				0, k, ({
		/* only extents that consume space count toward i_sectors */
		if (bkey_extent_is_allocation(k.k))
			sectors += k.k->size;
		0;
	}));

	return ret ?: sectors;
}
83 
/*
 * Count subdirectory dirents of directory @inum in @snapshot, for validating
 * a directory inode's link count.
 *
 * Returns the count on success (non-negative s64), or a negative error from
 * the btree iteration.
 */
static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
				    u32 snapshot)
{
	u64 subdirs = 0;

	int ret = for_each_btree_key_max(trans, iter, BTREE_ID_dirents,
				    SPOS(inum, 0, snapshot),
				    POS(inum, U64_MAX),
				    0, k, ({
		if (k.k->type == KEY_TYPE_dirent &&
		    bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
			subdirs++;
		0;
	}));

	return ret ?: subdirs;
}
101 
102 static int subvol_lookup(struct btree_trans *trans, u32 subvol,
103 			 u32 *snapshot, u64 *inum)
104 {
105 	struct bch_subvolume s;
106 	int ret = bch2_subvolume_get(trans, subvol, false, &s);
107 
108 	*snapshot = le32_to_cpu(s.snapshot);
109 	*inum = le64_to_cpu(s.inode);
110 	return ret;
111 }
112 
/*
 * Look up dirent @name in directory @dir in a specific @snapshot, returning
 * the target inode number and d_type.
 *
 * Note: @hash_info is passed by value. On success *target and *type are set;
 * on error they are unmodified.
 */
static int lookup_dirent_in_snapshot(struct btree_trans *trans,
			   struct bch_hash_info hash_info,
			   subvol_inum dir, struct qstr *name,
			   u64 *target, unsigned *type, u32 snapshot)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
							 &hash_info, dir, name, 0, snapshot);
	int ret = bkey_err(k);
	if (ret)
		return ret;

	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
	*target = le64_to_cpu(d.v->d_inum);
	*type = d.v->d_type;
	bch2_trans_iter_exit(trans, &iter);
	return 0;
}
131 
132 /*
133  * Find any subvolume associated with a tree of snapshots
134  * We can't rely on master_subvol - it might have been deleted.
135  */
136 static int find_snapshot_tree_subvol(struct btree_trans *trans,
137 				     u32 tree_id, u32 *subvol)
138 {
139 	struct btree_iter iter;
140 	struct bkey_s_c k;
141 	int ret;
142 
143 	for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) {
144 		if (k.k->type != KEY_TYPE_snapshot)
145 			continue;
146 
147 		struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
148 		if (le32_to_cpu(s.v->tree) != tree_id)
149 			continue;
150 
151 		if (s.v->subvol) {
152 			*subvol = le32_to_cpu(s.v->subvol);
153 			goto found;
154 		}
155 	}
156 	ret = bch_err_throw(trans->c, ENOENT_no_snapshot_tree_subvol);
157 found:
158 	bch2_trans_iter_exit(trans, &iter);
159 	return ret;
160 }
161 
/* Get lost+found, create if it doesn't exist: */
static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
			    struct bch_inode_unpacked *lostfound,
			    u64 reattaching_inum)
{
	struct bch_fs *c = trans->c;
	struct qstr lostfound_str = QSTR("lost+found");
	struct btree_iter lostfound_iter = {};
	u64 inum = 0;
	unsigned d_type = 0;
	int ret;

	/* Find the snapshot tree this snapshot belongs to: */
	struct bch_snapshot_tree st;
	ret = bch2_snapshot_tree_lookup(trans,
			bch2_snapshot_tree(c, snapshot), &st);
	if (ret)
		return ret;

	/* ... and any live subvolume in that tree: */
	u32 subvolid;
	ret = find_snapshot_tree_subvol(trans,
				bch2_snapshot_tree(c, snapshot), &subvolid);
	bch_err_msg(c, ret, "finding subvol associated with snapshot tree %u",
		    bch2_snapshot_tree(c, snapshot));
	if (ret)
		return ret;

	struct bch_subvolume subvol;
	ret = bch2_subvolume_get(trans, subvolid, false, &subvol);
	bch_err_msg(c, ret, "looking up subvol %u for snapshot %u", subvolid, snapshot);
	if (ret)
		return ret;

	/*
	 * Subvolume has no root inode recorded: point it at the inode we're
	 * reattaching (the btree copy is updated; the on-stack @subvol keeps
	 * its original zero inode field - NOTE(review): root_inum.inum below
	 * will then be 0; confirm this is the intended behavior here).
	 */
	if (!subvol.inode) {
		struct btree_iter iter;
		struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter,
				BTREE_ID_subvolumes, POS(0, subvolid),
				0, subvolume);
		ret = PTR_ERR_OR_ZERO(subvol);
		if (ret)
			return ret;

		subvol->v.inode = cpu_to_le64(reattaching_inum);
		bch2_trans_iter_exit(trans, &iter);
	}

	subvol_inum root_inum = {
		.subvol = subvolid,
		.inum = le64_to_cpu(subvol.inode)
	};

	struct bch_inode_unpacked root_inode;
	struct bch_hash_info root_hash_info;
	ret = bch2_inode_find_by_inum_snapshot(trans, root_inum.inum, snapshot, &root_inode, 0);
	bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
		    root_inum.inum, subvolid);
	if (ret)
		return ret;

	root_hash_info = bch2_hash_info_init(c, &root_inode);

	/* Does lost+found already exist in the root directory? */
	ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
			      &lostfound_str, &inum, &d_type, snapshot);
	if (bch2_err_matches(ret, ENOENT))
		goto create_lostfound;

	bch_err_fn(c, ret);
	if (ret)
		return ret;

	if (d_type != DT_DIR) {
		bch_err(c, "error looking up lost+found: not a directory");
		return bch_err_throw(c, ENOENT_not_directory);
	}

	/*
	 * The bch2_check_dirents pass has already run, dangling dirents
	 * shouldn't exist here:
	 */
	ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, lostfound, 0);
	bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
		    inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
	return ret;

create_lostfound:
	/*
	 * we always create lost+found in the root snapshot; we don't want
	 * different branches of the snapshot tree to have different lost+found
	 */
	snapshot = le32_to_cpu(st.root_snapshot);
	/*
	 * XXX: we could have a nicer log message here  if we had a nice way to
	 * walk backpointers to print a path
	 */
	struct printbuf path = PRINTBUF;
	ret = bch2_inum_to_path(trans, root_inum, &path);
	if (ret)
		goto err;

	bch_notice(c, "creating %s/lost+found in subvol %llu snapshot %u",
		   path.buf, root_inum.subvol, snapshot);
	printbuf_exit(&path);

	u64 now = bch2_current_time(c);
	u64 cpu = raw_smp_processor_id();

	/* New directory inode, mode 0700, parented at the subvolume root: */
	bch2_inode_init_early(c, lostfound);
	bch2_inode_init_late(c, lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
	lostfound->bi_dir = root_inode.bi_inum;
	lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot);

	root_inode.bi_nlink++;

	ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
	if (ret)
		goto err;

	bch2_btree_iter_set_snapshot(trans, &lostfound_iter, snapshot);
	ret = bch2_btree_iter_traverse(trans, &lostfound_iter);
	if (ret)
		goto err;

	/* Create the dirent, then write the new inode: */
	ret =   bch2_dirent_create_snapshot(trans,
				0, root_inode.bi_inum, snapshot, &root_hash_info,
				mode_to_type(lostfound->bi_mode),
				&lostfound_str,
				lostfound->bi_inum,
				&lostfound->bi_dir_offset,
				BTREE_UPDATE_internal_snapshot_node|
				STR_HASH_must_create) ?:
		bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
				       BTREE_UPDATE_internal_snapshot_node);
err:
	bch_err_msg(c, ret, "creating lost+found");
	bch2_trans_iter_exit(trans, &lostfound_iter);
	return ret;
}
298 
299 static inline bool inode_should_reattach(struct bch_inode_unpacked *inode)
300 {
301 	if (inode->bi_inum == BCACHEFS_ROOT_INO &&
302 	    inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)
303 		return false;
304 
305 	/*
306 	 * Subvolume roots are special: older versions of subvolume roots may be
307 	 * disconnected, it's only the newest version that matters.
308 	 *
309 	 * We only keep a single dirent pointing to a subvolume root, i.e.
310 	 * older versions of snapshots will not have a different dirent pointing
311 	 * to the same subvolume root.
312 	 *
313 	 * This is because dirents that point to subvolumes are only visible in
314 	 * the parent subvolume - versioning is not needed - and keeping them
315 	 * around would break fsck, because when we're crossing subvolumes we
316 	 * don't have a consistent snapshot ID to do check the inode <-> dirent
317 	 * relationships.
318 	 *
319 	 * Thus, a subvolume root that's been renamed after a snapshot will have
320 	 * a disconnected older version - that's expected.
321 	 *
322 	 * Note that taking a snapshot always updates the root inode (to update
323 	 * the dirent backpointer), so a subvolume root inode with
324 	 * BCH_INODE_has_child_snapshot is never visible.
325 	 */
326 	if (inode->bi_subvol &&
327 	    (inode->bi_flags & BCH_INODE_has_child_snapshot))
328 		return false;
329 
330 	return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked);
331 }
332 
/*
 * Whiteout the dirent at @d_pos in snapshot @snapshot, but only if a key
 * actually exists at exactly that position (ignoring snapshot field).
 */
static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents,
					SPOS(d_pos.inode, d_pos.offset, snapshot),
					BTREE_ITER_intent|
					BTREE_ITER_with_updates);
	int ret = bkey_err(k);
	if (ret)
		return ret;

	if (bpos_eq(k.k->p, d_pos)) {
		/*
		 * bch2_btree_delete_at() doesn't work because the update path
		 * doesn't internally use BTREE_ITER_with_updates yet; emit an
		 * explicit whiteout instead:
		 */
		struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
		ret = PTR_ERR_OR_ZERO(k);
		if (ret)
			goto err;

		bkey_init(&k->k);
		k->k.type = KEY_TYPE_whiteout;
		k->k.p = iter.pos;
		ret = bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node);
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
363 
/*
 * Reattach a disconnected inode to lost+found: create a dirent for it
 * ("subvol-%u" for subvolume roots, otherwise the inode number), update the
 * inode's backpointer fields, and fix up versions of the inode in child
 * snapshots.
 */
static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
{
	struct bch_fs *c = trans->c;
	struct bch_inode_unpacked lostfound;
	char name_buf[20];
	int ret;

	/*
	 * For subvolume roots the new dirent lives in the parent subvolume
	 * (forced to the root subvolume here), so use its snapshot:
	 */
	u32 dirent_snapshot = inode->bi_snapshot;
	if (inode->bi_subvol) {
		inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;

		u64 root_inum;
		ret = subvol_lookup(trans, inode->bi_parent_subvol,
				    &dirent_snapshot, &root_inum);
		if (ret)
			return ret;

		snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
	} else {
		snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
	}

	ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
	if (ret)
		return ret;

	/* Directories add a ".." link to lost+found: */
	lostfound.bi_nlink += S_ISDIR(inode->bi_mode);

	/* ensure lost+found inode is also present in inode snapshot */
	if (!inode->bi_subvol) {
		BUG_ON(!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, lostfound.bi_snapshot));
		lostfound.bi_snapshot = inode->bi_snapshot;
	}

	ret = __bch2_fsck_write_inode(trans, &lostfound);
	if (ret)
		return ret;

	struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound);
	struct qstr name = QSTR(name_buf);

	inode->bi_dir = lostfound.bi_inum;

	ret = bch2_dirent_create_snapshot(trans,
				inode->bi_parent_subvol, lostfound.bi_inum,
				dirent_snapshot,
				&dir_hash,
				inode_d_type(inode),
				&name,
				inode->bi_subvol ?: inode->bi_inum,
				&inode->bi_dir_offset,
				BTREE_UPDATE_internal_snapshot_node|
				STR_HASH_must_create);
	if (ret) {
		bch_err_msg(c, ret, "error creating dirent");
		return ret;
	}

	ret = __bch2_fsck_write_inode(trans, inode);
	if (ret)
		return ret;

	/*
	 * Fix up inodes in child snapshots: if they should also be reattached
	 * update the backpointer field, if they should not be we need to emit
	 * whiteouts for the dirent we just created.
	 */
	if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) {
		snapshot_id_list whiteouts_done;
		struct btree_iter iter;
		struct bkey_s_c k;

		darray_init(&whiteouts_done);

		/* Walk older versions of this inode (snapshot ids below ours): */
		for_each_btree_key_reverse_norestart(trans, iter,
				BTREE_ID_inodes, SPOS(0, inode->bi_inum, inode->bi_snapshot - 1),
				BTREE_ITER_all_snapshots|BTREE_ITER_intent, k, ret) {
			if (k.k->p.offset != inode->bi_inum)
				break;

			if (!bkey_is_inode(k.k) ||
			    !bch2_snapshot_is_ancestor(c, k.k->p.snapshot, inode->bi_snapshot) ||
			    snapshot_list_has_ancestor(c, &whiteouts_done, k.k->p.snapshot))
				continue;

			struct bch_inode_unpacked child_inode;
			ret = bch2_inode_unpack(k, &child_inode);
			if (ret)
				break;

			if (!inode_should_reattach(&child_inode)) {
				/* hide the new dirent from this child snapshot */
				ret = maybe_delete_dirent(trans,
							  SPOS(lostfound.bi_inum, inode->bi_dir_offset,
							       dirent_snapshot),
							  k.k->p.snapshot);
				if (ret)
					break;

				ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot);
				if (ret)
					break;
			} else {
				/* propagate the new backpointer to the child version */
				iter.snapshot = k.k->p.snapshot;
				child_inode.bi_dir = inode->bi_dir;
				child_inode.bi_dir_offset = inode->bi_dir_offset;

				ret = bch2_inode_write_flags(trans, &iter, &child_inode,
							     BTREE_UPDATE_internal_snapshot_node);
				if (ret)
					break;
			}
		}
		darray_exit(&whiteouts_done);
		bch2_trans_iter_exit(trans, &iter);
	}

	return ret;
}
482 
/*
 * Fetch the dirent at @pos; errors (including "not a dirent") are returned
 * via bkey_err() on the result.
 */
static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
						struct btree_iter *iter,
						struct bpos pos)
{
	return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
}
489 
/*
 * Remove the dirent that @inode's backpointer fields (bi_dir/bi_dir_offset)
 * point at, after verifying it does point back to this inode. A no-op if the
 * inode has no backpointer.
 */
static int remove_backpointer(struct btree_trans *trans,
			      struct bch_inode_unpacked *inode)
{
	if (!inode->bi_dir)
		return 0;

	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c_dirent d = dirent_get_by_pos(trans, &iter,
				     SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot));
	int ret = bkey_err(d) ?:
		  dirent_points_to_inode(c, d, inode) ?:
		  bch2_fsck_remove_dirent(trans, d.k->p);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
506 
/*
 * Reattach a disconnected subvolume: drop any stale dirent pointing at its
 * root inode, then reattach the root inode to lost+found.
 */
static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s)
{
	struct bch_fs *c = trans->c;

	struct bch_inode_unpacked inode;
	int ret = bch2_inode_find_by_inum_trans(trans,
				(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
				&inode);
	if (ret)
		return ret;

	ret = remove_backpointer(trans, &inode);
	/* ENOENT is expected (no dirent to remove); don't log it */
	if (!bch2_err_matches(ret, ENOENT))
		bch_err_msg(c, ret, "removing dirent");
	if (ret)
		return ret;

	ret = reattach_inode(trans, &inode);
	bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
	return ret;
}
528 
/*
 * Recreate a missing subvolume key for @subvolid with snapshot @snapshotid
 * and root inode @inum (creating a new root inode if @inum is 0), and fix up
 * the corresponding snapshot and snapshot tree keys.
 */
static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
{
	struct bch_fs *c = trans->c;

	if (!bch2_snapshot_is_leaf(c, snapshotid)) {
		bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
		return bch_err_throw(c, fsck_repair_unimplemented);
	}

	/*
	 * If inum isn't set, that means we're being called from check_dirents,
	 * not check_inodes - the root of this subvolume doesn't exist or we
	 * would have found it there:
	 */
	if (!inum) {
		struct btree_iter inode_iter = {};
		struct bch_inode_unpacked new_inode;
		u64 cpu = raw_smp_processor_id();

		bch2_inode_init_early(c, &new_inode);
		bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);

		new_inode.bi_subvol = subvolid;

		int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
			  bch2_btree_iter_traverse(trans, &inode_iter) ?:
			  bch2_inode_write(trans, &inode_iter, &new_inode);
		bch2_trans_iter_exit(trans, &inode_iter);
		if (ret)
			return ret;

		inum = new_inode.bi_inum;
	}

	bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);

	/* Insert the new subvolume key: */
	struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
	int ret = PTR_ERR_OR_ZERO(new_subvol);
	if (ret)
		return ret;

	bkey_subvolume_init(&new_subvol->k_i);
	new_subvol->k.p.offset	= subvolid;
	new_subvol->v.snapshot	= cpu_to_le32(snapshotid);
	new_subvol->v.inode	= cpu_to_le64(inum);
	ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
	if (ret)
		return ret;

	/* Point the snapshot key back at the new subvolume: */
	struct btree_iter iter;
	struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_snapshots, POS(0, snapshotid),
			0, snapshot);
	ret = PTR_ERR_OR_ZERO(s);
	bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
	if (ret)
		return ret;

	u32 snapshot_tree = le32_to_cpu(s->v.tree);

	s->v.subvol = cpu_to_le32(subvolid);
	SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
	bch2_trans_iter_exit(trans, &iter);

	/* Ensure the snapshot tree has a master subvolume: */
	struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
			0, snapshot_tree);
	ret = PTR_ERR_OR_ZERO(st);
	bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
	if (ret)
		return ret;

	if (!st->v.master_subvol)
		st->v.master_subvol = cpu_to_le32(subvolid);

	bch2_trans_iter_exit(trans, &iter);
	return 0;
}
607 
/*
 * Recreate a missing inode for @inum in @snapshot, inferring its type and
 * size from the btree (@btree) whose keys referenced it: extents -> regular
 * file sized to the last extent, dirents -> directory, xattrs -> regular
 * file.
 */
static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum)
{
	struct bch_fs *c = trans->c;
	unsigned i_mode = S_IFREG;
	u64 i_size = 0;

	switch (btree) {
	case BTREE_ID_extents: {
		struct btree_iter iter = {};

		/* find the last extent to derive i_size */
		bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
		struct bkey_s_c k = bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum, 0));
		bch2_trans_iter_exit(trans, &iter);
		int ret = bkey_err(k);
		if (ret)
			return ret;

		/* offset is in 512-byte sectors, i_size in bytes */
		i_size = k.k->p.offset << 9;
		break;
	}
	case BTREE_ID_dirents:
		i_mode = S_IFDIR;
		break;
	case BTREE_ID_xattrs:
		break;
	default:
		BUG();
	}

	struct bch_inode_unpacked new_inode;
	bch2_inode_init_early(c, &new_inode);
	bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL);
	new_inode.bi_size = i_size;
	new_inode.bi_inum = inum;
	new_inode.bi_snapshot = snapshot;

	return __bch2_fsck_write_inode(trans, &new_inode);
}
646 
/* Free the snapshot id list backing @s. */
static inline void snapshots_seen_exit(struct snapshots_seen *s)
{
	darray_exit(&s->ids);
}
651 
/* Zero-initialize @s; pairs with snapshots_seen_exit(). */
static inline void snapshots_seen_init(struct snapshots_seen *s)
{
	memset(s, 0, sizeof(*s));
}
656 
/*
 * Insert snapshot @id into @s->ids, keeping the list sorted ascending;
 * no-op if already present.
 */
static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
{
	u32 *i;
	__darray_for_each(s->ids, i) {
		if (*i == id)
			return 0;
		if (*i > id)
			break;
	}

	/* i now points at the first element > id (or one past the end) */
	int ret = darray_insert_item(&s->ids, i - s->ids.data, id);
	if (ret)
		bch_err(c, "error reallocating snapshots_seen table (size %zu)",
			s->ids.size);
	return ret;
}
673 
/*
 * Record that we've seen a key at @pos: reset the seen-list when moving to a
 * new position (ignoring snapshot field), then add pos.snapshot.
 */
static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
				 enum btree_id btree_id, struct bpos pos)
{
	if (!bkey_eq(s->pos, pos))
		s->ids.nr = 0;
	s->pos = pos;

	return snapshot_list_add_nodup(c, &s->ids, pos.snapshot);
}
683 
/**
 * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
 * and @ancestor hasn't been overwritten in @seen
 *
 * @c:		filesystem handle
 * @seen:	list of snapshot ids already seen at current position
 * @id:		descendent snapshot id
 * @ancestor:	ancestor snapshot id
 *
 * Returns:	whether key in @ancestor snapshot is visible in @id snapshot
 */
static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
				    u32 id, u32 ancestor)
{
	ssize_t i;

	EBUG_ON(id > ancestor);

	/* @ancestor should be the snapshot most recently added to @seen */
	EBUG_ON(ancestor != seen->pos.snapshot);
	EBUG_ON(ancestor != darray_last(seen->ids));

	if (id == ancestor)
		return true;

	if (!bch2_snapshot_is_ancestor(c, id, ancestor))
		return false;

	/*
	 * We know that @id is a descendant of @ancestor, we're checking if
	 * we've seen a key that overwrote @ancestor - i.e. also a descendent of
	 * @ancestor and with @id as a descendent.
	 *
	 * But we already know that we're scanning IDs between @id and @ancestor
	 * numerically, since snapshot ID lists are kept sorted, so if we find
	 * an id that's an ancestor of @id we're done:
	 */

	/* skip the last entry (@ancestor itself), walk down toward @id */
	for (i = seen->ids.nr - 2;
	     i >= 0 && seen->ids.data[i] >= id;
	     --i)
		if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]))
			return false;

	return true;
}
730 
731 /**
732  * ref_visible - given a key with snapshot id @src that points to a key with
733  * snapshot id @dst, test whether there is some snapshot in which @dst is
734  * visible.
735  *
736  * @c:		filesystem handle
737  * @s:		list of snapshot IDs already seen at @src
738  * @src:	snapshot ID of src key
739  * @dst:	snapshot ID of dst key
740  * Returns:	true if there is some snapshot in which @dst is visible
741  *
742  * Assumes we're visiting @src keys in natural key order
743  */
744 static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
745 			u32 src, u32 dst)
746 {
747 	return dst <= src
748 		? key_visible_in_snapshot(c, s, dst, src)
749 		: bch2_snapshot_is_ancestor(c, src, dst);
750 }
751 
/*
 * Two-sided version of ref_visible(): normalize so @src is the numerically
 * larger id, then check visibility. Note @dst_seen only participates in the
 * swap; after normalization only the larger side's seen-list is consulted.
 */
static int ref_visible2(struct bch_fs *c,
			u32 src, struct snapshots_seen *src_seen,
			u32 dst, struct snapshots_seen *dst_seen)
{
	if (dst > src) {
		swap(dst, src);
		swap(dst_seen, src_seen);
	}
	return key_visible_in_snapshot(c, src_seen, dst, src);
}
762 
/*
 * Iterate @_i over entries of inode walker @_w whose version is visible in
 * snapshot @_snapshot (per @_s). Relies on the inodes list being ordered by
 * ascending bi_snapshot, which is how get_inodes_all_snapshots() fills it.
 */
#define for_each_visible_inode(_c, _s, _w, _snapshot, _i)				\
	for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr &&	\
	     (_i)->inode.bi_snapshot <= (_snapshot); _i++)				\
		if (key_visible_in_snapshot(_c, _s, _i->inode.bi_snapshot, _snapshot))
767 
/* One version (snapshot) of an inode tracked by struct inode_walker. */
struct inode_walker_entry {
	struct bch_inode_unpacked inode;
	/* key at this position was a whiteout, not a real inode */
	bool			whiteout;
	/* running tallies accumulated by the checking passes (e.g. sectors/dirents) */
	u64			count;
	u64			i_size;
};
774 
/*
 * Tracks all snapshot versions of the inode currently being checked, so
 * passes that scan extents/dirents can attribute each key to the right
 * inode version.
 */
struct inode_walker {
	bool				first_this_inode;
	/* inodes[] is valid for last_pos.inode */
	bool				have_inodes;
	bool				recalculate_sums;
	struct bpos			last_pos;

	/* all versions of the current inode, one entry per snapshot */
	DARRAY(struct inode_walker_entry) inodes;
	snapshot_id_list		deletes;
};
784 
/* Free the darrays backing @w. */
static void inode_walker_exit(struct inode_walker *w)
{
	darray_exit(&w->inodes);
	darray_exit(&w->deletes);
}
790 
/* Return a zeroed inode_walker; pairs with inode_walker_exit(). */
static struct inode_walker inode_walker_init(void)
{
	return (struct inode_walker) { 0, };
}
795 
796 static int add_inode(struct bch_fs *c, struct inode_walker *w,
797 		     struct bkey_s_c inode)
798 {
799 	int ret = darray_push(&w->inodes, ((struct inode_walker_entry) {
800 		.whiteout	= !bkey_is_inode(inode.k),
801 	}));
802 	if (ret)
803 		return ret;
804 
805 	struct inode_walker_entry *n = &darray_last(w->inodes);
806 	if (!n->whiteout) {
807 		return bch2_inode_unpack(inode, &n->inode);
808 	} else {
809 		n->inode.bi_inum	= inode.k->p.inode;
810 		n->inode.bi_snapshot	= inode.k->p.snapshot;
811 		return 0;
812 	}
813 }
814 
/*
 * Load every snapshot version of inode @inum into @w->inodes (in ascending
 * snapshot order, as the btree iterates), resetting per-inode state first.
 */
static int get_inodes_all_snapshots(struct btree_trans *trans,
				    struct inode_walker *w, u64 inum)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	/*
	 * We no longer have inodes for w->last_pos; clear this to avoid
	 * screwing up check_i_sectors/check_subdir_count if we take a
	 * transaction restart here:
	 */
	w->have_inodes = false;
	w->recalculate_sums = false;
	w->inodes.nr = 0;

	for_each_btree_key_max_norestart(trans, iter,
			BTREE_ID_inodes, POS(0, inum), SPOS(0, inum, U32_MAX),
			BTREE_ITER_all_snapshots, k, ret) {
		ret = add_inode(c, w, k);
		if (ret)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		return ret;

	w->first_this_inode = true;
	w->have_inodes = true;
	return 0;
}
848 
/*
 * Collect versions of inode @inum visible from s->pos.snapshot: visible
 * inodes go into @w->inodes, visible whiteouts/deletions into @w->deletes.
 */
static int get_visible_inodes(struct btree_trans *trans,
			      struct inode_walker *w,
			      struct snapshots_seen *s,
			      u64 inum)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	w->inodes.nr = 0;
	w->deletes.nr = 0;

	/* walk newest-to-oldest so deletions shadow older versions */
	for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot),
			   BTREE_ITER_all_snapshots, k, ret) {
		if (k.k->p.offset != inum)
			break;

		if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot))
			continue;

		/* shadowed by a deletion in a newer snapshot */
		if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot))
			continue;

		ret = bkey_is_inode(k.k)
			? add_inode(c, w, k)
			: snapshot_list_add(c, &w->deletes, k.k->p.snapshot);
		if (ret)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}
883 
/*
 * Find the inode_walker_entry whose snapshot version key @k belongs to.
 *
 * If the key's snapshot differs from the matching inode's (key exists in a
 * snapshot where the inode wasn't updated - unexpected), fsck repairs by
 * duplicating the inode (or whiteout) into the key's snapshot, inserting a
 * new walker entry in sorted position, and returning
 * transaction_restart_nested.
 *
 * Returns the entry, NULL if no version matches, or ERR_PTR on error.
 */
static struct inode_walker_entry *
lookup_inode_for_snapshot(struct btree_trans *trans, struct inode_walker *w, struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;

	/* first entry whose snapshot is an ancestor of (or equal to) the key's */
	struct inode_walker_entry *i = darray_find_p(w->inodes, i,
		    bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->inode.bi_snapshot));

	if (!i)
		return NULL;

	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (fsck_err_on(k.k->p.snapshot != i->inode.bi_snapshot,
			trans, snapshot_key_missing_inode_snapshot,
			 "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
			 "unexpected because we should always update the inode when we update a key in that inode\n"
			 "%s",
			 w->last_pos.inode, k.k->p.snapshot, i->inode.bi_snapshot,
			 (bch2_bkey_val_to_text(&buf, c, k),
			  buf.buf))) {
		struct bch_inode_unpacked new = i->inode;
		struct bkey_i whiteout;

		new.bi_snapshot = k.k->p.snapshot;

		/* copy whichever we found - real inode or whiteout - into the key's snapshot */
		if (!i->whiteout) {
			ret = __bch2_fsck_write_inode(trans, &new);
		} else {
			bkey_init(&whiteout.k);
			whiteout.k.type = KEY_TYPE_whiteout;
			whiteout.k.p = SPOS(0, i->inode.bi_inum, i->inode.bi_snapshot);
			ret = bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
							  &whiteout,
							  BTREE_UPDATE_internal_snapshot_node);
		}

		if (ret)
			goto fsck_err;

		ret = bch2_trans_commit(trans, NULL, NULL, 0);
		if (ret)
			goto fsck_err;

		struct inode_walker_entry new_entry = *i;

		new_entry.inode.bi_snapshot	= k.k->p.snapshot;
		new_entry.count			= 0;
		new_entry.i_size		= 0;

		/* keep w->inodes sorted by ascending snapshot id */
		while (i > w->inodes.data && i[-1].inode.bi_snapshot > k.k->p.snapshot)
			--i;

		size_t pos = i - w->inodes.data;
		ret = darray_insert_item(&w->inodes, pos, new_entry);
		if (ret)
			goto fsck_err;

		ret = bch_err_throw(c, transaction_restart_nested);
		goto fsck_err;
	}

	printbuf_exit(&buf);
	return i;
fsck_err:
	printbuf_exit(&buf);
	return ERR_PTR(ret);
}
953 
954 static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
955 					     struct inode_walker *w,
956 					     struct bkey_s_c k)
957 {
958 	if (w->last_pos.inode != k.k->p.inode) {
959 		int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
960 		if (ret)
961 			return ERR_PTR(ret);
962 	}
963 
964 	w->last_pos = k.k->p;
965 
966 	return lookup_inode_for_snapshot(trans, w, k);
967 }
968 
969 /*
970  * Prefer to delete the first one, since that will be the one at the wrong
971  * offset:
972  * return value: 0 -> delete k1, 1 -> delete k2
973  */
/*
 * After (re)writing dirent @new, update bi_dir_offset in every inode version
 * visible where the dirent lives, so their backpointers match the dirent's
 * position. DT_SUBVOL dirents are not supported.
 */
int bch2_fsck_update_backpointers(struct btree_trans *trans,
				  struct snapshots_seen *s,
				  const struct bch_hash_desc desc,
				  struct bch_hash_info *hash_info,
				  struct bkey_i *new)
{
	if (new->k.type != KEY_TYPE_dirent)
		return 0;

	struct bkey_i_dirent *d = bkey_i_to_dirent(new);
	struct inode_walker target = inode_walker_init();
	int ret = 0;

	if (d->v.d_type == DT_SUBVOL) {
		bch_err(trans->c, "%s does not support DT_SUBVOL", __func__);
		ret = -BCH_ERR_fsck_repair_unimplemented;
	} else {
		ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum));
		if (ret)
			goto err;

		darray_for_each(target.inodes, i) {
			i->inode.bi_dir_offset = d->k.p.offset;
			ret = __bch2_fsck_write_inode(trans, &i->inode);
			if (ret)
				goto err;
		}
	}
err:
	inode_walker_exit(&target);
	return ret;
}
1006 
/*
 * Fetch the dirent @inode's backpointer fields reference. For a subvolume
 * root the dirent lives in the parent subvolume, so *snapshot is first
 * remapped to the parent subvolume's snapshot. Errors come back via
 * bkey_err() on the result.
 */
static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
					       struct btree_iter *iter,
					       struct bch_inode_unpacked *inode,
					       u32 *snapshot)
{
	if (inode->bi_subvol) {
		u64 inum;
		int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
		if (ret)
			return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) });
	}

	return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
}
1021 
/*
 * Returns 1 if position @p is on the deleted-inodes list, 0 if not, or a
 * negative error.
 */
static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
	int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
1030 
/*
 * Verify that the dirent this inode's backpointer fields point at actually
 * points back at this inode; on mismatch, clear bi_dir/bi_dir_offset and set
 * *write_inode so the caller rewrites the inode.
 *
 * Repairs are not errors: returns 0 after repair, nonzero only on hard error.
 */
static int check_inode_dirent_inode(struct btree_trans *trans,
				    struct bch_inode_unpacked *inode,
				    bool *write_inode)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;

	u32 inode_snapshot = inode->bi_snapshot;
	struct btree_iter dirent_iter = {};
	struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
	int ret = bkey_err(d);
	/* ENOENT (missing dirent) is handled via fsck_err below: */
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if ((ret || dirent_points_to_inode_nowarn(c, d, inode)) &&
	    inode->bi_subvol &&
	    (inode->bi_flags & BCH_INODE_has_child_snapshot)) {
		/* Older version of a renamed subvolume root: we won't have a
		 * correct dirent for it. That's expected, see
		 * inode_should_reattach().
		 *
		 * We don't clear the backpointer field when doing the rename
		 * because there might be arbitrarily many versions in older
		 * snapshots.
		 */
		inode->bi_dir = 0;
		inode->bi_dir_offset = 0;
		*write_inode = true;
		goto out;
	}

	if (fsck_err_on(ret,
			trans, inode_points_to_missing_dirent,
			"inode points to missing dirent\n%s",
			(bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) ||
	    fsck_err_on(!ret && dirent_points_to_inode_nowarn(c, d, inode),
			trans, inode_points_to_wrong_dirent,
			"%s",
			(printbuf_reset(&buf),
			 dirent_inode_mismatch_msg(&buf, c, d, inode),
			 buf.buf))) {
		/*
		 * We just clear the backpointer fields for now. If we find a
		 * dirent that points to this inode in check_dirents(), we'll
		 * update it then; then when we get to check_path() if the
		 * backpointer is still 0 we'll reattach it.
		 */
		inode->bi_dir = 0;
		inode->bi_dir_offset = 0;
		*write_inode = true;
	}
out:
	ret = 0;
fsck_err:
	bch2_trans_iter_exit(trans, &dirent_iter);
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}
1090 
/*
 * Check and repair a single inode key: snapshot validity, hash seed/type
 * consistency with the snapshot root, dirent backpointer, unlinked handling
 * (deleted list offline, deletion online), subvol pointers, and various flag
 * invariants. Repairs are gated through fsck_err()/fsck_err_on().
 *
 * @snapshot_root caches the snapshot root inode across calls for the current
 * inode number; @s is the snapshots-seen state for this btree walk.
 */
static int check_inode(struct btree_trans *trans,
		       struct btree_iter *iter,
		       struct bkey_s_c k,
		       struct bch_inode_unpacked *snapshot_root,
		       struct snapshots_seen *s)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	struct bch_inode_unpacked u;
	bool do_update = false;
	int ret;

	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret < 0)
		goto err;
	/* positive return: key's snapshot was bad, nothing more to check */
	if (ret)
		return 0;

	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
	if (ret)
		goto err;

	if (!bkey_is_inode(k.k))
		return 0;

	ret = bch2_inode_unpack(k, &u);
	if (ret)
		goto err;

	/* Refresh the cached snapshot root when we move to a new inode number: */
	if (snapshot_root->bi_inum != u.bi_inum) {
		ret = bch2_inode_find_snapshot_root(trans, u.bi_inum, snapshot_root);
		if (ret)
			goto err;
	}

	/* All snapshot versions must agree on hash seed and hash type: */
	if (u.bi_hash_seed	!= snapshot_root->bi_hash_seed ||
	    INODE_STR_HASH(&u)	!= INODE_STR_HASH(snapshot_root)) {
		ret = bch2_repair_inode_hash_info(trans, snapshot_root);
		BUG_ON(ret == -BCH_ERR_fsck_repair_unimplemented);
		if (ret)
			goto err;
	}

	ret = bch2_check_inode_has_case_insensitive(trans, &u, &s->ids, &do_update);
	if (ret)
		goto err;

	/* Nonzero backpointer fields: verify the dirent points back at us: */
	if (u.bi_dir || u.bi_dir_offset) {
		ret = check_inode_dirent_inode(trans, &u, &do_update);
		if (ret)
			goto err;
	}

	if (fsck_err_on(u.bi_dir && (u.bi_flags & BCH_INODE_unlinked),
			trans, inode_unlinked_but_has_dirent,
			"inode unlinked but has dirent\n%s",
			(printbuf_reset(&buf),
			 bch2_inode_unpacked_to_text(&buf, &u),
			 buf.buf))) {
		u.bi_flags &= ~BCH_INODE_unlinked;
		do_update = true;
	}

	if (S_ISDIR(u.bi_mode) && (u.bi_flags & BCH_INODE_unlinked)) {
		/* Check for this early so that check_unreachable_inode() will reattach it */

		ret = bch2_empty_dir_snapshot(trans, k.k->p.offset, 0, k.k->p.snapshot);
		if (ret && ret != -BCH_ERR_ENOTEMPTY_dir_not_empty)
			goto err;

		fsck_err_on(ret, trans, inode_dir_unlinked_but_not_empty,
			    "dir unlinked but not empty\n%s",
			    (printbuf_reset(&buf),
			     bch2_inode_unpacked_to_text(&buf, &u),
			     buf.buf));
		u.bi_flags &= ~BCH_INODE_unlinked;
		do_update = true;
		ret = 0;
	}

	if (fsck_err_on(S_ISDIR(u.bi_mode) && u.bi_size,
			trans, inode_dir_has_nonzero_i_size,
			"directory %llu:%u with nonzero i_size %lli",
			u.bi_inum, u.bi_snapshot, u.bi_size)) {
		u.bi_size = 0;
		do_update = true;
	}

	/* Returns 0/1 (has child snapshots) or negative error: */
	ret = bch2_inode_has_child_snapshots(trans, k.k->p);
	if (ret < 0)
		goto err;

	if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot),
			trans, inode_has_child_snapshots_wrong,
			"inode has_child_snapshots flag wrong (should be %u)\n%s",
			ret,
			(printbuf_reset(&buf),
			 bch2_inode_unpacked_to_text(&buf, &u),
			 buf.buf))) {
		if (ret)
			u.bi_flags |= BCH_INODE_has_child_snapshot;
		else
			u.bi_flags &= ~BCH_INODE_has_child_snapshot;
		do_update = true;
	}
	ret = 0;

	if ((u.bi_flags & BCH_INODE_unlinked) &&
	    !(u.bi_flags & BCH_INODE_has_child_snapshot)) {
		if (!test_bit(BCH_FS_started, &c->flags)) {
			/*
			 * If we're not in online fsck, don't delete unlinked
			 * inodes, just make sure they're on the deleted list.
			 *
			 * They might be referred to by a logged operation -
			 * i.e. we might have crashed in the middle of a
			 * truncate on an unlinked but open file - so we want to
			 * let the delete_dead_inodes kill it after resuming
			 * logged ops.
			 */
			ret = check_inode_deleted_list(trans, k.k->p);
			if (ret < 0)
				goto err_noprint;

			fsck_err_on(!ret,
				    trans, unlinked_inode_not_on_deleted_list,
				    "inode %llu:%u unlinked, but not on deleted list",
				    u.bi_inum, k.k->p.snapshot);

			ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, k.k->p, 1);
			if (ret)
				goto err;
		} else {
			/* Online fsck: safe to delete if nothing has it open: */
			ret = bch2_inode_or_descendents_is_open(trans, k.k->p);
			if (ret < 0)
				goto err;

			if (fsck_err_on(!ret,
					trans, inode_unlinked_and_not_open,
				      "inode %llu:%u unlinked and not open",
				      u.bi_inum, u.bi_snapshot)) {
				ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
				bch_err_msg(c, ret, "in fsck deleting inode");
				goto err_noprint;
			}
			ret = 0;
		}
	}

	if (fsck_err_on(u.bi_parent_subvol &&
			(u.bi_subvol == 0 ||
			 u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
			trans, inode_bi_parent_nonzero,
			"inode %llu:%u has subvol %u but nonzero parent subvol %u",
			u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
		u.bi_parent_subvol = 0;
		do_update = true;
	}

	if (u.bi_subvol) {
		struct bch_subvolume s;

		ret = bch2_subvolume_get(trans, u.bi_subvol, false, &s);
		if (ret && !bch2_err_matches(ret, ENOENT))
			goto err;

		/* Subvol missing but subvolumes btree was lost: rebuild it: */
		if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
			ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum);
			goto do_update;
		}

		if (fsck_err_on(ret,
				trans, inode_bi_subvol_missing,
				"inode %llu:%u bi_subvol points to missing subvolume %u",
				u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
		    fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
				!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
							   k.k->p.snapshot),
				trans, inode_bi_subvol_wrong,
				"inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
				u.bi_inum, k.k->p.snapshot, u.bi_subvol,
				le64_to_cpu(s.inode),
				le32_to_cpu(s.snapshot))) {
			u.bi_subvol = 0;
			u.bi_parent_subvol = 0;
			do_update = true;
		}
	}

	if (fsck_err_on(u.bi_journal_seq > journal_cur_seq(&c->journal),
			trans, inode_journal_seq_in_future,
			"inode journal seq in future (currently at %llu)\n%s",
			journal_cur_seq(&c->journal),
			(printbuf_reset(&buf),
			 bch2_inode_unpacked_to_text(&buf, &u),
			buf.buf))) {
		u.bi_journal_seq = journal_cur_seq(&c->journal);
		do_update = true;
	}
do_update:
	if (do_update) {
		ret = __bch2_fsck_write_inode(trans, &u);
		bch_err_msg(c, ret, "in fsck updating inode");
		if (ret)
			goto err_noprint;
	}
err:
fsck_err:
	bch_err_fn(c, ret);
err_noprint:
	printbuf_exit(&buf);
	return ret;
}
1304 
/*
 * fsck pass: walk every inode in every snapshot and run check_inode() on it,
 * committing after each key. snapshot_root is carried across iterations as a
 * per-inode-number cache.
 */
int bch2_check_inodes(struct bch_fs *c)
{
	struct bch_inode_unpacked snapshot_root = {};
	struct snapshots_seen s;

	snapshots_seen_init(&s);

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
				POS_MIN,
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_inode(trans, &iter, k, &snapshot_root, &s)));

	snapshots_seen_exit(&s);
	bch_err_fn(c, ret);
	return ret;
}
1323 
/*
 * Given an inode that needs reattaching, scan other snapshot versions of the
 * same inode number (at higher snapshot ids, which are candidate ancestor
 * versions) and replace *inode with the last one that also needs reattaching,
 * so the reattach happens at the oldest version that needs it.
 */
static int find_oldest_inode_needs_reattach(struct btree_trans *trans,
					    struct bch_inode_unpacked *inode)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	/*
	 * We look for inodes to reattach in natural key order, leaves first,
	 * but we should do the reattach at the oldest version that needs to be
	 * reattached:
	 */
	for_each_btree_key_norestart(trans, iter,
				     BTREE_ID_inodes,
				     SPOS(0, inode->bi_inum, inode->bi_snapshot + 1),
				     BTREE_ITER_all_snapshots, k, ret) {
		/* Past the last key for this inode number: */
		if (k.k->p.offset != inode->bi_inum)
			break;

		/* Only versions in our snapshot ancestry are relevant: */
		if (!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, k.k->p.snapshot))
			continue;

		if (!bkey_is_inode(k.k))
			break;

		struct bch_inode_unpacked parent_inode;
		ret = bch2_inode_unpack(k, &parent_inode);
		if (ret)
			break;

		if (!inode_should_reattach(&parent_inode))
			break;

		*inode = parent_inode;
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}
1364 
/*
 * If this inode needs reattaching (see inode_should_reattach()), find the
 * oldest version that needs it and, after confirming via fsck_err(), reattach
 * it (to lost+found - see reattach_inode()).
 */
static int check_unreachable_inode(struct btree_trans *trans,
				   struct btree_iter *iter,
				   struct bkey_s_c k)
{
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (!bkey_is_inode(k.k))
		return 0;

	struct bch_inode_unpacked inode;
	ret = bch2_inode_unpack(k, &inode);
	if (ret)
		return ret;

	if (!inode_should_reattach(&inode))
		return 0;

	ret = find_oldest_inode_needs_reattach(trans, &inode);
	if (ret)
		return ret;

	if (fsck_err(trans, inode_unreachable,
		     "unreachable inode:\n%s",
		     (bch2_inode_unpacked_to_text(&buf, &inode),
		      buf.buf)))
		ret = reattach_inode(trans, &inode);
fsck_err:
	printbuf_exit(&buf);
	return ret;
}
1396 
1397 /*
1398  * Reattach unreachable (but not unlinked) inodes
1399  *
 * Run after check_inodes() and check_dirents(), so we know that inode
1401  * backpointer fields point to valid dirents, and every inode that has a dirent
1402  * that points to it has its backpointer field set - so we're just looking for
1403  * non-unlinked inodes without backpointers:
1404  *
1405  * XXX: this is racy w.r.t. hardlink removal in online fsck
1406  */
int bch2_check_unreachable_inodes(struct bch_fs *c)
{
	/* One commit per inode checked: */
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
				POS_MIN,
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_unreachable_inode(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}
1418 
1419 static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode)
1420 {
1421 	switch (btree) {
1422 	case BTREE_ID_extents:
1423 		return S_ISREG(mode) || S_ISLNK(mode);
1424 	case BTREE_ID_dirents:
1425 		return S_ISDIR(mode);
1426 	case BTREE_ID_xattrs:
1427 		return true;
1428 	default:
1429 		BUG();
1430 	}
1431 }
1432 
/*
 * Check that a key in an inode-indexed btree (extents/dirents/xattrs) belongs
 * to an existing inode of the right type; delete it if not. If the inodes
 * btree was lost, reconstruct the inode instead and restart the walk.
 *
 * @i is the inode_walker entry for this key's snapshot (may be an ERR_PTR,
 * propagated here, or NULL if no inode was found).
 */
static int check_key_has_inode(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct inode_walker *inode,
			       struct inode_walker_entry *i,
			       struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = PTR_ERR_OR_ZERO(i);
	if (ret)
		return ret;

	if (k.k->type == KEY_TYPE_whiteout)
		goto out;

	bool have_inode = i && !i->whiteout;

	if (!have_inode && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
		ret =   reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?:
			bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
		if (ret)
			goto err;

		/* Force the walker to re-read this inode number: */
		inode->last_pos.inode--;
		ret = bch_err_throw(c, transaction_restart_nested);
		goto err;
	}

	if (fsck_err_on(!have_inode,
			trans, key_in_missing_inode,
			"key in missing inode:\n%s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
		goto delete;

	if (fsck_err_on(have_inode && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode),
			trans, key_in_wrong_inode_type,
			"key for wrong inode mode %o:\n%s",
			i->inode.bi_mode,
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
		goto delete;
out:
err:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
delete:
	ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
	goto out;
}
1485 
/*
 * Compare each walked inode version's bi_sectors against the count fsck
 * accumulated while walking its extents; repair mismatches. Re-counts from
 * the btree when the accumulated count itself looks wrong (or when the walker
 * flagged recalculate_sums).
 */
static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
{
	struct bch_fs *c = trans->c;
	int ret = 0;
	s64 count2;

	darray_for_each(w->inodes, i) {
		if (i->inode.bi_sectors == i->count)
			continue;

		count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->inode.bi_snapshot);

		if (w->recalculate_sums)
			i->count = count2;

		/* Our running count disagreeing with a fresh count is an fsck bug: */
		if (i->count != count2) {
			bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
					    w->last_pos.inode, i->inode.bi_snapshot, i->count, count2);
			i->count = count2;
		}

		if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
				trans, inode_i_sectors_wrong,
				"inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
				w->last_pos.inode, i->inode.bi_snapshot,
				i->inode.bi_sectors, i->count)) {
			i->inode.bi_sectors = i->count;
			ret = bch2_fsck_write_inode(trans, &i->inode);
			if (ret)
				break;
		}
	}
fsck_err:
	bch_err_fn(c, ret);
	return ret;
}
1522 
1523 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
1524 {
1525 	u32 restart_count = trans->restart_count;
1526 	return check_i_sectors_notnested(trans, w) ?:
1527 		trans_was_restarted(trans, restart_count);
1528 }
1529 
/* End position of one extent in one snapshot, plus the snapshots it was
 * visible in when recorded (a private copy - see extent_ends_at()): */
struct extent_end {
	u32			snapshot;
	u64			offset;
	struct snapshots_seen	seen;
};

/* Most recently seen extent end per snapshot id, for one inode number;
 * last_pos detects re-processing after a transaction restart: */
struct extent_ends {
	struct bpos			last_pos;
	DARRAY(struct extent_end)	e;
};
1540 
/* Free each entry's snapshots_seen copy and empty the array (capacity kept): */
static void extent_ends_reset(struct extent_ends *extent_ends)
{
	darray_for_each(extent_ends->e, i)
		snapshots_seen_exit(&i->seen);
	extent_ends->e.nr = 0;
}
1547 
/* Free all entries and the array itself: */
static void extent_ends_exit(struct extent_ends *extent_ends)
{
	extent_ends_reset(extent_ends);
	darray_exit(&extent_ends->e);
}
1553 
/* Zero-initialize; pairs with extent_ends_exit(): */
static void extent_ends_init(struct extent_ends *extent_ends)
{
	memset(extent_ends, 0, sizeof(*extent_ends));
}
1558 
/*
 * Record the end position of extent @k: replace the existing entry for this
 * snapshot if there is one, otherwise insert keeping the array sorted by
 * snapshot id. The entry takes a private copy of @seen's id list so it stays
 * valid after the caller's snapshots_seen state changes.
 */
static int extent_ends_at(struct bch_fs *c,
			  struct extent_ends *extent_ends,
			  struct snapshots_seen *seen,
			  struct bkey_s_c k)
{
	struct extent_end *i, n = (struct extent_end) {
		.offset		= k.k->p.offset,
		.snapshot	= k.k->p.snapshot,
		.seen		= *seen,
	};

	n.seen.ids.data = kmemdup(seen->ids.data,
			      sizeof(seen->ids.data[0]) * seen->ids.size,
			      GFP_KERNEL);
	if (!n.seen.ids.data)
		return bch_err_throw(c, ENOMEM_fsck_extent_ends_at);

	__darray_for_each(extent_ends->e, i) {
		/* Same snapshot: replace in place, freeing the old copy: */
		if (i->snapshot == k.k->p.snapshot) {
			snapshots_seen_exit(&i->seen);
			*i = n;
			return 0;
		}

		/* Found the insertion point (sorted by snapshot id): */
		if (i->snapshot >= k.k->p.snapshot)
			break;
	}

	return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
}
1589 
/*
 * Repair a pair of overlapping extents: @pos1 is the end position of the
 * previously recorded extent, @pos2 the key of the newly seen one. Both keys
 * are re-looked-up in not-extents mode (exact positions, no merging), then
 * the extent in the older snapshot is overwritten by the one in the newer
 * snapshot and the result committed.
 *
 * Via *fixed / the return value the caller learns how to resume: adjusting
 * @extent_end, extending @pos1_seen, or restarting check_extent() entirely.
 */
static int overlapping_extents_found(struct btree_trans *trans,
				     enum btree_id btree,
				     struct bpos pos1, struct snapshots_seen *pos1_seen,
				     struct bkey pos2,
				     bool *fixed,
				     struct extent_end *extent_end)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	struct btree_iter iter1, iter2 = {};
	struct bkey_s_c k1, k2;
	int ret;

	/* pos1 (first extent's end) must come after pos2's start, or they don't overlap: */
	BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));

	bch2_trans_iter_init(trans, &iter1, btree, pos1,
			     BTREE_ITER_all_snapshots|
			     BTREE_ITER_not_extents);
	k1 = bch2_btree_iter_peek_max(trans, &iter1, POS(pos1.inode, U64_MAX));
	ret = bkey_err(k1);
	if (ret)
		goto err;

	prt_newline(&buf);
	bch2_bkey_val_to_text(&buf, c, k1);

	/* The first extent must still be exactly where we recorded it: */
	if (!bpos_eq(pos1, k1.k->p)) {
		prt_str(&buf, "\nwanted\n  ");
		bch2_bpos_to_text(&buf, pos1);
		prt_str(&buf, "\n");
		bch2_bkey_to_text(&buf, &pos2);

		bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
			__func__, buf.buf);
		ret = bch_err_throw(c, internal_fsck_err);
		goto err;
	}

	bch2_trans_copy_iter(trans, &iter2, &iter1);

	/* Advance a second iterator to the second extent: */
	while (1) {
		bch2_btree_iter_advance(trans, &iter2);

		k2 = bch2_btree_iter_peek_max(trans, &iter2, POS(pos1.inode, U64_MAX));
		ret = bkey_err(k2);
		if (ret)
			goto err;

		if (bpos_ge(k2.k->p, pos2.p))
			break;
	}

	prt_newline(&buf);
	bch2_bkey_val_to_text(&buf, c, k2);

	if (bpos_gt(k2.k->p, pos2.p) ||
	    pos2.size != k2.k->size) {
		bch_err(c, "%s: error finding seconding overlapping extent when repairing%s",
			__func__, buf.buf);
		ret = bch_err_throw(c, internal_fsck_err);
		goto err;
	}

	prt_printf(&buf, "\noverwriting %s extent",
		   pos1.snapshot >= pos2.p.snapshot ? "first" : "second");

	if (fsck_err(trans, extent_overlapping,
		     "overlapping extents%s", buf.buf)) {
		/* Overwrite the extent in the older snapshot (k1 after swap): */
		struct btree_iter *old_iter = &iter1;
		struct disk_reservation res = { 0 };

		if (pos1.snapshot < pos2.p.snapshot) {
			old_iter = &iter2;
			swap(k1, k2);
		}

		trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);

		ret =   bch2_trans_update_extent_overwrite(trans, old_iter,
				BTREE_UPDATE_internal_snapshot_node,
				k1, k2) ?:
			bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
		bch2_disk_reservation_put(c, &res);

		bch_info(c, "repair ret %s", bch2_err_str(ret));

		if (ret)
			goto err;

		*fixed = true;

		if (pos1.snapshot == pos2.p.snapshot) {
			/*
			 * We overwrote the first extent, and did the overwrite
			 * in the same snapshot:
			 */
			extent_end->offset = bkey_start_offset(&pos2);
		} else if (pos1.snapshot > pos2.p.snapshot) {
			/*
			 * We overwrote the first extent in pos2's snapshot:
			 */
			ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
		} else {
			/*
			 * We overwrote the second extent - restart
			 * check_extent() from the top:
			 */
			ret = bch_err_throw(c, transaction_restart_nested);
		}
	}
fsck_err:
err:
	bch2_trans_iter_exit(trans, &iter2);
	bch2_trans_iter_exit(trans, &iter1);
	printbuf_exit(&buf);
	return ret;
}
1707 
/*
 * Check extent @k against the recorded end positions of previously seen
 * extents: any recorded end past k's start, in a snapshot where both keys are
 * visible (ref_visible2()), is an overlap to repair.
 */
static int check_overlapping_extents(struct btree_trans *trans,
			      struct snapshots_seen *seen,
			      struct extent_ends *extent_ends,
			      struct bkey_s_c k,
			      struct btree_iter *iter,
			      bool *fixed)
{
	struct bch_fs *c = trans->c;
	int ret = 0;

	/* transaction restart, running again */
	if (bpos_eq(extent_ends->last_pos, k.k->p))
		return 0;

	/* Recorded ends are per inode number: */
	if (extent_ends->last_pos.inode != k.k->p.inode)
		extent_ends_reset(extent_ends);

	darray_for_each(extent_ends->e, i) {
		if (i->offset <= bkey_start_offset(k.k))
			continue;

		if (!ref_visible2(c,
				  k.k->p.snapshot, seen,
				  i->snapshot, &i->seen))
			continue;

		ret = overlapping_extents_found(trans, iter->btree_id,
						SPOS(iter->pos.inode,
						     i->offset,
						     i->snapshot),
						&i->seen,
						*k.k, fixed, i);
		if (ret)
			goto err;
	}

	extent_ends->last_pos = k.k->p;
err:
	return ret;
}
1748 
/*
 * Report (but do not repair - always returns 0) extents with an encoded
 * (checksummed/compressed) size larger than the filesystem's
 * encoded_extent_max option allows.
 */
static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
				struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	struct bch_extent_crc_unpacked crc;
	const union bch_extent_entry *i;
	/* option is in bytes; crc sizes are in 512-byte sectors */
	unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;

	bkey_for_each_crc(k.k, ptrs, crc, i)
		if (crc_is_encoded(crc) &&
		    crc.uncompressed_size > encoded_extent_max_sectors) {
			struct printbuf buf = PRINTBUF;

			bch2_bkey_val_to_text(&buf, c, k);
			bch_err(c, "overbig encoded extent, please report this:\n  %s", buf.buf);
			printbuf_exit(&buf);
		}

	return 0;
}
1770 
/*
 * Check one extent key: valid snapshot, owning inode exists and is the right
 * type, no overlaps with previously seen extents, not past end of i_size; then
 * accumulate its size into the walker's per-snapshot i_sectors counts and
 * record its end position. Commits its own updates (using @res).
 */
static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
			struct bkey_s_c k,
			struct inode_walker *inode,
			struct snapshots_seen *s,
			struct extent_ends *extent_ends,
			struct disk_reservation *res)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret) {
		ret = ret < 0 ? ret : 0;
		goto out;
	}

	/* Moving to a new inode number: settle i_sectors for the previous one: */
	if (inode->last_pos.inode != k.k->p.inode && inode->have_inodes) {
		ret = check_i_sectors(trans, inode);
		if (ret)
			goto err;
	}

	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
	if (ret)
		goto err;

	struct inode_walker_entry *extent_i = walk_inode(trans, inode, k);
	ret = PTR_ERR_OR_ZERO(extent_i);
	if (ret)
		goto err;

	ret = check_key_has_inode(trans, iter, inode, extent_i, k);
	if (ret)
		goto err;

	if (k.k->type != KEY_TYPE_whiteout) {
		ret = check_overlapping_extents(trans, s, extent_ends, k, iter,
						&inode->recalculate_sums);
		if (ret)
			goto err;

		/*
		 * Check inodes in reverse order, from oldest snapshots to
		 * newest, starting from the inode that matches this extent's
		 * snapshot. If we didn't have one, iterate over all inodes:
		 */
		for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
		     inode->inodes.data && i >= inode->inodes.data;
		     --i) {
			if (i->inode.bi_snapshot > k.k->p.snapshot ||
			    !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot))
				continue;

			/* Extent past i_size (rounded up to block size)? Delete it: */
			if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
					!bkey_extent_is_reservation(k),
					trans, extent_past_end_of_inode,
					"extent type past end of inode %llu:%u, i_size %llu\n%s",
					i->inode.bi_inum, i->inode.bi_snapshot, i->inode.bi_size,
					(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
				struct btree_iter iter2;

				bch2_trans_copy_iter(trans, &iter2, iter);
				bch2_btree_iter_set_snapshot(trans, &iter2, i->inode.bi_snapshot);
				ret =   bch2_btree_iter_traverse(trans, &iter2) ?:
					bch2_btree_delete_at(trans, &iter2,
						BTREE_UPDATE_internal_snapshot_node);
				bch2_trans_iter_exit(trans, &iter2);
				if (ret)
					goto err;

				/* k points at iter->k, so the checks below now see a whiteout: */
				iter->k.type = KEY_TYPE_whiteout;
				break;
			}
		}
	}

	ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc);
	if (ret)
		goto err;

	/* Accumulate into i_sectors for every inode version the extent is visible in: */
	if (bkey_extent_is_allocation(k.k)) {
		for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
		     inode->inodes.data && i >= inode->inodes.data;
		     --i) {
			if (i->whiteout ||
			    i->inode.bi_snapshot > k.k->p.snapshot ||
			    !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot))
				continue;

			i->count += k.k->size;
		}
	}

	if (k.k->type != KEY_TYPE_whiteout) {
		ret = extent_ends_at(c, extent_ends, s, k);
		if (ret)
			goto err;
	}
out:
err:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}
1877 
1878 /*
1879  * Walk extents: verify that extents have a corresponding S_ISREG inode, and
 * that i_size and i_sectors are consistent
1881  */
int bch2_check_extents(struct bch_fs *c)
{
	struct inode_walker w = inode_walker_init();
	struct snapshots_seen s;
	struct extent_ends extent_ends;
	struct disk_reservation res = { 0 };

	snapshots_seen_init(&s);
	extent_ends_init(&extent_ends);

	/* check_extent() commits for itself; the reservation is recycled each
	 * iteration, and check_i_sectors_notnested() settles the last inode: */
	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_extents,
				POS(BCACHEFS_ROOT_INO, 0),
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
			bch2_disk_reservation_put(c, &res);
			check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?:
			check_extent_overbig(trans, &iter, k);
		})) ?:
		check_i_sectors_notnested(trans, &w));

	bch2_disk_reservation_put(c, &res);
	extent_ends_exit(&extent_ends);
	inode_walker_exit(&w);
	snapshots_seen_exit(&s);

	bch_err_fn(c, ret);
	return ret;
}
1910 
/*
 * fsck pass: walk the reflink btree checking indirect extents for overbig
 * encoded extents (report only).
 */
int bch2_check_indirect_extents(struct bch_fs *c)
{
	struct disk_reservation res = { 0 };

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
				POS_MIN,
				BTREE_ITER_prefetch, k,
				&res, NULL,
				BCH_TRANS_COMMIT_no_enospc, ({
			bch2_disk_reservation_put(c, &res);
			check_extent_overbig(trans, &iter, k);
		})));

	bch2_disk_reservation_put(c, &res);
	bch_err_fn(c, ret);
	return ret;
}
1929 
/*
 * Compare each walked directory version's i_nlink against the subdirectory
 * count fsck accumulated while walking its dirents; repair mismatches,
 * re-counting from the btree when the accumulated count itself looks wrong.
 */
static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w)
{
	struct bch_fs *c = trans->c;
	int ret = 0;
	s64 count2;

	darray_for_each(w->inodes, i) {
		if (i->inode.bi_nlink == i->count)
			continue;

		count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->inode.bi_snapshot);
		if (count2 < 0)
			return count2;

		/* Our running count disagreeing with a fresh count is an fsck bug: */
		if (i->count != count2) {
			bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
					    w->last_pos.inode, i->inode.bi_snapshot, i->count, count2);
			i->count = count2;
			if (i->inode.bi_nlink == i->count)
				continue;
		}

		if (fsck_err_on(i->inode.bi_nlink != i->count,
				trans, inode_dir_wrong_nlink,
				"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
				w->last_pos.inode, i->inode.bi_snapshot, i->inode.bi_nlink, i->count)) {
			i->inode.bi_nlink = i->count;
			ret = bch2_fsck_write_inode(trans, &i->inode);
			if (ret)
				break;
		}
	}
fsck_err:
	bch_err_fn(c, ret);
	return ret;
}
1966 
1967 static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_walker *w)
1968 {
1969 	u32 restart_count = trans->restart_count;
1970 	return check_subdir_count_notnested(trans, w) ?:
1971 		trans_was_restarted(trans, restart_count);
1972 }
1973 
1974 /* find a subvolume that's a descendent of @snapshot: */
1975 static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
1976 {
1977 	struct btree_iter iter;
1978 	struct bkey_s_c k;
1979 	int ret;
1980 
1981 	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) {
1982 		if (k.k->type != KEY_TYPE_subvolume)
1983 			continue;
1984 
1985 		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
1986 		if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) {
1987 			bch2_trans_iter_exit(trans, &iter);
1988 			*subvolid = k.k->p.offset;
1989 			goto found;
1990 		}
1991 	}
1992 	if (!ret)
1993 		ret = -ENOENT;
1994 found:
1995 	bch2_trans_iter_exit(trans, &iter);
1996 	return ret;
1997 }
1998 
1999 noinline_for_stack
2000 static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
2001 				  struct bkey_s_c_dirent d)
2002 {
2003 	struct bch_fs *c = trans->c;
2004 	struct btree_iter subvol_iter = {};
2005 	struct bch_inode_unpacked subvol_root;
2006 	u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
2007 	u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
2008 	u32 parent_snapshot;
2009 	u32 new_parent_subvol = 0;
2010 	u64 parent_inum;
2011 	struct printbuf buf = PRINTBUF;
2012 	int ret = 0;
2013 
2014 	ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum);
2015 	if (ret && !bch2_err_matches(ret, ENOENT))
2016 		return ret;
2017 
2018 	if (ret ||
2019 	    (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) {
2020 		int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
2021 		if (ret2 && !bch2_err_matches(ret, ENOENT))
2022 			return ret2;
2023 	}
2024 
2025 	if (ret &&
2026 	    !new_parent_subvol &&
2027 	    (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
2028 		/*
2029 		 * Couldn't find a subvol for dirent's snapshot - but we lost
2030 		 * subvols, so we need to reconstruct:
2031 		 */
2032 		ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0);
2033 		if (ret)
2034 			return ret;
2035 
2036 		parent_snapshot = d.k->p.snapshot;
2037 	}
2038 
2039 	if (fsck_err_on(ret,
2040 			trans, dirent_to_missing_parent_subvol,
2041 			"dirent parent_subvol points to missing subvolume\n%s",
2042 			(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
2043 	    fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
2044 			trans, dirent_not_visible_in_parent_subvol,
2045 			"dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
2046 			parent_snapshot,
2047 			(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
2048 		if (!new_parent_subvol) {
2049 			bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
2050 			return bch_err_throw(c, fsck_repair_unimplemented);
2051 		}
2052 
2053 		struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
2054 		ret = PTR_ERR_OR_ZERO(new_dirent);
2055 		if (ret)
2056 			goto err;
2057 
2058 		new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol);
2059 	}
2060 
2061 	struct bkey_s_c_subvolume s =
2062 		bch2_bkey_get_iter_typed(trans, &subvol_iter,
2063 					 BTREE_ID_subvolumes, POS(0, target_subvol),
2064 					 0, subvolume);
2065 	ret = bkey_err(s.s_c);
2066 	if (ret && !bch2_err_matches(ret, ENOENT))
2067 		goto err;
2068 
2069 	if (ret) {
2070 		if (fsck_err(trans, dirent_to_missing_subvol,
2071 			     "dirent points to missing subvolume\n%s",
2072 			     (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
2073 			return bch2_fsck_remove_dirent(trans, d.k->p);
2074 		ret = 0;
2075 		goto out;
2076 	}
2077 
2078 	if (le32_to_cpu(s.v->fs_path_parent) != parent_subvol) {
2079 		printbuf_reset(&buf);
2080 
2081 		prt_printf(&buf, "subvol with wrong fs_path_parent, should be be %u\n",
2082 			   parent_subvol);
2083 
2084 		ret = bch2_inum_to_path(trans, (subvol_inum) { s.k->p.offset,
2085 					le64_to_cpu(s.v->inode) }, &buf);
2086 		if (ret)
2087 			goto err;
2088 		prt_newline(&buf);
2089 		bch2_bkey_val_to_text(&buf, c, s.s_c);
2090 
2091 		if (fsck_err(trans, subvol_fs_path_parent_wrong, "%s", buf.buf)) {
2092 			struct bkey_i_subvolume *n =
2093 				bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
2094 			ret = PTR_ERR_OR_ZERO(n);
2095 			if (ret)
2096 				goto err;
2097 
2098 			n->v.fs_path_parent = cpu_to_le32(parent_subvol);
2099 		}
2100 	}
2101 
2102 	u64 target_inum = le64_to_cpu(s.v->inode);
2103 	u32 target_snapshot = le32_to_cpu(s.v->snapshot);
2104 
2105 	ret = bch2_inode_find_by_inum_snapshot(trans, target_inum, target_snapshot,
2106 					       &subvol_root, 0);
2107 	if (ret && !bch2_err_matches(ret, ENOENT))
2108 		goto err;
2109 
2110 	if (ret) {
2111 		bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
2112 		ret = bch_err_throw(c, fsck_repair_unimplemented);
2113 		goto err;
2114 	}
2115 
2116 	if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol,
2117 			trans, inode_bi_parent_wrong,
2118 			"subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
2119 			target_inum,
2120 			subvol_root.bi_parent_subvol, parent_subvol)) {
2121 		subvol_root.bi_parent_subvol = parent_subvol;
2122 		subvol_root.bi_snapshot = le32_to_cpu(s.v->snapshot);
2123 		ret = __bch2_fsck_write_inode(trans, &subvol_root);
2124 		if (ret)
2125 			goto err;
2126 	}
2127 
2128 	ret = bch2_check_dirent_target(trans, iter, d, &subvol_root, true);
2129 	if (ret)
2130 		goto err;
2131 out:
2132 err:
2133 fsck_err:
2134 	bch2_trans_iter_exit(trans, &subvol_iter);
2135 	printbuf_exit(&buf);
2136 	return ret;
2137 }
2138 
/*
 * Check a single dirent key: snapshot validity, that the containing directory
 * inode exists, hash/casefold correctness, and that the dirent points at a
 * valid inode or subvolume; also accumulates subdir/dirent counts for the
 * parent directory.
 */
static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
			struct bkey_s_c k,
			struct bch_hash_info *hash_info,
			struct inode_walker *dir,
			struct inode_walker *target,
			struct snapshots_seen *s,
			bool *need_second_pass)
{
	struct bch_fs *c = trans->c;
	struct inode_walker_entry *i;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	/* > 0 means the key was deleted for having a bad snapshot; not an error */
	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret) {
		ret = ret < 0 ? ret : 0;
		goto out;
	}

	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
	if (ret)
		goto err;

	if (k.k->type == KEY_TYPE_whiteout)
		goto out;

	/* Crossed into a new directory: verify the previous dir's counts first */
	if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) {
		ret = check_subdir_dirents_count(trans, dir);
		if (ret)
			goto err;
	}

	i = walk_inode(trans, dir, k);
	ret = PTR_ERR_OR_ZERO(i);
	if (ret < 0)
		goto err;

	ret = check_key_has_inode(trans, iter, dir, i, k);
	if (ret)
		goto err;

	if (!i || i->whiteout)
		goto out;

	/* hash_info is cached per directory; refresh on the first key of each */
	if (dir->first_this_inode)
		*hash_info = bch2_hash_info_init(c, &i->inode);
	dir->first_this_inode = false;

#ifdef CONFIG_UNICODE
	hash_info->cf_encoding = bch2_inode_casefold(c, &i->inode) ? c->cf_encoding : NULL;
#endif

	ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info,
				      iter, k, need_second_pass);
	if (ret < 0)
		goto err;
	if (ret) {
		/* dirent has been deleted */
		ret = 0;
		goto out;
	}

	if (k.k->type != KEY_TYPE_dirent)
		goto out;

	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);

	/* check casefold */
	if (fsck_err_on(d.v->d_casefold != !!hash_info->cf_encoding,
			trans, dirent_casefold_mismatch,
			"dirent casefold does not match dir casefold\n%s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k),
			 buf.buf))) {
		/* Rebuild the dirent using the directory's casefold setting: */
		subvol_inum dir_inum = { .subvol = d.v->d_type == DT_SUBVOL
				? le32_to_cpu(d.v->d_parent_subvol)
				: 0,
		};
		u64 target = d.v->d_type == DT_SUBVOL
			? le32_to_cpu(d.v->d_child_subvol)
			: le64_to_cpu(d.v->d_inum);
		struct qstr name = bch2_dirent_get_name(d);

		struct bkey_i_dirent *new_d =
			bch2_dirent_create_key(trans, hash_info, dir_inum,
					       d.v->d_type, &name, NULL, target);
		ret = PTR_ERR_OR_ZERO(new_d);
		if (ret)
			goto out;

		new_d->k.p.inode	= d.k->p.inode;
		new_d->k.p.snapshot	= d.k->p.snapshot;

		/* Delete the old dirent, then reinsert via the hash-repair path: */
		struct btree_iter dup_iter = {};
		ret =	bch2_hash_delete_at(trans,
					    bch2_dirent_hash_desc, hash_info, iter,
					    BTREE_UPDATE_internal_snapshot_node) ?:
			bch2_str_hash_repair_key(trans, s,
						 &bch2_dirent_hash_desc, hash_info,
						 iter, bkey_i_to_s_c(&new_d->k_i),
						 &dup_iter, bkey_s_c_null,
						 need_second_pass);
		goto out;
	}

	if (d.v->d_type == DT_SUBVOL) {
		ret = check_dirent_to_subvol(trans, iter, d);
		if (ret)
			goto err;
	} else {
		ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
		if (ret)
			goto err;

		if (fsck_err_on(!target->inodes.nr,
				trans, dirent_to_missing_inode,
				"dirent points to missing inode:\n%s",
				(printbuf_reset(&buf),
				 bch2_bkey_val_to_text(&buf, c, k),
				 buf.buf))) {
			ret = bch2_fsck_remove_dirent(trans, d.k->p);
			if (ret)
				goto err;
		}

		/* Check backpointers in every visible version of the target inode: */
		darray_for_each(target->inodes, i) {
			ret = bch2_check_dirent_target(trans, iter, d, &i->inode, true);
			if (ret)
				goto err;
		}

		/* Remove this dirent in snapshots where its target was overwritten: */
		darray_for_each(target->deletes, i)
			if (fsck_err_on(!snapshot_list_has_id(&s->ids, *i),
					trans, dirent_to_overwritten_inode,
					"dirent points to inode overwritten in snapshot %u:\n%s",
					*i,
					(printbuf_reset(&buf),
					 bch2_bkey_val_to_text(&buf, c, k),
					 buf.buf))) {
				struct btree_iter delete_iter;
				bch2_trans_iter_init(trans, &delete_iter,
						     BTREE_ID_dirents,
						     SPOS(k.k->p.inode, k.k->p.offset, *i),
						     BTREE_ITER_intent);
				ret =   bch2_btree_iter_traverse(trans, &delete_iter) ?:
					bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
							  hash_info,
							  &delete_iter,
							  BTREE_UPDATE_internal_snapshot_node);
				bch2_trans_iter_exit(trans, &delete_iter);
				if (ret)
					goto err;

			}
	}

	ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
	if (ret)
		goto err;

	/* Accumulate subdir count and i_size for the containing directory: */
	for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) {
		if (d.v->d_type == DT_DIR)
			i->count++;
		i->i_size += bkey_bytes(d.k);
	}
out:
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}
2310 
2311 /*
2312  * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
2313  * validate d_type
2314  */
2315 int bch2_check_dirents(struct bch_fs *c)
2316 {
2317 	struct inode_walker dir = inode_walker_init();
2318 	struct inode_walker target = inode_walker_init();
2319 	struct snapshots_seen s;
2320 	struct bch_hash_info hash_info;
2321 	bool need_second_pass = false, did_second_pass = false;
2322 	int ret;
2323 
2324 	snapshots_seen_init(&s);
2325 again:
2326 	ret = bch2_trans_run(c,
2327 		for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
2328 				POS(BCACHEFS_ROOT_INO, 0),
2329 				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
2330 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2331 			check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s,
2332 				     &need_second_pass)) ?:
2333 		check_subdir_count_notnested(trans, &dir));
2334 
2335 	if (!ret && need_second_pass && !did_second_pass) {
2336 		bch_info(c, "check_dirents requires second pass");
2337 		swap(did_second_pass, need_second_pass);
2338 		goto again;
2339 	}
2340 
2341 	if (!ret && need_second_pass) {
2342 		bch_err(c, "dirents not repairing");
2343 		ret = -EINVAL;
2344 	}
2345 
2346 	snapshots_seen_exit(&s);
2347 	inode_walker_exit(&dir);
2348 	inode_walker_exit(&target);
2349 	bch_err_fn(c, ret);
2350 	return ret;
2351 }
2352 
2353 static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
2354 		       struct bkey_s_c k,
2355 		       struct bch_hash_info *hash_info,
2356 		       struct inode_walker *inode)
2357 {
2358 	struct bch_fs *c = trans->c;
2359 
2360 	int ret = bch2_check_key_has_snapshot(trans, iter, k);
2361 	if (ret < 0)
2362 		return ret;
2363 	if (ret)
2364 		return 0;
2365 
2366 	struct inode_walker_entry *i = walk_inode(trans, inode, k);
2367 	ret = PTR_ERR_OR_ZERO(i);
2368 	if (ret)
2369 		return ret;
2370 
2371 	ret = check_key_has_inode(trans, iter, inode, i, k);
2372 	if (ret)
2373 		return ret;
2374 
2375 	if (!i || i->whiteout)
2376 		return 0;
2377 
2378 	if (inode->first_this_inode)
2379 		*hash_info = bch2_hash_info_init(c, &i->inode);
2380 	inode->first_this_inode = false;
2381 
2382 	bool need_second_pass = false;
2383 	return bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info,
2384 				      iter, k, &need_second_pass);
2385 }
2386 
2387 /*
2388  * Walk xattrs: verify that they all have a corresponding inode
2389  */
2390 int bch2_check_xattrs(struct bch_fs *c)
2391 {
2392 	struct inode_walker inode = inode_walker_init();
2393 	struct bch_hash_info hash_info;
2394 	int ret = 0;
2395 
2396 	ret = bch2_trans_run(c,
2397 		for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
2398 			POS(BCACHEFS_ROOT_INO, 0),
2399 			BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
2400 			k,
2401 			NULL, NULL,
2402 			BCH_TRANS_COMMIT_no_enospc,
2403 		check_xattr(trans, &iter, k, &hash_info, &inode)));
2404 
2405 	inode_walker_exit(&inode);
2406 	bch_err_fn(c, ret);
2407 	return ret;
2408 }
2409 
/*
 * Verify that the root subvolume and the root directory inode exist,
 * recreating them if missing.
 */
static int check_root_trans(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct bch_inode_unpacked root_inode;
	u32 snapshot;
	u64 inum;
	int ret;

	ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (mustfix_fsck_err_on(ret, trans, root_subvol_missing,
				"root subvol missing")) {
		/* Recreate the root subvolume, pointing at the root inode: */
		struct bkey_i_subvolume *root_subvol =
			bch2_trans_kmalloc(trans, sizeof(*root_subvol));
		ret = PTR_ERR_OR_ZERO(root_subvol);
		if (ret)
			goto err;

		snapshot	= U32_MAX;
		inum		= BCACHEFS_ROOT_INO;

		bkey_subvolume_init(&root_subvol->k_i);
		root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL;
		root_subvol->v.flags	= 0;
		root_subvol->v.snapshot	= cpu_to_le32(snapshot);
		root_subvol->v.inode	= cpu_to_le64(inum);
		ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0);
		bch_err_msg(c, ret, "writing root subvol");
		if (ret)
			goto err;
	}

	ret = bch2_inode_find_by_inum_snapshot(trans, BCACHEFS_ROOT_INO, snapshot,
					       &root_inode, 0);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (mustfix_fsck_err_on(ret,
				trans, root_dir_missing,
				"root directory missing") ||
	    mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
				trans, root_inode_not_dir,
				"root inode not a directory")) {
		/* (Re)initialize the root directory inode from scratch: */
		bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
				0, NULL);
		root_inode.bi_inum = inum;
		root_inode.bi_snapshot = snapshot;

		ret = __bch2_fsck_write_inode(trans, &root_inode);
		bch_err_msg(c, ret, "writing root inode");
	}
err:
fsck_err:
	return ret;
}
2467 
2468 /* Get root directory, create if it doesn't exist: */
2469 int bch2_check_root(struct bch_fs *c)
2470 {
2471 	int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2472 		check_root_trans(trans));
2473 	bch_err_fn(c, ret);
2474 	return ret;
2475 }
2476 
2477 static bool darray_u32_has(darray_u32 *d, u32 v)
2478 {
2479 	darray_for_each(*d, i)
2480 		if (*i == v)
2481 			return true;
2482 	return false;
2483 }
2484 
/*
 * Walk a subvolume's fs_path_parent chain towards the root subvolume,
 * repairing loops and unreachable subvolumes by reattaching them.
 */
static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct btree_iter parent_iter = {};
	darray_u32 subvol_path = {};
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (k.k->type != KEY_TYPE_subvolume)
		return 0;

	while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) {
		/* Record each subvol visited so we can detect revisits (loops) */
		ret = darray_push(&subvol_path, k.k->p.offset);
		if (ret)
			goto err;

		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);

		/* Stop if we can't find this subvol's root inode */
		struct bch_inode_unpacked subvol_root;
		ret = bch2_inode_find_by_inum_trans(trans,
					(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
					&subvol_root);
		if (ret)
			break;

		u32 parent = le32_to_cpu(s.v->fs_path_parent);

		/* Parent already on the path we walked: loop detected */
		if (darray_u32_has(&subvol_path, parent)) {
			printbuf_reset(&buf);
			prt_printf(&buf, "subvolume loop:\n");

			darray_for_each_reverse(subvol_path, i)
				prt_printf(&buf, "%u ", *i);
			prt_printf(&buf, "%u", parent);

			if (fsck_err(trans, subvol_loop, "%s", buf.buf))
				ret = reattach_subvol(trans, s);
			break;
		}

		/* Step up to the parent subvolume: */
		bch2_trans_iter_exit(trans, &parent_iter);
		bch2_trans_iter_init(trans, &parent_iter,
				     BTREE_ID_subvolumes, POS(0, parent), 0);
		k = bch2_btree_iter_peek_slot(trans, &parent_iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		/* Parent subvol doesn't exist: this subvol is unreachable */
		if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
				trans, subvol_unreachable,
				"unreachable subvolume %s",
				(printbuf_reset(&buf),
				 bch2_bkey_val_to_text(&buf, c, s.s_c),
				 buf.buf))) {
			ret = reattach_subvol(trans, s);
			break;
		}
	}
fsck_err:
err:
	printbuf_exit(&buf);
	darray_exit(&subvol_path);
	bch2_trans_iter_exit(trans, &parent_iter);
	return ret;
}
2550 
2551 int bch2_check_subvolume_structure(struct bch_fs *c)
2552 {
2553 	int ret = bch2_trans_run(c,
2554 		for_each_btree_key_commit(trans, iter,
2555 				BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
2556 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2557 			check_subvol_path(trans, &iter, k)));
2558 	bch_err_fn(c, ret);
2559 	return ret;
2560 }
2561 
/* One inode on a path being walked towards the root: */
struct pathbuf_entry {
	u64	inum;
	u32	snapshot;
};

/* The path itself: a dynamic array of (inum, snapshot) pairs */
typedef DARRAY(struct pathbuf_entry) pathbuf;
2568 
2569 static int bch2_bi_depth_renumber_one(struct btree_trans *trans, struct pathbuf_entry *p,
2570 				      u32 new_depth)
2571 {
2572 	struct btree_iter iter;
2573 	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
2574 					       SPOS(0, p->inum, p->snapshot), 0);
2575 
2576 	struct bch_inode_unpacked inode;
2577 	int ret = bkey_err(k) ?:
2578 		!bkey_is_inode(k.k) ? -BCH_ERR_ENOENT_inode
2579 		: bch2_inode_unpack(k, &inode);
2580 	if (ret)
2581 		goto err;
2582 
2583 	if (inode.bi_depth != new_depth) {
2584 		inode.bi_depth = new_depth;
2585 		ret = __bch2_fsck_write_inode(trans, &inode) ?:
2586 			bch2_trans_commit(trans, NULL, NULL, 0);
2587 	}
2588 err:
2589 	bch2_trans_iter_exit(trans, &iter);
2590 	return ret;
2591 }
2592 
2593 static int bch2_bi_depth_renumber(struct btree_trans *trans, pathbuf *path, u32 new_bi_depth)
2594 {
2595 	u32 restart_count = trans->restart_count;
2596 	int ret = 0;
2597 
2598 	darray_for_each_reverse(*path, i) {
2599 		ret = nested_lockrestart_do(trans,
2600 				bch2_bi_depth_renumber_one(trans, i, new_bi_depth));
2601 		bch_err_fn(trans->c, ret);
2602 		if (ret)
2603 			break;
2604 
2605 		new_bi_depth++;
2606 	}
2607 
2608 	return ret ?: trans_was_restarted(trans, restart_count);
2609 }
2610 
2611 static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
2612 {
2613 	darray_for_each(*p, i)
2614 		if (i->inum	== inum &&
2615 		    i->snapshot	== snapshot)
2616 			return true;
2617 	return false;
2618 }
2619 
/*
 * Walk up from an inode via bi_dir backpointers until we hit a subvolume
 * root, detecting directory structure loops; renumber bi_depth on the way if
 * depths were found to be inconsistent.
 */
static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k)
{
	struct bch_fs *c = trans->c;
	struct btree_iter inode_iter = {};
	pathbuf path = {};
	struct printbuf buf = PRINTBUF;
	u32 snapshot = inode_k.k->p.snapshot;
	bool redo_bi_depth = false;
	u32 min_bi_depth = U32_MAX;
	int ret = 0;

	struct bch_inode_unpacked inode;
	ret = bch2_inode_unpack(inode_k, &inode);
	if (ret)
		return ret;

	/* Subvolume root inodes terminate the walk: */
	while (!inode.bi_subvol) {
		struct btree_iter dirent_iter;
		struct bkey_s_c_dirent d;
		u32 parent_snapshot = snapshot;

		/* Look up the dirent pointing at this inode: */
		d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot);
		ret = bkey_err(d.s_c);
		if (ret && !bch2_err_matches(ret, ENOENT))
			goto out;

		if (!ret && (ret = dirent_points_to_inode(c, d, &inode)))
			bch2_trans_iter_exit(trans, &dirent_iter);

		/* Prior passes should have made everything reachable: */
		if (bch2_err_matches(ret, ENOENT)) {
			printbuf_reset(&buf);
			bch2_bkey_val_to_text(&buf, c, inode_k);
			bch_err(c, "unreachable inode in check_directory_structure: %s\n%s",
				bch2_err_str(ret), buf.buf);
			goto out;
		}

		bch2_trans_iter_exit(trans, &dirent_iter);

		/* Record this inode for loop detection and bi_depth renumbering: */
		ret = darray_push(&path, ((struct pathbuf_entry) {
			.inum		= inode.bi_inum,
			.snapshot	= snapshot,
		}));
		if (ret)
			return ret;

		snapshot = parent_snapshot;

		/* Step up to the parent directory inode: */
		bch2_trans_iter_exit(trans, &inode_iter);
		inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
					     SPOS(0, inode.bi_dir, snapshot), 0);

		struct bch_inode_unpacked parent_inode;
		ret = bkey_err(inode_k) ?:
			!bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
			: bch2_inode_unpack(inode_k, &parent_inode);
		if (ret) {
			/* Should have been caught in dirents pass */
			bch_err_msg(c, ret, "error looking up parent directory");
			goto out;
		}

		min_bi_depth = parent_inode.bi_depth;

		/* Depths decreasing towards the root look consistent: stop early */
		if (parent_inode.bi_depth < inode.bi_depth &&
		    min_bi_depth < U16_MAX)
			break;

		inode = parent_inode;
		snapshot = inode_k.k->p.snapshot;
		redo_bi_depth = true;

		if (path_is_dup(&path, inode.bi_inum, snapshot)) {
			printbuf_reset(&buf);
			prt_printf(&buf, "directory structure loop:\n");
			darray_for_each_reverse(path, i)
				prt_printf(&buf, "%llu:%u ", i->inum, i->snapshot);
			prt_printf(&buf, "%llu:%u", inode.bi_inum, snapshot);

			/* Break the loop: detach this inode and reattach to lost+found */
			if (fsck_err(trans, dir_loop, "%s", buf.buf)) {
				ret = remove_backpointer(trans, &inode);
				bch_err_msg(c, ret, "removing dirent");
				if (ret)
					break;

				ret = reattach_inode(trans, &inode);
				bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
			}

			goto out;
		}
	}

	/* Walk ended at a subvolume root: depth restarts from zero */
	if (inode.bi_subvol)
		min_bi_depth = 0;

	if (redo_bi_depth)
		ret = bch2_bi_depth_renumber(trans, &path, min_bi_depth);
out:
fsck_err:
	bch2_trans_iter_exit(trans, &inode_iter);
	darray_exit(&path);
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}
2726 
2727 /*
2728  * Check for loops in the directory structure: all other connectivity issues
2729  * have been fixed by prior passes
2730  */
int bch2_check_directory_structure(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
					  BTREE_ITER_intent|
					  BTREE_ITER_prefetch|
					  BTREE_ITER_all_snapshots, k,
					  NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
			/* Only directories can be part of a path loop */
			if (!S_ISDIR(bkey_inode_mode(k)))
				continue;

			/* Unlinked inodes aren't part of the directory structure */
			if (bch2_inode_flags(k) & BCH_INODE_unlinked)
				continue;

			check_path_loop(trans, k);
		})));

	bch_err_fn(c, ret);
	return ret;
}
2751 
/*
 * Table of inodes that may have hardlinks, built in inum order (so it can be
 * binary searched by inc_link()); counts are accumulated from dirents in a
 * separate pass.
 */
struct nlink_table {
	size_t		nr;	/* entries in use */
	size_t		size;	/* allocated capacity */

	struct nlink {
		u64	inum;
		u32	snapshot;
		u32	count;	/* number of dirent references seen */
	}		*d;
};
2762 
2763 static int add_nlink(struct bch_fs *c, struct nlink_table *t,
2764 		     u64 inum, u32 snapshot)
2765 {
2766 	if (t->nr == t->size) {
2767 		size_t new_size = max_t(size_t, 128UL, t->size * 2);
2768 		void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);
2769 
2770 		if (!d) {
2771 			bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
2772 				new_size);
2773 			return bch_err_throw(c, ENOMEM_fsck_add_nlink);
2774 		}
2775 
2776 		if (t->d)
2777 			memcpy(d, t->d, t->size * sizeof(t->d[0]));
2778 		kvfree(t->d);
2779 
2780 		t->d = d;
2781 		t->size = new_size;
2782 	}
2783 
2784 
2785 	t->d[t->nr++] = (struct nlink) {
2786 		.inum		= inum,
2787 		.snapshot	= snapshot,
2788 	};
2789 
2790 	return 0;
2791 }
2792 
2793 static int nlink_cmp(const void *_l, const void *_r)
2794 {
2795 	const struct nlink *l = _l;
2796 	const struct nlink *r = _r;
2797 
2798 	return cmp_int(l->inum, r->inum);
2799 }
2800 
/*
 * Count a dirent reference to @inum: find its entry (or entries, one per
 * snapshot) in @links and bump the count for the version(s) the reference is
 * visible in.
 */
static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
		     struct nlink_table *links,
		     u64 range_start, u64 range_end, u64 inum, u32 snapshot)
{
	struct nlink *link, key = {
		.inum = inum, .snapshot = U32_MAX,
	};

	/* Only inodes in the current chunk are in the table: */
	if (inum < range_start || inum >= range_end)
		return;

	link = __inline_bsearch(&key, links->d, links->nr,
				sizeof(links->d[0]), nlink_cmp);
	if (!link)
		return;

	/* bsearch may land on any entry with this inum; rewind to the first */
	while (link > links->d && link[0].inum == link[-1].inum)
		--link;

	for (; link < links->d + links->nr && link->inum == inum; link++)
		if (ref_visible(c, s, snapshot, link->snapshot)) {
			link->count++;
			if (link->snapshot >= snapshot)
				break;
		}
}
2827 
/*
 * First nlink pass: collect all inodes that could be hardlinked
 * (non-directories with nonzero bi_nlink), starting at inum @start.  If the
 * table can't be grown, *end is set to where collection stopped so the caller
 * can continue with another chunk.
 */
noinline_for_stack
static int check_nlinks_find_hardlinks(struct bch_fs *c,
				       struct nlink_table *t,
				       u64 start, u64 *end)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_inodes,
				   POS(0, start),
				   BTREE_ITER_intent|
				   BTREE_ITER_prefetch|
				   BTREE_ITER_all_snapshots, k, ({
			if (!bkey_is_inode(k.k))
				continue;

			/* Should never fail, checked by bch2_inode_invalid: */
			struct bch_inode_unpacked u;
			_ret3 = bch2_inode_unpack(k, &u);
			if (_ret3)
				break;

			/*
			 * Backpointer and directory structure checks are sufficient for
			 * directories, since they can't have hardlinks:
			 */
			if (S_ISDIR(u.bi_mode))
				continue;

			/*
			 * Previous passes ensured that bi_nlink is nonzero if
			 * it had multiple hardlinks:
			 */
			if (!u.bi_nlink)
				continue;

			ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
			if (ret) {
				/* Table full: remember where to resume, not an error */
				*end = k.k->p.offset;
				ret = 0;
				break;
			}
			0;
		})));

	bch_err_fn(c, ret);
	return ret;
}
2874 
/*
 * Second nlink pass: walk all dirents, counting references to the inodes
 * collected by check_nlinks_find_hardlinks() in [range_start, range_end).
 */
noinline_for_stack
static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
				     u64 range_start, u64 range_end)
{
	struct snapshots_seen s;

	snapshots_seen_init(&s);

	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
				   BTREE_ITER_intent|
				   BTREE_ITER_prefetch|
				   BTREE_ITER_all_snapshots, k, ({
			ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
			if (ret)
				break;

			if (k.k->type == KEY_TYPE_dirent) {
				struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);

				/* dirs/subvols can't be hardlinked; only count the rest */
				if (d.v->d_type != DT_DIR &&
				    d.v->d_type != DT_SUBVOL)
					inc_link(c, &s, links, range_start, range_end,
						 le64_to_cpu(d.v->d_inum), d.k->p.snapshot);
			}
			0;
		})));

	snapshots_seen_exit(&s);

	bch_err_fn(c, ret);
	return ret;
}
2908 
/*
 * Compare one inode's link count against the dirent count accumulated in
 * @links, rewriting the inode if they disagree.  Returns 1 to stop iteration
 * once past @range_end.
 */
static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
				     struct bkey_s_c k,
				     struct nlink_table *links,
				     size_t *idx, u64 range_end)
{
	struct bch_inode_unpacked u;
	struct nlink *link = &links->d[*idx];
	int ret = 0;

	if (k.k->p.offset >= range_end)
		return 1;

	if (!bkey_is_inode(k.k))
		return 0;

	ret = bch2_inode_unpack(k, &u);
	if (ret)
		return ret;

	/* Directories and zero-nlink inodes weren't put in the table: */
	if (S_ISDIR(u.bi_mode))
		return 0;

	if (!u.bi_nlink)
		return 0;

	/* Both table and iteration are in (inum, snapshot) order: advance to match */
	while ((cmp_int(link->inum, k.k->p.offset) ?:
		cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
		BUG_ON(*idx == links->nr);
		link = &links->d[++*idx];
	}

	if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
			trans, inode_wrong_nlink,
			"inode %llu type %s has wrong i_nlink (%u, should be %u)",
			u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
			bch2_inode_nlink_get(&u), link->count)) {
		bch2_inode_nlink_set(&u, link->count);
		ret = __bch2_fsck_write_inode(trans, &u);
	}
fsck_err:
	return ret;
}
2951 
2952 noinline_for_stack
2953 static int check_nlinks_update_hardlinks(struct bch_fs *c,
2954 			       struct nlink_table *links,
2955 			       u64 range_start, u64 range_end)
2956 {
2957 	size_t idx = 0;
2958 
2959 	int ret = bch2_trans_run(c,
2960 		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
2961 				POS(0, range_start),
2962 				BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
2963 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2964 			check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
2965 	if (ret < 0) {
2966 		bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
2967 		return ret;
2968 	}
2969 
2970 	return 0;
2971 }
2972 
2973 int bch2_check_nlinks(struct bch_fs *c)
2974 {
2975 	struct nlink_table links = { 0 };
2976 	u64 this_iter_range_start, next_iter_range_start = 0;
2977 	int ret = 0;
2978 
2979 	do {
2980 		this_iter_range_start = next_iter_range_start;
2981 		next_iter_range_start = U64_MAX;
2982 
2983 		ret = check_nlinks_find_hardlinks(c, &links,
2984 						  this_iter_range_start,
2985 						  &next_iter_range_start);
2986 
2987 		ret = check_nlinks_walk_dirents(c, &links,
2988 					  this_iter_range_start,
2989 					  next_iter_range_start);
2990 		if (ret)
2991 			break;
2992 
2993 		ret = check_nlinks_update_hardlinks(c, &links,
2994 					 this_iter_range_start,
2995 					 next_iter_range_start);
2996 		if (ret)
2997 			break;
2998 
2999 		links.nr = 0;
3000 	} while (next_iter_range_start != U64_MAX);
3001 
3002 	kvfree(links.d);
3003 	bch_err_fn(c, ret);
3004 	return ret;
3005 }
3006 
3007 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
3008 			     struct bkey_s_c k)
3009 {
3010 	struct bkey_s_c_reflink_p p;
3011 	struct bkey_i_reflink_p *u;
3012 
3013 	if (k.k->type != KEY_TYPE_reflink_p)
3014 		return 0;
3015 
3016 	p = bkey_s_c_to_reflink_p(k);
3017 
3018 	if (!p.v->front_pad && !p.v->back_pad)
3019 		return 0;
3020 
3021 	u = bch2_trans_kmalloc(trans, sizeof(*u));
3022 	int ret = PTR_ERR_OR_ZERO(u);
3023 	if (ret)
3024 		return ret;
3025 
3026 	bkey_reassemble(&u->k_i, k);
3027 	u->v.front_pad	= 0;
3028 	u->v.back_pad	= 0;
3029 
3030 	return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun);
3031 }
3032 
3033 int bch2_fix_reflink_p(struct bch_fs *c)
3034 {
3035 	if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
3036 		return 0;
3037 
3038 	int ret = bch2_trans_run(c,
3039 		for_each_btree_key_commit(trans, iter,
3040 				BTREE_ID_extents, POS_MIN,
3041 				BTREE_ITER_intent|BTREE_ITER_prefetch|
3042 				BTREE_ITER_all_snapshots, k,
3043 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
3044 			fix_reflink_p_key(trans, &iter, k)));
3045 	bch_err_fn(c, ret);
3046 	return ret;
3047 }
3048 
3049 #ifndef NO_BCACHEFS_CHARDEV
3050 
/* Per-invocation state for an fsck run driven from a thread_with_stdio: */
struct fsck_thread {
	struct thread_with_stdio thr;
	struct bch_fs		*c;	/* filesystem being checked */
	struct bch_opts		opts;	/* mount/fsck options for this run */
};
3056 
3057 static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
3058 {
3059 	struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr);
3060 	kfree(thr);
3061 }
3062 
/*
 * Offline fsck thread body: start the (nostart-opened) filesystem, which runs
 * recovery/fsck, then report results via stdio.  Return value is a bitmask:
 * 1 if errors were fixed, 4 if errors remain.
 */
static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio)
{
	struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
	struct bch_fs *c = thr->c;

	/* thr->c may carry an error from bch2_fs_open(): */
	int ret = PTR_ERR_OR_ZERO(c);
	if (ret)
		return ret;

	ret = bch2_fs_start(thr->c);
	if (ret)
		goto err;

	if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
		bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name);
		ret |= 1;
	}
	if (test_bit(BCH_FS_error, &c->flags)) {
		bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name);
		ret |= 4;
	}
err:
	bch2_fs_stop(c);
	return ret;
}
3088 
/* Hooks for running offline fsck in a kernel thread with redirected stdio: */
static const struct thread_with_stdio_ops bch2_offline_fsck_ops = {
	.exit		= bch2_fsck_thread_exit,
	.fn		= bch2_fsck_offline_thread_fn,
};
3093 
/*
 * BCH_IOCTL_FSCK_OFFLINE: copy in a list of device paths and an option string
 * from userspace, open the filesystem read-only, and run fsck in a kernel
 * thread with stdio redirected back to the calling process.
 */
long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
{
	struct bch_ioctl_fsck_offline arg;
	struct fsck_thread *thr = NULL;
	darray_const_str devs = {};
	long ret = 0;

	if (copy_from_user(&arg, user_arg, sizeof(arg)))
		return -EFAULT;

	if (arg.flags)
		return -EINVAL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* Copy in each device path string: */
	for (size_t i = 0; i < arg.nr_devs; i++) {
		u64 dev_u64;
		ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64));
		if (ret)
			goto err;

		char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX);
		ret = PTR_ERR_OR_ZERO(dev_str);
		if (ret)
			goto err;

		ret = darray_push(&devs, dev_str);
		if (ret) {
			kfree(dev_str);
			goto err;
		}
	}

	thr = kzalloc(sizeof(*thr), GFP_KERNEL);
	if (!thr) {
		ret = -ENOMEM;
		goto err;
	}

	thr->opts = bch2_opts_empty();

	/* Optional mount-style option string: */
	if (arg.opts) {
		char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
		ret =   PTR_ERR_OR_ZERO(optstr) ?:
			bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr, false);
		if (!IS_ERR(optstr))
			kfree(optstr);

		if (ret)
			goto err;
	}

	opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
	opt_set(thr->opts, read_only, 1);
	opt_set(thr->opts, ratelimit_errors, 0);

	/* We need request_key() to be called before we punt to kthread: */
	opt_set(thr->opts, nostart, true);

	bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops);

	thr->c = bch2_fs_open(&devs, &thr->opts);

	/* Don't let an fsck invocation panic the machine: */
	if (!IS_ERR(thr->c) &&
	    thr->c->opts.errors == BCH_ON_ERROR_panic)
		thr->c->opts.errors = BCH_ON_ERROR_ro;

	/* Thread takes ownership of thr; open errors surface via the thread fn */
	ret = __bch2_run_thread_with_stdio(&thr->thr);
out:
	darray_for_each(devs, i)
		kfree(*i);
	darray_exit(&devs);
	return ret;
err:
	if (thr)
		bch2_fsck_thread_exit(&thr->thr);
	pr_err("ret %s", bch2_err_str(ret));
	goto out;
}
3174 
/*
 * Thread body for online fsck: temporarily redirect the filesystem's stdio
 * to this thread, force fsck mode on, and run all online recovery passes.
 *
 * Releases c->recovery.run_lock and the ro ref acquired by
 * bch2_ioctl_fsck_online() before returning.
 */
static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio)
{
	struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
	struct bch_fs *c = thr->c;

	/* only this thread's output goes to the fsck fd */
	c->stdio_filter = current;
	c->stdio = &thr->thr.stdio;

	/*
	 * XXX: can we figure out a way to do this without mucking with c->opts?
	 */
	unsigned old_fix_errors = c->opts.fix_errors;
	if (opt_defined(thr->opts, fix_errors))
		c->opts.fix_errors = thr->opts.fix_errors;
	else
		/* no explicit option given: prompt the user over stdio */
		c->opts.fix_errors = FSCK_FIX_ask;

	c->opts.fsck = true;
	set_bit(BCH_FS_in_fsck, &c->flags);

	/* ~0ULL: run every online recovery pass */
	int ret = bch2_run_online_recovery_passes(c, ~0ULL);

	clear_bit(BCH_FS_in_fsck, &c->flags);
	bch_err_fn(c, ret);

	/* restore normal output routing and options */
	c->stdio = NULL;
	c->stdio_filter = NULL;
	c->opts.fix_errors = old_fix_errors;

	/* drop the lock/ref taken in bch2_ioctl_fsck_online() */
	up(&c->recovery.run_lock);
	bch2_ro_ref_put(c);
	return ret;
}
3208 
/* thread_with_stdio callbacks for the online fsck ioctl */
static const struct thread_with_stdio_ops bch2_online_fsck_ops = {
	.exit		= bch2_fsck_thread_exit,
	.fn		= bch2_fsck_online_thread_fn,
};
3213 
/*
 * BCH_IOCTL_FSCK_ONLINE: run fsck on a mounted filesystem in a kernel
 * thread, with output/prompting over the returned file descriptor.
 *
 * Takes a ro ref and c->recovery.run_lock; on success both are released by
 * bch2_fsck_online_thread_fn() when the run finishes, on failure they are
 * released here.
 *
 * Returns the fd from bch2_run_thread_with_stdio() on success, or a
 * negative errno.
 */
long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg)
{
	struct fsck_thread *thr = NULL;
	long ret = 0;

	if (arg.flags)
		return -EINVAL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* keep the fs alive (at least ro) for the duration of the run */
	if (!bch2_ro_ref_tryget(c))
		return -EROFS;

	/* only one recovery/fsck run at a time */
	if (down_trylock(&c->recovery.run_lock)) {
		bch2_ro_ref_put(c);
		return -EAGAIN;
	}

	thr = kzalloc(sizeof(*thr), GFP_KERNEL);
	if (!thr) {
		ret = -ENOMEM;
		goto err;
	}

	thr->c = c;
	thr->opts = bch2_opts_empty();

	if (arg.opts) {
		char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);

		ret =   PTR_ERR_OR_ZERO(optstr) ?:
			bch2_parse_mount_opts(c, &thr->opts, NULL, optstr, false);
		if (!IS_ERR(optstr))
			kfree(optstr);

		if (ret)
			goto err;
	}

	/* on success, thr, the lock and the ro ref now belong to the thread */
	ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops);
err:
	if (ret < 0) {
		bch_err_fn(c, ret);
		if (thr)
			bch2_fsck_thread_exit(&thr->thr);
		up(&c->recovery.run_lock);
		bch2_ro_ref_put(c);
	}
	return ret;
}
3265 
3266 #endif /* NO_BCACHEFS_CHARDEV */
3267