xref: /linux/fs/bcachefs/fsck.c (revision 6f2a71a99ebd5dfaa7948a2e9c59eae94b741bd8)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "bcachefs.h"
4 #include "bcachefs_ioctl.h"
5 #include "bkey_buf.h"
6 #include "btree_cache.h"
7 #include "btree_update.h"
8 #include "buckets.h"
9 #include "darray.h"
10 #include "dirent.h"
11 #include "error.h"
12 #include "fs.h"
13 #include "fsck.h"
14 #include "inode.h"
15 #include "keylist.h"
16 #include "namei.h"
17 #include "recovery_passes.h"
18 #include "snapshot.h"
19 #include "super.h"
20 #include "thread_with_file.h"
21 #include "xattr.h"
22 
23 #include <linux/bsearch.h>
24 #include <linux/dcache.h> /* struct qstr */
25 
dirent_points_to_inode_nowarn(struct bch_fs * c,struct bkey_s_c_dirent d,struct bch_inode_unpacked * inode)26 static int dirent_points_to_inode_nowarn(struct bch_fs *c,
27 					 struct bkey_s_c_dirent d,
28 					 struct bch_inode_unpacked *inode)
29 {
30 	if (d.v->d_type == DT_SUBVOL
31 	    ? le32_to_cpu(d.v->d_child_subvol)	== inode->bi_subvol
32 	    : le64_to_cpu(d.v->d_inum)		== inode->bi_inum)
33 		return 0;
34 	return bch_err_throw(c, ENOENT_dirent_doesnt_match_inode);
35 }
36 
/*
 * Format the "inode points to dirent that does not point back" fsck message:
 * prints the dirent key/value followed by the unpacked inode into @out.
 */
static void dirent_inode_mismatch_msg(struct printbuf *out,
				      struct bch_fs *c,
				      struct bkey_s_c_dirent dirent,
				      struct bch_inode_unpacked *inode)
{
	prt_str(out, "inode points to dirent that does not point back:");
	prt_newline(out);
	bch2_bkey_val_to_text(out, c, dirent.s_c);
	prt_newline(out);
	bch2_inode_unpacked_to_text(out, inode);
}
48 
dirent_points_to_inode(struct bch_fs * c,struct bkey_s_c_dirent dirent,struct bch_inode_unpacked * inode)49 static int dirent_points_to_inode(struct bch_fs *c,
50 				  struct bkey_s_c_dirent dirent,
51 				  struct bch_inode_unpacked *inode)
52 {
53 	int ret = dirent_points_to_inode_nowarn(c, dirent, inode);
54 	if (ret) {
55 		struct printbuf buf = PRINTBUF;
56 		dirent_inode_mismatch_msg(&buf, c, dirent, inode);
57 		bch_warn(c, "%s", buf.buf);
58 		printbuf_exit(&buf);
59 	}
60 	return ret;
61 }
62 
63 /*
64  * XXX: this is handling transaction restarts without returning
65  * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
66  */
bch2_count_inode_sectors(struct btree_trans * trans,u64 inum,u32 snapshot)67 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
68 				    u32 snapshot)
69 {
70 	u64 sectors = 0;
71 
72 	int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
73 				SPOS(inum, 0, snapshot),
74 				POS(inum, U64_MAX),
75 				0, k, ({
76 		if (bkey_extent_is_allocation(k.k))
77 			sectors += k.k->size;
78 		0;
79 	}));
80 
81 	return ret ?: sectors;
82 }
83 
bch2_count_subdirs(struct btree_trans * trans,u64 inum,u32 snapshot)84 static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
85 				    u32 snapshot)
86 {
87 	u64 subdirs = 0;
88 
89 	int ret = for_each_btree_key_max(trans, iter, BTREE_ID_dirents,
90 				    SPOS(inum, 0, snapshot),
91 				    POS(inum, U64_MAX),
92 				    0, k, ({
93 		if (k.k->type == KEY_TYPE_dirent &&
94 		    bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
95 			subdirs++;
96 		0;
97 	}));
98 
99 	return ret ?: subdirs;
100 }
101 
/*
 * Look up subvolume @subvol, returning its snapshot id and root inode number.
 * Outputs are written unconditionally; only valid when 0 is returned.
 */
static int subvol_lookup(struct btree_trans *trans, u32 subvol,
			 u32 *snapshot, u64 *inum)
{
	struct bch_subvolume v;
	int ret = bch2_subvolume_get(trans, subvol, false, &v);

	*snapshot	= le32_to_cpu(v.snapshot);
	*inum		= le64_to_cpu(v.inode);
	return ret;
}
112 
/*
 * Look up @name in directory @dir within a specific @snapshot, returning the
 * target inode number and dirent type on success.
 */
static int lookup_dirent_in_snapshot(struct btree_trans *trans,
			   struct bch_hash_info hash_info,
			   subvol_inum dir, struct qstr *name,
			   u64 *target, unsigned *type, u32 snapshot)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
							 &hash_info, dir, name, 0, snapshot);
	int ret = bkey_err(k);
	if (ret)
		return ret;

	struct bkey_s_c_dirent dirent = bkey_s_c_to_dirent(k);

	*target	= le64_to_cpu(dirent.v->d_inum);
	*type	= dirent.v->d_type;

	bch2_trans_iter_exit(trans, &iter);
	return 0;
}
131 
132 /*
133  * Find any subvolume associated with a tree of snapshots
134  * We can't rely on master_subvol - it might have been deleted.
135  */
find_snapshot_tree_subvol(struct btree_trans * trans,u32 tree_id,u32 * subvol)136 static int find_snapshot_tree_subvol(struct btree_trans *trans,
137 				     u32 tree_id, u32 *subvol)
138 {
139 	struct btree_iter iter;
140 	struct bkey_s_c k;
141 	int ret;
142 
143 	for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) {
144 		if (k.k->type != KEY_TYPE_snapshot)
145 			continue;
146 
147 		struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
148 		if (le32_to_cpu(s.v->tree) != tree_id)
149 			continue;
150 
151 		if (s.v->subvol) {
152 			*subvol = le32_to_cpu(s.v->subvol);
153 			goto found;
154 		}
155 	}
156 	ret = bch_err_throw(trans->c, ENOENT_no_snapshot_tree_subvol);
157 found:
158 	bch2_trans_iter_exit(trans, &iter);
159 	return ret;
160 }
161 
162 /* Get lost+found, create if it doesn't exist: */
lookup_lostfound(struct btree_trans * trans,u32 snapshot,struct bch_inode_unpacked * lostfound,u64 reattaching_inum)163 static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
164 			    struct bch_inode_unpacked *lostfound,
165 			    u64 reattaching_inum)
166 {
167 	struct bch_fs *c = trans->c;
168 	struct qstr lostfound_str = QSTR("lost+found");
169 	struct btree_iter lostfound_iter = {};
170 	u64 inum = 0;
171 	unsigned d_type = 0;
172 	int ret;
173 
174 	struct bch_snapshot_tree st;
175 	ret = bch2_snapshot_tree_lookup(trans,
176 			bch2_snapshot_tree(c, snapshot), &st);
177 	if (ret)
178 		return ret;
179 
180 	u32 subvolid;
181 	ret = find_snapshot_tree_subvol(trans,
182 				bch2_snapshot_tree(c, snapshot), &subvolid);
183 	bch_err_msg(c, ret, "finding subvol associated with snapshot tree %u",
184 		    bch2_snapshot_tree(c, snapshot));
185 	if (ret)
186 		return ret;
187 
188 	struct bch_subvolume subvol;
189 	ret = bch2_subvolume_get(trans, subvolid, false, &subvol);
190 	bch_err_msg(c, ret, "looking up subvol %u for snapshot %u", subvolid, snapshot);
191 	if (ret)
192 		return ret;
193 
194 	if (!subvol.inode) {
195 		struct btree_iter iter;
196 		struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter,
197 				BTREE_ID_subvolumes, POS(0, subvolid),
198 				0, subvolume);
199 		ret = PTR_ERR_OR_ZERO(subvol);
200 		if (ret)
201 			return ret;
202 
203 		subvol->v.inode = cpu_to_le64(reattaching_inum);
204 		bch2_trans_iter_exit(trans, &iter);
205 	}
206 
207 	subvol_inum root_inum = {
208 		.subvol = subvolid,
209 		.inum = le64_to_cpu(subvol.inode)
210 	};
211 
212 	struct bch_inode_unpacked root_inode;
213 	struct bch_hash_info root_hash_info;
214 	ret = bch2_inode_find_by_inum_snapshot(trans, root_inum.inum, snapshot, &root_inode, 0);
215 	bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
216 		    root_inum.inum, subvolid);
217 	if (ret)
218 		return ret;
219 
220 	root_hash_info = bch2_hash_info_init(c, &root_inode);
221 
222 	ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
223 			      &lostfound_str, &inum, &d_type, snapshot);
224 	if (bch2_err_matches(ret, ENOENT))
225 		goto create_lostfound;
226 
227 	bch_err_fn(c, ret);
228 	if (ret)
229 		return ret;
230 
231 	if (d_type != DT_DIR) {
232 		bch_err(c, "error looking up lost+found: not a directory");
233 		return bch_err_throw(c, ENOENT_not_directory);
234 	}
235 
236 	/*
237 	 * The bch2_check_dirents pass has already run, dangling dirents
238 	 * shouldn't exist here:
239 	 */
240 	ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, lostfound, 0);
241 	bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
242 		    inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
243 	return ret;
244 
245 create_lostfound:
246 	/*
247 	 * we always create lost+found in the root snapshot; we don't want
248 	 * different branches of the snapshot tree to have different lost+found
249 	 */
250 	snapshot = le32_to_cpu(st.root_snapshot);
251 	/*
252 	 * XXX: we could have a nicer log message here  if we had a nice way to
253 	 * walk backpointers to print a path
254 	 */
255 	struct printbuf path = PRINTBUF;
256 	ret = bch2_inum_to_path(trans, root_inum, &path);
257 	if (ret)
258 		goto err;
259 
260 	bch_notice(c, "creating %s/lost+found in subvol %llu snapshot %u",
261 		   path.buf, root_inum.subvol, snapshot);
262 	printbuf_exit(&path);
263 
264 	u64 now = bch2_current_time(c);
265 	u64 cpu = raw_smp_processor_id();
266 
267 	bch2_inode_init_early(c, lostfound);
268 	bch2_inode_init_late(c, lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
269 	lostfound->bi_dir = root_inode.bi_inum;
270 	lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot);
271 
272 	root_inode.bi_nlink++;
273 
274 	ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
275 	if (ret)
276 		goto err;
277 
278 	bch2_btree_iter_set_snapshot(trans, &lostfound_iter, snapshot);
279 	ret = bch2_btree_iter_traverse(trans, &lostfound_iter);
280 	if (ret)
281 		goto err;
282 
283 	ret =   bch2_dirent_create_snapshot(trans,
284 				0, root_inode.bi_inum, snapshot, &root_hash_info,
285 				mode_to_type(lostfound->bi_mode),
286 				&lostfound_str,
287 				lostfound->bi_inum,
288 				&lostfound->bi_dir_offset,
289 				BTREE_UPDATE_internal_snapshot_node|
290 				STR_HASH_must_create) ?:
291 		bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
292 				       BTREE_UPDATE_internal_snapshot_node);
293 err:
294 	bch_err_msg(c, ret, "creating lost+found");
295 	bch2_trans_iter_exit(trans, &lostfound_iter);
296 	return ret;
297 }
298 
inode_should_reattach(struct bch_inode_unpacked * inode)299 static inline bool inode_should_reattach(struct bch_inode_unpacked *inode)
300 {
301 	if (inode->bi_inum == BCACHEFS_ROOT_INO &&
302 	    inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)
303 		return false;
304 
305 	/*
306 	 * Subvolume roots are special: older versions of subvolume roots may be
307 	 * disconnected, it's only the newest version that matters.
308 	 *
309 	 * We only keep a single dirent pointing to a subvolume root, i.e.
310 	 * older versions of snapshots will not have a different dirent pointing
311 	 * to the same subvolume root.
312 	 *
313 	 * This is because dirents that point to subvolumes are only visible in
314 	 * the parent subvolume - versioning is not needed - and keeping them
315 	 * around would break fsck, because when we're crossing subvolumes we
316 	 * don't have a consistent snapshot ID to do check the inode <-> dirent
317 	 * relationships.
318 	 *
319 	 * Thus, a subvolume root that's been renamed after a snapshot will have
320 	 * a disconnected older version - that's expected.
321 	 *
322 	 * Note that taking a snapshot always updates the root inode (to update
323 	 * the dirent backpointer), so a subvolume root inode with
324 	 * BCH_INODE_has_child_snapshot is never visible.
325 	 */
326 	if (inode->bi_subvol &&
327 	    (inode->bi_flags & BCH_INODE_has_child_snapshot))
328 		return false;
329 
330 	return !bch2_inode_has_backpointer(inode) &&
331 		!(inode->bi_flags & BCH_INODE_unlinked);
332 }
333 
/*
 * Whiteout the dirent at @d_pos in @snapshot, if a key exists at exactly that
 * position (i.e. it wasn't already overwritten in this snapshot).
 */
static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents,
					SPOS(d_pos.inode, d_pos.offset, snapshot),
					BTREE_ITER_intent|
					BTREE_ITER_with_updates);
	int ret = bkey_err(k);
	if (ret)
		return ret;

	if (!bpos_eq(k.k->p, d_pos))
		goto out;

	/*
	 * delete_at() doesn't work because the update path doesn't
	 * internally use BTREE_ITER_with_updates yet
	 */
	struct bkey_i *whiteout = bch2_trans_kmalloc(trans, sizeof(*whiteout));
	ret = PTR_ERR_OR_ZERO(whiteout);
	if (ret)
		goto out;

	bkey_init(&whiteout->k);
	whiteout->k.type	= KEY_TYPE_whiteout;
	whiteout->k.p		= iter.pos;
	ret = bch2_trans_update(trans, &iter, whiteout, BTREE_UPDATE_internal_snapshot_node);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
364 
/*
 * Reattach a disconnected inode into lost+found: creates a dirent for it,
 * updates the inode's backpointer fields, and fixes up versions of the inode
 * in child snapshots.
 */
static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
{
	struct bch_fs *c = trans->c;
	struct bch_inode_unpacked lostfound;
	char name_buf[20];
	int ret;

	u32 dirent_snapshot = inode->bi_snapshot;
	if (inode->bi_subvol) {
		/*
		 * Reattaching a subvolume root: reparent the subvolume to the
		 * root subvolume, and create the dirent in the parent's
		 * snapshot:
		 */
		inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;

		struct btree_iter subvol_iter;
		struct bkey_i_subvolume *subvol =
			bch2_bkey_get_mut_typed(trans, &subvol_iter,
						BTREE_ID_subvolumes, POS(0, inode->bi_subvol),
						0, subvolume);
		ret = PTR_ERR_OR_ZERO(subvol);
		if (ret)
			return ret;

		subvol->v.fs_path_parent = BCACHEFS_ROOT_SUBVOL;
		bch2_trans_iter_exit(trans, &subvol_iter);

		u64 root_inum;
		ret = subvol_lookup(trans, inode->bi_parent_subvol,
				    &dirent_snapshot, &root_inum);
		if (ret)
			return ret;

		snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
	} else {
		snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
	}

	ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
	if (ret)
		return ret;

	bch_verbose(c, "got lostfound inum %llu", lostfound.bi_inum);

	/* Directories bump the parent's link count: */
	lostfound.bi_nlink += S_ISDIR(inode->bi_mode);

	/* ensure lost+found inode is also present in inode snapshot */
	if (!inode->bi_subvol) {
		BUG_ON(!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, lostfound.bi_snapshot));
		lostfound.bi_snapshot = inode->bi_snapshot;
	}

	ret = __bch2_fsck_write_inode(trans, &lostfound);
	if (ret)
		return ret;

	struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound);
	struct qstr name = QSTR(name_buf);

	/* Set the backpointer before creating the dirent: */
	inode->bi_dir = lostfound.bi_inum;

	ret = bch2_dirent_create_snapshot(trans,
				inode->bi_parent_subvol, lostfound.bi_inum,
				dirent_snapshot,
				&dir_hash,
				inode_d_type(inode),
				&name,
				inode->bi_subvol ?: inode->bi_inum,
				&inode->bi_dir_offset,
				BTREE_UPDATE_internal_snapshot_node|
				STR_HASH_must_create);
	if (ret) {
		bch_err_msg(c, ret, "error creating dirent");
		return ret;
	}

	ret = __bch2_fsck_write_inode(trans, inode);
	if (ret)
		return ret;

	{
		CLASS(printbuf, buf)();
		ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum,
						 inode->bi_snapshot, NULL, &buf);
		if (ret)
			return ret;

		bch_info(c, "reattached at %s", buf.buf);
	}

	/*
	 * Fix up inodes in child snapshots: if they should also be reattached
	 * update the backpointer field, if they should not be we need to emit
	 * whiteouts for the dirent we just created.
	 */
	if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) {
		snapshot_id_list whiteouts_done;
		struct btree_iter iter;
		struct bkey_s_c k;

		darray_init(&whiteouts_done);

		/* Scan descendent snapshots of this inode, newest first: */
		for_each_btree_key_reverse_norestart(trans, iter,
				BTREE_ID_inodes, SPOS(0, inode->bi_inum, inode->bi_snapshot - 1),
				BTREE_ITER_all_snapshots|BTREE_ITER_intent, k, ret) {
			if (k.k->p.offset != inode->bi_inum)
				break;

			if (!bkey_is_inode(k.k) ||
			    !bch2_snapshot_is_ancestor(c, k.k->p.snapshot, inode->bi_snapshot) ||
			    snapshot_list_has_ancestor(c, &whiteouts_done, k.k->p.snapshot))
				continue;

			struct bch_inode_unpacked child_inode;
			ret = bch2_inode_unpack(k, &child_inode);
			if (ret)
				break;

			if (!inode_should_reattach(&child_inode)) {
				/* hide the new dirent from this snapshot: */
				ret = maybe_delete_dirent(trans,
							  SPOS(lostfound.bi_inum, inode->bi_dir_offset,
							       dirent_snapshot),
							  k.k->p.snapshot);
				if (ret)
					break;

				ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot);
				if (ret)
					break;
			} else {
				/* propagate the new backpointer: */
				iter.snapshot = k.k->p.snapshot;
				child_inode.bi_dir = inode->bi_dir;
				child_inode.bi_dir_offset = inode->bi_dir_offset;

				ret = bch2_inode_write_flags(trans, &iter, &child_inode,
							     BTREE_UPDATE_internal_snapshot_node);
				if (ret)
					break;
			}
		}
		darray_exit(&whiteouts_done);
		bch2_trans_iter_exit(trans, &iter);
	}

	return ret;
}
507 
/* Fetch the dirent at @pos; caller checks bkey_err() and exits @iter. */
static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
						struct btree_iter *iter,
						struct bpos pos)
{
	return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
}
514 
remove_backpointer(struct btree_trans * trans,struct bch_inode_unpacked * inode)515 static int remove_backpointer(struct btree_trans *trans,
516 			      struct bch_inode_unpacked *inode)
517 {
518 	if (!bch2_inode_has_backpointer(inode))
519 		return 0;
520 
521 	u32 snapshot = inode->bi_snapshot;
522 
523 	if (inode->bi_parent_subvol) {
524 		int ret = bch2_subvolume_get_snapshot(trans, inode->bi_parent_subvol, &snapshot);
525 		if (ret)
526 			return ret;
527 	}
528 
529 	struct bch_fs *c = trans->c;
530 	struct btree_iter iter;
531 	struct bkey_s_c_dirent d = dirent_get_by_pos(trans, &iter,
532 				     SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot));
533 	int ret = bkey_err(d) ?:
534 		  dirent_points_to_inode(c, d, inode) ?:
535 		  bch2_fsck_remove_dirent(trans, d.k->p);
536 	bch2_trans_iter_exit(trans, &iter);
537 	return ret;
538 }
539 
reattach_subvol(struct btree_trans * trans,struct bkey_s_c_subvolume s)540 static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s)
541 {
542 	struct bch_fs *c = trans->c;
543 
544 	struct bch_inode_unpacked inode;
545 	int ret = bch2_inode_find_by_inum_trans(trans,
546 				(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
547 				&inode);
548 	if (ret)
549 		return ret;
550 
551 	ret = remove_backpointer(trans, &inode);
552 	if (!bch2_err_matches(ret, ENOENT))
553 		bch_err_msg(c, ret, "removing dirent");
554 	if (ret)
555 		return ret;
556 
557 	ret = reattach_inode(trans, &inode);
558 	bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
559 	return ret;
560 }
561 
/*
 * Recreate a missing subvolume key for @subvolid in @snapshotid, with root
 * inode @inum (creating a new root inode first if @inum is 0), then point the
 * snapshot and snapshot_tree keys back at it.
 */
static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
{
	struct bch_fs *c = trans->c;

	if (!bch2_snapshot_is_leaf(c, snapshotid)) {
		bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
		return bch_err_throw(c, fsck_repair_unimplemented);
	}

	/*
	 * If inum isn't set, that means we're being called from check_dirents,
	 * not check_inodes - the root of this subvolume doesn't exist or we
	 * would have found it there:
	 */
	if (!inum) {
		struct btree_iter inode_iter = {};
		struct bch_inode_unpacked new_inode;
		u64 cpu = raw_smp_processor_id();

		bch2_inode_init_early(c, &new_inode);
		bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);

		new_inode.bi_subvol = subvolid;

		int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
			  bch2_btree_iter_traverse(trans, &inode_iter) ?:
			  bch2_inode_write(trans, &inode_iter, &new_inode);
		bch2_trans_iter_exit(trans, &inode_iter);
		if (ret)
			return ret;

		inum = new_inode.bi_inum;
	}

	bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);

	struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
	int ret = PTR_ERR_OR_ZERO(new_subvol);
	if (ret)
		return ret;

	bkey_subvolume_init(&new_subvol->k_i);
	new_subvol->k.p.offset	= subvolid;
	new_subvol->v.snapshot	= cpu_to_le32(snapshotid);
	new_subvol->v.inode	= cpu_to_le64(inum);
	ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
	if (ret)
		return ret;

	/* Point the snapshot key back at the new subvolume: */
	struct btree_iter iter;
	struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_snapshots, POS(0, snapshotid),
			0, snapshot);
	ret = PTR_ERR_OR_ZERO(s);
	bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
	if (ret)
		return ret;

	u32 snapshot_tree = le32_to_cpu(s->v.tree);

	s->v.subvol = cpu_to_le32(subvolid);
	SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
	bch2_trans_iter_exit(trans, &iter);

	/* ... and make it the tree's master subvol if there isn't one: */
	struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
			0, snapshot_tree);
	ret = PTR_ERR_OR_ZERO(st);
	bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
	if (ret)
		return ret;

	if (!st->v.master_subvol)
		st->v.master_subvol = cpu_to_le32(subvolid);

	bch2_trans_iter_exit(trans, &iter);
	return 0;
}
640 
/*
 * Synthesize an inode for keys found at @inum:@snapshot in @btree with no
 * owning inode: the mode is inferred from which btree the keys were found in
 * (extents -> regular file, dirents -> directory, xattrs -> regular file).
 */
static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum)
{
	struct bch_fs *c = trans->c;
	unsigned i_mode = S_IFREG;
	u64 i_size = 0;

	switch (btree) {
	case BTREE_ID_extents: {
		struct btree_iter iter = {};

		/* reconstruct i_size from the last extent: */
		bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
		struct bkey_s_c k = bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum, 0));
		bch2_trans_iter_exit(trans, &iter);
		int ret = bkey_err(k);
		if (ret)
			return ret;

		/*
		 * NOTE(review): k.k is dereferenced without a NULL check;
		 * presumably callers only get here after seeing an extent for
		 * this inum, so the lookup always finds one - confirm.
		 */
		i_size = k.k->p.offset << 9;	/* sectors -> bytes */
		break;
	}
	case BTREE_ID_dirents:
		i_mode = S_IFDIR;
		break;
	case BTREE_ID_xattrs:
		break;
	default:
		BUG();
	}

	struct bch_inode_unpacked new_inode;
	bch2_inode_init_early(c, &new_inode);
	bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL);
	new_inode.bi_size = i_size;
	new_inode.bi_inum = inum;
	new_inode.bi_snapshot = snapshot;

	return __bch2_fsck_write_inode(trans, &new_inode);
}
679 
/* Free the snapshot id list owned by @s. */
static inline void snapshots_seen_exit(struct snapshots_seen *s)
{
	darray_exit(&s->ids);
}
684 
/* Zero-initialize @s; pair with snapshots_seen_exit(). */
static inline void snapshots_seen_init(struct snapshots_seen *s)
{
	memset(s, 0, sizeof(*s));
}
689 
/* Insert @id into the sorted list @s->ids, ignoring duplicates. */
static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
{
	u32 *i;

	/* find the insertion point in the sorted list: */
	__darray_for_each(s->ids, i)
		if (*i >= id)
			break;

	/* already present? */
	if (i < s->ids.data + s->ids.nr && *i == id)
		return 0;

	int ret = darray_insert_item(&s->ids, i - s->ids.data, id);
	if (ret)
		bch_err(c, "error reallocating snapshots_seen table (size %zu)",
			s->ids.size);
	return ret;
}
706 
snapshots_seen_update(struct bch_fs * c,struct snapshots_seen * s,enum btree_id btree_id,struct bpos pos)707 static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
708 				 enum btree_id btree_id, struct bpos pos)
709 {
710 	if (!bkey_eq(s->pos, pos))
711 		s->ids.nr = 0;
712 	s->pos = pos;
713 
714 	return snapshot_list_add_nodup(c, &s->ids, pos.snapshot);
715 }
716 
717 /**
718  * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
719  * and @ancestor hasn't been overwritten in @seen
720  *
721  * @c:		filesystem handle
722  * @seen:	list of snapshot ids already seen at current position
723  * @id:		descendent snapshot id
724  * @ancestor:	ancestor snapshot id
725  *
726  * Returns:	whether key in @ancestor snapshot is visible in @id snapshot
727  */
key_visible_in_snapshot(struct bch_fs * c,struct snapshots_seen * seen,u32 id,u32 ancestor)728 static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
729 				    u32 id, u32 ancestor)
730 {
731 	EBUG_ON(id > ancestor);
732 
733 	if (id == ancestor)
734 		return true;
735 
736 	if (!bch2_snapshot_is_ancestor(c, id, ancestor))
737 		return false;
738 
739 	/*
740 	 * We know that @id is a descendant of @ancestor, we're checking if
741 	 * we've seen a key that overwrote @ancestor - i.e. also a descendent of
742 	 * @ascestor and with @id as a descendent.
743 	 *
744 	 * But we already know that we're scanning IDs between @id and @ancestor
745 	 * numerically, since snapshot ID lists are kept sorted, so if we find
746 	 * an id that's an ancestor of @id we're done:
747 	 */
748 	darray_for_each_reverse(seen->ids, i)
749 		if (*i != ancestor && bch2_snapshot_is_ancestor(c, id, *i))
750 			return false;
751 
752 	return true;
753 }
754 
755 /**
756  * ref_visible - given a key with snapshot id @src that points to a key with
757  * snapshot id @dst, test whether there is some snapshot in which @dst is
758  * visible.
759  *
760  * @c:		filesystem handle
761  * @s:		list of snapshot IDs already seen at @src
762  * @src:	snapshot ID of src key
763  * @dst:	snapshot ID of dst key
764  * Returns:	true if there is some snapshot in which @dst is visible
765  *
766  * Assumes we're visiting @src keys in natural key order
767  */
ref_visible(struct bch_fs * c,struct snapshots_seen * s,u32 src,u32 dst)768 static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
769 			u32 src, u32 dst)
770 {
771 	return dst <= src
772 		? key_visible_in_snapshot(c, s, dst, src)
773 		: bch2_snapshot_is_ancestor(c, src, dst);
774 }
775 
/*
 * Like ref_visible(), but with seen-lists for both positions; normalizes so
 * that src is the numerically larger (ancestor-side) id.
 */
static int ref_visible2(struct bch_fs *c,
			u32 src, struct snapshots_seen *src_seen,
			u32 dst, struct snapshots_seen *dst_seen)
{
	bool swapped = dst > src;

	if (swapped) {
		swap(dst, src);
		swap(dst_seen, src_seen);
	}

	return key_visible_in_snapshot(c, src_seen, dst, src);
}
786 
/*
 * Iterate over the inode versions in walker @_w that are visible in
 * @_snapshot, given the snapshot ids already seen at this position in @_s:
 */
#define for_each_visible_inode(_c, _s, _w, _snapshot, _i)				\
	for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr &&	\
	     (_i)->inode.bi_snapshot <= (_snapshot); _i++)				\
		if (key_visible_in_snapshot(_c, _s, _i->inode.bi_snapshot, _snapshot))

/* One version of an inode, in one snapshot: */
struct inode_walker_entry {
	struct bch_inode_unpacked inode;
	bool			whiteout;	/* key was a whiteout, not an inode */
	u64			count;		/* accumulator, e.g. sectors seen */
	u64			i_size;		/* accumulator for i_size checks */
};

/* Walks all snapshot versions of each inode number as keys are scanned: */
struct inode_walker {
	bool				first_this_inode;
	bool				have_inodes;
	bool				recalculate_sums;
	struct bpos			last_pos;	/* last key position seen */

	DARRAY(struct inode_walker_entry) inodes;	/* sorted by snapshot */
	snapshot_id_list		deletes;	/* whiteout snapshots */
};
808 
/* Free the darrays owned by @w. */
static void inode_walker_exit(struct inode_walker *w)
{
	darray_exit(&w->inodes);
	darray_exit(&w->deletes);
}
814 
/* Return a zero-initialized walker; pair with inode_walker_exit(). */
static struct inode_walker inode_walker_init(void)
{
	return (struct inode_walker) { 0, };
}
819 
add_inode(struct bch_fs * c,struct inode_walker * w,struct bkey_s_c inode)820 static int add_inode(struct bch_fs *c, struct inode_walker *w,
821 		     struct bkey_s_c inode)
822 {
823 	int ret = darray_push(&w->inodes, ((struct inode_walker_entry) {
824 		.whiteout	= !bkey_is_inode(inode.k),
825 	}));
826 	if (ret)
827 		return ret;
828 
829 	struct inode_walker_entry *n = &darray_last(w->inodes);
830 	if (!n->whiteout) {
831 		return bch2_inode_unpack(inode, &n->inode);
832 	} else {
833 		n->inode.bi_inum	= inode.k->p.offset;
834 		n->inode.bi_snapshot	= inode.k->p.snapshot;
835 		return 0;
836 	}
837 }
838 
/*
 * Load every snapshot version of inode @inum into @w; called when the btree
 * scan crosses into a new inode number.
 */
static int get_inodes_all_snapshots(struct btree_trans *trans,
				    struct inode_walker *w, u64 inum)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	/*
	 * We no longer have inodes for w->last_pos; clear this to avoid
	 * screwing up check_i_sectors/check_subdir_count if we take a
	 * transaction restart here:
	 */
	w->have_inodes = false;
	w->recalculate_sums = false;
	w->inodes.nr = 0;

	for_each_btree_key_max_norestart(trans, iter,
			BTREE_ID_inodes, POS(0, inum), SPOS(0, inum, U32_MAX),
			BTREE_ITER_all_snapshots, k, ret) {
		ret = add_inode(c, w, k);
		if (ret)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		return ret;

	w->first_this_inode = true;
	w->have_inodes = true;
	return 0;
}
872 
/*
 * Load the versions of inode @inum that are visible from @s->pos.snapshot,
 * collecting whiteout snapshots in w->deletes instead of w->inodes.
 */
static int get_visible_inodes(struct btree_trans *trans,
			      struct inode_walker *w,
			      struct snapshots_seen *s,
			      u64 inum)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	w->inodes.nr = 0;
	w->deletes.nr = 0;

	/* scan newest snapshot first, so whiteouts shadow older versions: */
	for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot),
			   BTREE_ITER_all_snapshots, k, ret) {
		if (k.k->p.offset != inum)
			break;

		if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot))
			continue;

		/* already whited out in a newer snapshot: */
		if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot))
			continue;

		ret = bkey_is_inode(k.k)
			? add_inode(c, w, k)
			: snapshot_list_add(c, &w->deletes, k.k->p.snapshot);
		if (ret)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}
907 
/*
 * Find the walker entry whose inode version covers key @k (the first entry
 * whose snapshot is @k's snapshot or an ancestor of it). If the inode only
 * exists in an ancestor snapshot, repair by copying the inode (or whiteout)
 * into @k's snapshot.
 *
 * Returns the entry, NULL if no visible inode, or an ERR_PTR on error /
 * transaction restart.
 */
static struct inode_walker_entry *
lookup_inode_for_snapshot(struct btree_trans *trans, struct inode_walker *w, struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;

	struct inode_walker_entry *i = darray_find_p(w->inodes, i,
		    bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->inode.bi_snapshot));

	if (!i)
		return NULL;

	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (fsck_err_on(k.k->p.snapshot != i->inode.bi_snapshot,
			trans, snapshot_key_missing_inode_snapshot,
			 "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
			 "unexpected because we should always update the inode when we update a key in that inode\n"
			 "%s",
			 w->last_pos.inode, k.k->p.snapshot, i->inode.bi_snapshot,
			 (bch2_bkey_val_to_text(&buf, c, k),
			  buf.buf))) {
		if (!i->whiteout) {
			/* copy the ancestor's inode into the key's snapshot: */
			struct bch_inode_unpacked new = i->inode;
			new.bi_snapshot = k.k->p.snapshot;
			ret = __bch2_fsck_write_inode(trans, &new);
		} else {
			/* ancestor version was a whiteout: propagate it */
			struct bkey_i whiteout;
			bkey_init(&whiteout.k);
			whiteout.k.type = KEY_TYPE_whiteout;
			whiteout.k.p = SPOS(0, i->inode.bi_inum, k.k->p.snapshot);
			ret = bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
							  &whiteout,
							  BTREE_UPDATE_internal_snapshot_node);
		}

		if (ret)
			goto fsck_err;

		ret = bch2_trans_commit(trans, NULL, NULL, 0);
		if (ret)
			goto fsck_err;

		/* mirror the repair in the in-memory walker list: */
		struct inode_walker_entry new_entry = *i;

		new_entry.inode.bi_snapshot	= k.k->p.snapshot;
		new_entry.count			= 0;
		new_entry.i_size		= 0;

		/* keep w->inodes sorted by snapshot id: */
		while (i > w->inodes.data && i[-1].inode.bi_snapshot > k.k->p.snapshot)
			--i;

		size_t pos = i - w->inodes.data;
		ret = darray_insert_item(&w->inodes, pos, new_entry);
		if (ret)
			goto fsck_err;

		/* we committed mid-scan; caller must restart */
		ret = bch_err_throw(c, transaction_restart_nested);
		goto fsck_err;
	}

	printbuf_exit(&buf);
	return i;
fsck_err:
	printbuf_exit(&buf);
	return ERR_PTR(ret);
}
975 
/*
 * Return the inode_walker entry matching key @k, refreshing the walker's
 * per-snapshot list of inode versions when we've advanced to a new inode
 * number.  Returns NULL or ERR_PTR() as lookup_inode_for_snapshot() does.
 */
static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
					     struct inode_walker *w,
					     struct bkey_s_c k)
{
	if (w->last_pos.inode != k.k->p.inode) {
		int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
		if (ret)
			return ERR_PTR(ret);
	}

	w->last_pos = k.k->p;

	return lookup_inode_for_snapshot(trans, w, k);
}
990 
/*
 * Prefer to delete the first one, since that will be the one at the wrong
 * offset:
 * return value: 0 -> delete k1, 1 -> delete k2
 *
 * NOTE(review): this comment describes a duplicate-key winner-picking helper
 * (returning 0/1), not bch2_fsck_update_backpointers() below, which returns
 * 0 or a negative error; it appears stale - confirm and relocate or drop.
 */
/*
 * After fsck has (re)inserted hashed key @new, update the backpointer fields
 * of the inode(s) it points to: every version of the target inode visible
 * from @s gets bi_dir_offset set to the dirent's offset.
 *
 * No-op for non-dirent keys (other hashed types carry no inode
 * backpointers); repairing DT_SUBVOL dirents is not yet implemented.
 */
int bch2_fsck_update_backpointers(struct btree_trans *trans,
				  struct snapshots_seen *s,
				  const struct bch_hash_desc desc,
				  struct bch_hash_info *hash_info,
				  struct bkey_i *new)
{
	if (new->k.type != KEY_TYPE_dirent)
		return 0;

	struct bkey_i_dirent *d = bkey_i_to_dirent(new);
	struct inode_walker target = inode_walker_init();
	int ret = 0;

	if (d->v.d_type == DT_SUBVOL) {
		bch_err(trans->c, "%s does not support DT_SUBVOL", __func__);
		ret = -BCH_ERR_fsck_repair_unimplemented;
	} else {
		ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum));
		if (ret)
			goto err;

		/* Only the offset is updated here; bi_dir (parent inum) is untouched */
		darray_for_each(target.inodes, i) {
			i->inode.bi_dir_offset = d->k.p.offset;
			ret = __bch2_fsck_write_inode(trans, &i->inode);
			if (ret)
				goto err;
		}
	}
err:
	inode_walker_exit(&target);
	return ret;
}
1028 
/*
 * Look up the dirent that an inode's backpointer fields (bi_dir,
 * bi_dir_offset) point at.
 *
 * For a subvolume root the dirent lives in the parent subvolume, so first
 * resolve the parent subvolume to find the snapshot to look in (updating
 * *snapshot).  On lookup failure the error is returned via the .k field.
 */
static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
					       struct btree_iter *iter,
					       struct bch_inode_unpacked *inode,
					       u32 *snapshot)
{
	if (inode->bi_subvol) {
		u64 inum;
		int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);

		if (ret)
			return (struct bkey_s_c_dirent) { .k = ERR_PTR(ret) };
	}

	struct bpos dirent_pos = SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot);

	return dirent_get_by_pos(trans, iter, dirent_pos);
}
1043 
check_inode_deleted_list(struct btree_trans * trans,struct bpos p)1044 static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
1045 {
1046 	struct btree_iter iter;
1047 	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
1048 	int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
1049 	bch2_trans_iter_exit(trans, &iter);
1050 	return ret;
1051 }
1052 
/*
 * Check an inode's dirent backpointer: bi_dir/bi_dir_offset should point at
 * a dirent that points back at this inode.  If the dirent is missing or
 * points elsewhere, clear the backpointer fields and set *write_inode so the
 * caller writes the inode back; check_dirents() will re-set them if a valid
 * dirent is found, otherwise check_path() will reattach the inode.
 */
static int check_inode_dirent_inode(struct btree_trans *trans,
				    struct bch_inode_unpacked *inode,
				    bool *write_inode)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;

	u32 inode_snapshot = inode->bi_snapshot;
	struct btree_iter dirent_iter = {};
	struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
	int ret = bkey_err(d);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	/* ret is now 0, or an ENOENT-class error (dirent missing) */
	if ((ret || dirent_points_to_inode_nowarn(c, d, inode)) &&
	    inode->bi_subvol &&
	    (inode->bi_flags & BCH_INODE_has_child_snapshot)) {
		/* Older version of a renamed subvolume root: we won't have a
		 * correct dirent for it. That's expected, see
		 * inode_should_reattach().
		 *
		 * We don't clear the backpointer field when doing the rename
		 * because there might be arbitrarily many versions in older
		 * snapshots.
		 */
		inode->bi_dir = 0;
		inode->bi_dir_offset = 0;
		*write_inode = true;
		goto out;
	}

	if (fsck_err_on(ret,
			trans, inode_points_to_missing_dirent,
			"inode points to missing dirent\n%s",
			(bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) ||
	    fsck_err_on(!ret && dirent_points_to_inode_nowarn(c, d, inode),
			trans, inode_points_to_wrong_dirent,
			"%s",
			(printbuf_reset(&buf),
			 dirent_inode_mismatch_msg(&buf, c, d, inode),
			 buf.buf))) {
		/*
		 * We just clear the backpointer fields for now. If we find a
		 * dirent that points to this inode in check_dirents(), we'll
		 * update it then; then when we get to check_path() if the
		 * backpointer is still 0 we'll reattach it.
		 */
		inode->bi_dir = 0;
		inode->bi_dir_offset = 0;
		*write_inode = true;
	}
out:
	ret = 0;
fsck_err:
	bch2_trans_iter_exit(trans, &dirent_iter);
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}
1112 
/*
 * Check and repair a single inode:
 *  - str_hash seed/type must match the snapshot-root version of this inode
 *  - backpointer, if present, must point at a dirent pointing back
 *  - unlinked inodes must not have dirents; unlinked dirs must be empty
 *  - directories must have i_size == 0
 *  - BCH_INODE_has_child_snapshot flag must match reality
 *  - unlinked inodes with no child snapshots are deleted (online fsck) or
 *    put on the deleted list (offline fsck)
 *  - subvolume root fields must point at a valid, matching subvolume
 *  - bi_journal_seq must not be in the future
 *
 * @snapshot_root caches the snapshot-root version of the current inode
 * number across calls, for the hash-info comparison.
 */
static int check_inode(struct btree_trans *trans,
		       struct btree_iter *iter,
		       struct bkey_s_c k,
		       struct bch_inode_unpacked *snapshot_root,
		       struct snapshots_seen *s)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	struct bch_inode_unpacked u;
	bool do_update = false;
	int ret;

	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret < 0)
		goto err;
	if (ret)
		/* positive return: key can be skipped */
		return 0;

	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
	if (ret)
		goto err;

	if (!bkey_is_inode(k.k))
		return 0;

	ret = bch2_inode_unpack(k, &u);
	if (ret)
		goto err;

	/* Moved to a new inode number - refresh the cached snapshot root: */
	if (snapshot_root->bi_inum != u.bi_inum) {
		ret = bch2_inode_find_snapshot_root(trans, u.bi_inum, snapshot_root);
		if (ret)
			goto err;
	}

	/* All snapshot versions of an inode must agree on hash seed and type: */
	if (u.bi_hash_seed	!= snapshot_root->bi_hash_seed ||
	    INODE_STR_HASH(&u)	!= INODE_STR_HASH(snapshot_root)) {
		ret = bch2_repair_inode_hash_info(trans, snapshot_root);
		BUG_ON(ret == -BCH_ERR_fsck_repair_unimplemented);
		if (ret)
			goto err;
	}

	ret = bch2_check_inode_has_case_insensitive(trans, &u, &s->ids, &do_update);
	if (ret)
		goto err;

	if (bch2_inode_has_backpointer(&u)) {
		ret = check_inode_dirent_inode(trans, &u, &do_update);
		if (ret)
			goto err;
	}

	/* An inode with a dirent pointing at it is not unlinked: */
	if (fsck_err_on(bch2_inode_has_backpointer(&u) &&
			(u.bi_flags & BCH_INODE_unlinked),
			trans, inode_unlinked_but_has_dirent,
			"inode unlinked but has dirent\n%s",
			(printbuf_reset(&buf),
			 bch2_inode_unpacked_to_text(&buf, &u),
			 buf.buf))) {
		u.bi_flags &= ~BCH_INODE_unlinked;
		do_update = true;
	}

	if (S_ISDIR(u.bi_mode) && (u.bi_flags & BCH_INODE_unlinked)) {
		/* Check for this early so that check_unreachable_inode() will reattach it */

		ret = bch2_empty_dir_snapshot(trans, k.k->p.offset, 0, k.k->p.snapshot);
		if (ret && ret != -BCH_ERR_ENOTEMPTY_dir_not_empty)
			goto err;

		fsck_err_on(ret, trans, inode_dir_unlinked_but_not_empty,
			    "dir unlinked but not empty\n%s",
			    (printbuf_reset(&buf),
			     bch2_inode_unpacked_to_text(&buf, &u),
			     buf.buf));
		u.bi_flags &= ~BCH_INODE_unlinked;
		do_update = true;
		ret = 0;
	}

	if (fsck_err_on(S_ISDIR(u.bi_mode) && u.bi_size,
			trans, inode_dir_has_nonzero_i_size,
			"directory %llu:%u with nonzero i_size %lli",
			u.bi_inum, u.bi_snapshot, u.bi_size)) {
		u.bi_size = 0;
		do_update = true;
	}

	ret = bch2_inode_has_child_snapshots(trans, k.k->p);
	if (ret < 0)
		goto err;

	/* ret is now 0 or 1: whether child snapshot versions actually exist */
	if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot),
			trans, inode_has_child_snapshots_wrong,
			"inode has_child_snapshots flag wrong (should be %u)\n%s",
			ret,
			(printbuf_reset(&buf),
			 bch2_inode_unpacked_to_text(&buf, &u),
			 buf.buf))) {
		if (ret)
			u.bi_flags |= BCH_INODE_has_child_snapshot;
		else
			u.bi_flags &= ~BCH_INODE_has_child_snapshot;
		do_update = true;
	}
	ret = 0;

	if ((u.bi_flags & BCH_INODE_unlinked) &&
	    !(u.bi_flags & BCH_INODE_has_child_snapshot)) {
		if (!test_bit(BCH_FS_started, &c->flags)) {
			/*
			 * If we're not in online fsck, don't delete unlinked
			 * inodes, just make sure they're on the deleted list.
			 *
			 * They might be referred to by a logged operation -
			 * i.e. we might have crashed in the middle of a
			 * truncate on an unlinked but open file - so we want to
			 * let the delete_dead_inodes kill it after resuming
			 * logged ops.
			 */
			ret = check_inode_deleted_list(trans, k.k->p);
			if (ret < 0)
				goto err_noprint;

			fsck_err_on(!ret,
				    trans, unlinked_inode_not_on_deleted_list,
				    "inode %llu:%u unlinked, but not on deleted list",
				    u.bi_inum, k.k->p.snapshot);

			ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, k.k->p, 1);
			if (ret)
				goto err;
		} else {
			ret = bch2_inode_or_descendents_is_open(trans, k.k->p);
			if (ret < 0)
				goto err;

			/* Online fsck: delete now, unless it's still open somewhere */
			if (fsck_err_on(!ret,
					trans, inode_unlinked_and_not_open,
				      "inode %llu:%u unlinked and not open",
				      u.bi_inum, u.bi_snapshot)) {
				ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
				bch_err_msg(c, ret, "in fsck deleting inode");
				goto err_noprint;
			}
			ret = 0;
		}
	}

	/* bi_parent_subvol only makes sense on a (non-root) subvolume root: */
	if (fsck_err_on(u.bi_parent_subvol &&
			(u.bi_subvol == 0 ||
			 u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
			trans, inode_bi_parent_nonzero,
			"inode %llu:%u has subvol %u but nonzero parent subvol %u",
			u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
		u.bi_parent_subvol = 0;
		do_update = true;
	}

	if (u.bi_subvol) {
		struct bch_subvolume s;

		ret = bch2_subvolume_get(trans, u.bi_subvol, false, &s);
		if (ret && !bch2_err_matches(ret, ENOENT))
			goto err;

		/* Subvolumes btree was lost: recreate the subvol rather than clearing */
		if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
			ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum);
			goto do_update;
		}

		if (fsck_err_on(ret,
				trans, inode_bi_subvol_missing,
				"inode %llu:%u bi_subvol points to missing subvolume %u",
				u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
		    fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
				!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
							   k.k->p.snapshot),
				trans, inode_bi_subvol_wrong,
				"inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
				u.bi_inum, k.k->p.snapshot, u.bi_subvol,
				le64_to_cpu(s.inode),
				le32_to_cpu(s.snapshot))) {
			u.bi_subvol = 0;
			u.bi_parent_subvol = 0;
			do_update = true;
		}
	}

	if (fsck_err_on(u.bi_journal_seq > journal_cur_seq(&c->journal),
			trans, inode_journal_seq_in_future,
			"inode journal seq in future (currently at %llu)\n%s",
			journal_cur_seq(&c->journal),
			(printbuf_reset(&buf),
			 bch2_inode_unpacked_to_text(&buf, &u),
			buf.buf))) {
		u.bi_journal_seq = journal_cur_seq(&c->journal);
		do_update = true;
	}
do_update:
	if (do_update) {
		ret = __bch2_fsck_write_inode(trans, &u);
		bch_err_msg(c, ret, "in fsck updating inode");
		if (ret)
			goto err_noprint;
	}
err:
fsck_err:
	bch_err_fn(c, ret);
err_noprint:
	printbuf_exit(&buf);
	return ret;
}
1327 
/*
 * fsck pass: walk every key in the inodes btree, in all snapshots, checking
 * and repairing each with check_inode(); each key is processed in its own
 * transaction commit.
 */
int bch2_check_inodes(struct bch_fs *c)
{
	struct bch_inode_unpacked snapshot_root = {};
	struct snapshots_seen s;

	snapshots_seen_init(&s);

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
				POS_MIN,
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_inode(trans, &iter, k, &snapshot_root, &s)));

	snapshots_seen_exit(&s);
	bch_err_fn(c, ret);
	return ret;
}
1346 
/*
 * Walk versions of @inode in ancestor snapshots (scanning upward from
 * bi_snapshot + 1) and replace *inode with the oldest version that also
 * needs reattaching, so the reattach happens there.
 */
static int find_oldest_inode_needs_reattach(struct btree_trans *trans,
					    struct bch_inode_unpacked *inode)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	/*
	 * We look for inodes to reattach in natural key order, leaves first,
	 * but we should do the reattach at the oldest version that needs to be
	 * reattached:
	 */
	for_each_btree_key_norestart(trans, iter,
				     BTREE_ID_inodes,
				     SPOS(0, inode->bi_inum, inode->bi_snapshot + 1),
				     BTREE_ITER_all_snapshots, k, ret) {
		if (k.k->p.offset != inode->bi_inum)
			break;

		/* Only versions our snapshot actually descends from: */
		if (!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, k.k->p.snapshot))
			continue;

		if (!bkey_is_inode(k.k))
			break;

		struct bch_inode_unpacked parent_inode;
		ret = bch2_inode_unpack(k, &parent_inode);
		if (ret)
			break;

		if (!inode_should_reattach(&parent_inode))
			break;

		*inode = parent_inode;
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}
1387 
/*
 * If this inode needs reattaching - it's not unlinked, but no dirent points
 * at it - find the oldest snapshot version that also needs it and reattach
 * that one, prompting the user first.
 */
static int check_unreachable_inode(struct btree_trans *trans,
				   struct btree_iter *iter,
				   struct bkey_s_c k)
{
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (!bkey_is_inode(k.k))
		return 0;

	struct bch_inode_unpacked inode;
	ret = bch2_inode_unpack(k, &inode);
	if (ret)
		return ret;

	if (!inode_should_reattach(&inode))
		return 0;

	/* Reattach at the oldest version that needs it: */
	ret = find_oldest_inode_needs_reattach(trans, &inode);
	if (ret)
		return ret;

	if (fsck_err(trans, inode_unreachable,
		     "unreachable inode:\n%s",
		     (bch2_inode_unpacked_to_text(&buf, &inode),
		      buf.buf)))
		ret = reattach_inode(trans, &inode);
fsck_err:
	printbuf_exit(&buf);
	return ret;
}
1419 
1420 /*
1421  * Reattach unreachable (but not unlinked) inodes
1422  *
 * Run after check_inodes() and check_dirents(), so we know that inode
1424  * backpointer fields point to valid dirents, and every inode that has a dirent
1425  * that points to it has its backpointer field set - so we're just looking for
1426  * non-unlinked inodes without backpointers:
1427  *
1428  * XXX: this is racy w.r.t. hardlink removal in online fsck
1429  */
int bch2_check_unreachable_inodes(struct bch_fs *c)
{
	/* Each inode is checked and reattached in its own transaction commit: */
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
				POS_MIN,
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_unreachable_inode(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}
1441 
btree_matches_i_mode(enum btree_id btree,unsigned mode)1442 static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode)
1443 {
1444 	switch (btree) {
1445 	case BTREE_ID_extents:
1446 		return S_ISREG(mode) || S_ISLNK(mode);
1447 	case BTREE_ID_dirents:
1448 		return S_ISDIR(mode);
1449 	case BTREE_ID_xattrs:
1450 		return true;
1451 	default:
1452 		BUG();
1453 	}
1454 }
1455 
/*
 * Every key in the extents/dirents/xattrs btrees must belong to an inode
 * with a matching mode.  @i is the walker entry looked up for @k (possibly
 * NULL or an ERR_PTR from the lookup).
 *
 * If the inode is missing: delete the keys when there are only a few or a
 * good inode exists in an older snapshot, otherwise reconstruct the inode.
 * If the inode's mode doesn't match the btree, delete the key.
 */
static int check_key_has_inode(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct inode_walker *inode,
			       struct inode_walker_entry *i,
			       struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	struct btree_iter iter2 = {};
	int ret = PTR_ERR_OR_ZERO(i);
	if (ret)
		return ret;

	if (k.k->type == KEY_TYPE_whiteout)
		goto out;

	bool have_inode = i && !i->whiteout;

	/* Inodes btree was lost: always reconstruct rather than delete */
	if (!have_inode && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes)))
		goto reconstruct;

	if (have_inode && btree_matches_i_mode(iter->btree_id, i->inode.bi_mode))
		goto out;

	/* Build up the fsck error message in @buf: */
	prt_printf(&buf, ", ");

	bool have_old_inode = false;
	darray_for_each(inode->inodes, i2)
		if (!i2->whiteout &&
		    bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i2->inode.bi_snapshot) &&
		    btree_matches_i_mode(iter->btree_id, i2->inode.bi_mode)) {
			prt_printf(&buf, "but found good inode in older snapshot\n");
			bch2_inode_unpacked_to_text(&buf, &i2->inode);
			prt_newline(&buf);
			have_old_inode = true;
			break;
		}

	struct bkey_s_c k2;
	unsigned nr_keys = 0;

	prt_printf(&buf, "found keys:\n");

	/* Count this inode's keys (print the first 10, stop counting at 100): */
	for_each_btree_key_max_norestart(trans, iter2, iter->btree_id,
					 SPOS(k.k->p.inode, 0, k.k->p.snapshot),
					 POS(k.k->p.inode, U64_MAX),
					 0, k2, ret) {
		nr_keys++;
		if (nr_keys <= 10) {
			bch2_bkey_val_to_text(&buf, c, k2);
			prt_newline(&buf);
		}
		if (nr_keys >= 100)
			break;
	}

	if (ret)
		goto err;

	if (nr_keys > 100)
		/*
		 * NOTE(review): unreachable - the loop above breaks once
		 * nr_keys reaches 100, so nr_keys never exceeds 100; was
		 * ">= 100" intended?  Confirm upstream.
		 */
		prt_printf(&buf, "found > %u keys for this missing inode\n", nr_keys);
	else if (nr_keys > 10)
		prt_printf(&buf, "found %u keys for this missing inode\n", nr_keys);

	if (!have_inode) {
		/* NOTE(review): inner !have_inode is redundant with the enclosing if */
		if (fsck_err_on(!have_inode,
				trans, key_in_missing_inode,
				"key in missing inode%s", buf.buf)) {
			/*
			 * Maybe a deletion that raced with data move, or something
			 * weird like that? But if we know the inode was deleted, or
			 * it's just a few keys, we can safely delete them.
			 *
			 * If it's many keys, we should probably recreate the inode
			 */
			if (have_old_inode || nr_keys <= 2)
				goto delete;
			else
				goto reconstruct;
		}
	} else {
		/*
		 * not autofix, this one would be a giant wtf - bit error in the
		 * inode corrupting i_mode?
		 *
		 * may want to try repairing inode instead of deleting
		 */
		if (fsck_err_on(!btree_matches_i_mode(iter->btree_id, i->inode.bi_mode),
				trans, key_in_wrong_inode_type,
				"key for wrong inode mode %o%s",
				i->inode.bi_mode, buf.buf))
			goto delete;
	}
out:
err:
fsck_err:
	bch2_trans_iter_exit(trans, &iter2);
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
delete:
	/*
	 * XXX: print out more info
	 * count up extents for this inode, check if we have different inode in
	 * an older snapshot version, perhaps decide if we want to reconstitute
	 */
	ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
	goto out;
reconstruct:
	ret =   reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?:
		bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
	if (ret)
		goto err;

	/* Force walk_inode() to re-read this inode number's versions: */
	inode->last_pos.inode--;
	ret = bch_err_throw(c, transaction_restart_nested);
	goto out;
}
1574 
/*
 * After walking all keys for an inode, verify each snapshot version's
 * bi_sectors against the extent sectors counted during the walk (i->count),
 * repairing mismatches unless i_sectors is flagged dirty.
 */
static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
{
	struct bch_fs *c = trans->c;
	int ret = 0;
	s64 count2;

	darray_for_each(w->inodes, i) {
		if (i->inode.bi_sectors == i->count)
			continue;

		/* Recount from the btree, in case the running count went stale: */
		count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->inode.bi_snapshot);

		/* Extents were repaired mid-walk - trust the fresh recount: */
		if (w->recalculate_sums)
			i->count = count2;

		if (i->count != count2) {
			/* Our running count disagreed with the recount - an fsck bug: */
			bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
					    w->last_pos.inode, i->inode.bi_snapshot, i->count, count2);
			i->count = count2;
		}

		/* i_sectors_dirty means bi_sectors is allowed to be stale: */
		if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
				trans, inode_i_sectors_wrong,
				"inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
				w->last_pos.inode, i->inode.bi_snapshot,
				i->inode.bi_sectors, i->count)) {
			i->inode.bi_sectors = i->count;
			ret = bch2_fsck_write_inode(trans, &i->inode);
			if (ret)
				break;
		}
	}
fsck_err:
	bch_err_fn(c, ret);
	return ret;
}
1611 
check_i_sectors(struct btree_trans * trans,struct inode_walker * w)1612 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
1613 {
1614 	u32 restart_count = trans->restart_count;
1615 	return check_i_sectors_notnested(trans, w) ?:
1616 		trans_was_restarted(trans, restart_count);
1617 }
1618 
/* End position of a previously seen extent, for overlap checking: */
struct extent_end {
	u32			snapshot;	/* snapshot the extent is in */
	u64			offset;		/* offset the extent ends at */
	struct snapshots_seen	seen;		/* copy of snapshots seen when recorded */
};

/* Extent ends for the inode currently being walked, sorted by snapshot: */
struct extent_ends {
	struct bpos			last_pos;	/* last extent processed; detects re-runs after a restart */
	DARRAY(struct extent_end)	e;
};
1629 
extent_ends_reset(struct extent_ends * extent_ends)1630 static void extent_ends_reset(struct extent_ends *extent_ends)
1631 {
1632 	darray_for_each(extent_ends->e, i)
1633 		snapshots_seen_exit(&i->seen);
1634 	extent_ends->e.nr = 0;
1635 }
1636 
/* Free everything owned by @extent_ends, including the darray itself */
static void extent_ends_exit(struct extent_ends *extent_ends)
{
	extent_ends_reset(extent_ends);
	darray_exit(&extent_ends->e);
}
1642 
/* Zero-initialize @extent_ends before first use */
static void extent_ends_init(struct extent_ends *extent_ends)
{
	memset(extent_ends, 0, sizeof(*extent_ends));
}
1647 
/*
 * Record the end of extent @k so later extents can be checked for overlap
 * against it.  Replaces any existing entry for the same snapshot; the list
 * stays sorted by snapshot ID.
 *
 * The caller's @seen list keeps changing, so take a private copy (duplicate
 * the full allocated capacity, keeping n.seen's .size field accurate).
 */
static int extent_ends_at(struct bch_fs *c,
			  struct extent_ends *extent_ends,
			  struct snapshots_seen *seen,
			  struct bkey_s_c k)
{
	struct extent_end *i, n = (struct extent_end) {
		.offset		= k.k->p.offset,
		.snapshot	= k.k->p.snapshot,
		.seen		= *seen,
	};

	n.seen.ids.data = kmemdup(seen->ids.data,
			      sizeof(seen->ids.data[0]) * seen->ids.size,
			      GFP_KERNEL);
	if (!n.seen.ids.data)
		return bch_err_throw(c, ENOMEM_fsck_extent_ends_at);

	__darray_for_each(extent_ends->e, i) {
		if (i->snapshot == k.k->p.snapshot) {
			/* Same snapshot: replace, freeing the old seen list */
			snapshots_seen_exit(&i->seen);
			*i = n;
			return 0;
		}

		if (i->snapshot >= k.k->p.snapshot)
			break;
	}

	/* Insert at @i to maintain snapshot ordering: */
	return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
}
1678 
/*
 * Two overlapping extents were detected: re-look both up with non-extent
 * iterators (so we see the true on-disk keys), print them, and - with user
 * confirmation - repair via bch2_trans_update_extent_overwrite(), choosing
 * which extent to overwrite based on their snapshot IDs.
 *
 * @pos1/@pos1_seen identify the previously recorded extent end; @pos2 is the
 * bkey of the extent found overlapping it.  *fixed is set when a repair was
 * done; @extent_end is trimmed if the first extent was overwritten in place.
 */
static int overlapping_extents_found(struct btree_trans *trans,
				     enum btree_id btree,
				     struct bpos pos1, struct snapshots_seen *pos1_seen,
				     struct bkey pos2,
				     bool *fixed,
				     struct extent_end *extent_end)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	struct btree_iter iter1, iter2 = {};
	struct bkey_s_c k1, k2;
	int ret;

	/* Caller guarantees an actual overlap: pos1's end is past pos2's start */
	BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));

	bch2_trans_iter_init(trans, &iter1, btree, pos1,
			     BTREE_ITER_all_snapshots|
			     BTREE_ITER_not_extents);
	k1 = bch2_btree_iter_peek_max(trans, &iter1, POS(pos1.inode, U64_MAX));
	ret = bkey_err(k1);
	if (ret)
		goto err;

	prt_newline(&buf);
	bch2_bkey_val_to_text(&buf, c, k1);

	/* The recorded extent end must still resolve to a real key: */
	if (!bpos_eq(pos1, k1.k->p)) {
		prt_str(&buf, "\nwanted\n  ");
		bch2_bpos_to_text(&buf, pos1);
		prt_str(&buf, "\n");
		bch2_bkey_to_text(&buf, &pos2);

		bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
			__func__, buf.buf);
		ret = bch_err_throw(c, internal_fsck_err);
		goto err;
	}

	bch2_trans_copy_iter(trans, &iter2, &iter1);

	/* Advance a second iterator to the key at/after pos2: */
	while (1) {
		bch2_btree_iter_advance(trans, &iter2);

		k2 = bch2_btree_iter_peek_max(trans, &iter2, POS(pos1.inode, U64_MAX));
		ret = bkey_err(k2);
		if (ret)
			goto err;

		if (bpos_ge(k2.k->p, pos2.p))
			break;
	}

	prt_newline(&buf);
	bch2_bkey_val_to_text(&buf, c, k2);

	if (bpos_gt(k2.k->p, pos2.p) ||
	    pos2.size != k2.k->size) {
		/* NOTE(review): "seconding" looks like a typo for "second" in this message */
		bch_err(c, "%s: error finding seconding overlapping extent when repairing%s",
			__func__, buf.buf);
		ret = bch_err_throw(c, internal_fsck_err);
		goto err;
	}

	prt_printf(&buf, "\noverwriting %s extent",
		   pos1.snapshot >= pos2.p.snapshot ? "first" : "second");

	if (fsck_err(trans, extent_overlapping,
		     "overlapping extents%s", buf.buf)) {
		struct btree_iter *old_iter = &iter1;
		struct disk_reservation res = { 0 };

		/* k1 should be the extent being kept, old_iter the one overwritten: */
		if (pos1.snapshot < pos2.p.snapshot) {
			old_iter = &iter2;
			swap(k1, k2);
		}

		trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);

		ret =   bch2_trans_update_extent_overwrite(trans, old_iter,
				BTREE_UPDATE_internal_snapshot_node,
				k1, k2) ?:
			bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
		bch2_disk_reservation_put(c, &res);

		bch_info(c, "repair ret %s", bch2_err_str(ret));

		if (ret)
			goto err;

		*fixed = true;

		if (pos1.snapshot == pos2.p.snapshot) {
			/*
			 * We overwrote the first extent, and did the overwrite
			 * in the same snapshot:
			 */
			extent_end->offset = bkey_start_offset(&pos2);
		} else if (pos1.snapshot > pos2.p.snapshot) {
			/*
			 * We overwrote the first extent in pos2's snapshot:
			 */
			ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
		} else {
			/*
			 * We overwrote the second extent - restart
			 * check_extent() from the top:
			 */
			ret = bch_err_throw(c, transaction_restart_nested);
		}
	}
fsck_err:
err:
	bch2_trans_iter_exit(trans, &iter2);
	bch2_trans_iter_exit(trans, &iter1);
	printbuf_exit(&buf);
	return ret;
}
1796 
/*
 * Check extent @k against the recorded ends of previously seen extents for
 * this inode: any recorded end past @k's start, in a snapshot that sees (or
 * is seen by) @k's, is an overlap to repair.  Sets *fixed if a repair was
 * done.  The caller records @k's own end afterwards via extent_ends_at().
 */
static int check_overlapping_extents(struct btree_trans *trans,
			      struct snapshots_seen *seen,
			      struct extent_ends *extent_ends,
			      struct bkey_s_c k,
			      struct btree_iter *iter,
			      bool *fixed)
{
	struct bch_fs *c = trans->c;
	int ret = 0;

	/* transaction restart, running again */
	if (bpos_eq(extent_ends->last_pos, k.k->p))
		return 0;

	/* New inode: previously recorded ends no longer apply */
	if (extent_ends->last_pos.inode != k.k->p.inode)
		extent_ends_reset(extent_ends);

	darray_for_each(extent_ends->e, i) {
		if (i->offset <= bkey_start_offset(k.k))
			continue;

		if (!ref_visible2(c,
				  k.k->p.snapshot, seen,
				  i->snapshot, &i->seen))
			continue;

		ret = overlapping_extents_found(trans, iter->btree_id,
						SPOS(iter->pos.inode,
						     i->offset,
						     i->snapshot),
						&i->seen,
						*k.k, fixed, i);
		if (ret)
			goto err;
	}

	extent_ends->last_pos = k.k->p;
err:
	return ret;
}
1837 
check_extent_overbig(struct btree_trans * trans,struct btree_iter * iter,struct bkey_s_c k)1838 static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
1839 				struct bkey_s_c k)
1840 {
1841 	struct bch_fs *c = trans->c;
1842 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1843 	struct bch_extent_crc_unpacked crc;
1844 	const union bch_extent_entry *i;
1845 	unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;
1846 
1847 	bkey_for_each_crc(k.k, ptrs, crc, i)
1848 		if (crc_is_encoded(crc) &&
1849 		    crc.uncompressed_size > encoded_extent_max_sectors) {
1850 			struct printbuf buf = PRINTBUF;
1851 
1852 			bch2_bkey_val_to_text(&buf, c, k);
1853 			bch_err(c, "overbig encoded extent, please report this:\n  %s", buf.buf);
1854 			printbuf_exit(&buf);
1855 		}
1856 
1857 	return 0;
1858 }
1859 
/*
 * Check a single extent key: verify it has a valid snapshot and an owning
 * inode, detect overlaps with previously seen extents, whiteout any portion
 * that lies past the owning inode's i_size, and accumulate per-inode sector
 * counts for the later i_sectors consistency check.
 *
 * @inode, @s and @extent_ends carry state across calls; the caller iterates
 * BTREE_ID_extents in BTREE_ITER_all_snapshots mode.
 */
static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
			struct bkey_s_c k,
			struct inode_walker *inode,
			struct snapshots_seen *s,
			struct extent_ends *extent_ends,
			struct disk_reservation *res)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret) {
		/*
		 * NOTE(review): a positive return appears to mean the key was
		 * handled (deleted) — not an error; confirm against
		 * bch2_check_key_has_snapshot()
		 */
		ret = ret < 0 ? ret : 0;
		goto out;
	}

	/* Crossed into a new inode number: flush counts for the previous one */
	if (inode->last_pos.inode != k.k->p.inode && inode->have_inodes) {
		ret = check_i_sectors(trans, inode);
		if (ret)
			goto err;
	}

	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
	if (ret)
		goto err;

	/* Inode entry matching this extent's snapshot, or NULL */
	struct inode_walker_entry *extent_i = walk_inode(trans, inode, k);
	ret = PTR_ERR_OR_ZERO(extent_i);
	if (ret)
		goto err;

	ret = check_key_has_inode(trans, iter, inode, extent_i, k);
	if (ret)
		goto err;

	if (k.k->type != KEY_TYPE_whiteout) {
		ret = check_overlapping_extents(trans, s, extent_ends, k, iter,
						&inode->recalculate_sums);
		if (ret)
			goto err;

		/*
		 * Check inodes in reverse order, from oldest snapshots to
		 * newest, starting from the inode that matches this extent's
		 * snapshot. If we didn't have one, iterate over all inodes:
		 */
		for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
		     inode->inodes.data && i >= inode->inodes.data;
		     --i) {
			/* skip inodes whose snapshot can't see this extent */
			if (i->inode.bi_snapshot > k.k->p.snapshot ||
			    !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot))
				continue;

			/* first block (in 512-byte sectors) past i_size */
			u64 last_block = round_up(i->inode.bi_size, block_bytes(c)) >> 9;

			if (fsck_err_on(k.k->p.offset > last_block &&
					!bkey_extent_is_reservation(k),
					trans, extent_past_end_of_inode,
					"extent type past end of inode %llu:%u, i_size %llu\n%s",
					i->inode.bi_inum, i->inode.bi_snapshot, i->inode.bi_size,
					(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
				/*
				 * Repair: emit a maximal whiteout covering
				 * everything from last_block onward in this
				 * inode's snapshot.
				 */
				struct bkey_i *whiteout = bch2_trans_kmalloc(trans, sizeof(*whiteout));
				ret = PTR_ERR_OR_ZERO(whiteout);
				if (ret)
					goto err;

				bkey_init(&whiteout->k);
				whiteout->k.p = SPOS(k.k->p.inode,
						     last_block,
						     i->inode.bi_snapshot);
				bch2_key_resize(&whiteout->k,
						min(KEY_SIZE_MAX & (~0 << c->block_bits),
						    U64_MAX - whiteout->k.p.offset));


				/*
				 * Need a normal (not BTREE_ITER_all_snapshots)
				 * iterator, if we're deleting in a different
				 * snapshot and need to emit a whiteout
				 */
				struct btree_iter iter2;
				bch2_trans_iter_init(trans, &iter2, BTREE_ID_extents,
						     bkey_start_pos(&whiteout->k),
						     BTREE_ITER_intent);
				ret =   bch2_btree_iter_traverse(trans, &iter2) ?:
					bch2_trans_update(trans, &iter2, whiteout,
						BTREE_UPDATE_internal_snapshot_node);
				bch2_trans_iter_exit(trans, &iter2);
				if (ret)
					goto err;

				/* don't count the whiteouted extent below */
				iter->k.type = KEY_TYPE_whiteout;
				break;
			}
		}
	}

	ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc);
	if (ret)
		goto err;

	/* Accumulate sectors into every inode version that can see this extent */
	if (bkey_extent_is_allocation(k.k)) {
		for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
		     inode->inodes.data && i >= inode->inodes.data;
		     --i) {
			if (i->whiteout ||
			    i->inode.bi_snapshot > k.k->p.snapshot ||
			    !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot))
				continue;

			i->count += k.k->size;
		}
	}

	/* Record this extent's end for the next overlap check */
	if (k.k->type != KEY_TYPE_whiteout) {
		ret = extent_ends_at(c, extent_ends, s, k);
		if (ret)
			goto err;
	}
out:
err:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}
1987 
/*
 * Walk extents: verify that extents have a corresponding S_ISREG inode, and
 * that i_size and i_sectors are consistent
 */
int bch2_check_extents(struct bch_fs *c)
{
	struct inode_walker w = inode_walker_init();
	struct snapshots_seen s;
	struct extent_ends extent_ends;
	struct disk_reservation res = { 0 };

	snapshots_seen_init(&s);
	extent_ends_init(&extent_ends);

	/*
	 * Start at BCACHEFS_ROOT_INO; the reservation is dropped before each
	 * key so check_extent() can take its own.
	 */
	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_extents,
				POS(BCACHEFS_ROOT_INO, 0),
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
			bch2_disk_reservation_put(c, &res);
			check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?:
			check_extent_overbig(trans, &iter, k);
		})) ?:
		check_i_sectors_notnested(trans, &w));	/* flush counts for the last inode */

	bch2_disk_reservation_put(c, &res);
	extent_ends_exit(&extent_ends);
	inode_walker_exit(&w);
	snapshots_seen_exit(&s);

	bch_err_fn(c, ret);
	return ret;
}
2020 
/*
 * Walk the reflink btree checking only for overbig encoded extents; indirect
 * extents have no owning inode, so none of the other extent checks apply.
 */
int bch2_check_indirect_extents(struct bch_fs *c)
{
	struct disk_reservation res = { 0 };

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
				POS_MIN,
				BTREE_ITER_prefetch, k,
				&res, NULL,
				BCH_TRANS_COMMIT_no_enospc, ({
			bch2_disk_reservation_put(c, &res);
			check_extent_overbig(trans, &iter, k);
		})));

	bch2_disk_reservation_put(c, &res);
	bch_err_fn(c, ret);
	return ret;
}
2039 
/*
 * Verify i_nlink for each directory inode version accumulated in @w against
 * the subdirectory count gathered while walking dirents; cross-checks the
 * walked count against bch2_count_subdirs() and repairs i_nlink on mismatch.
 */
static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w)
{
	struct bch_fs *c = trans->c;
	int ret = 0;
	s64 count2;

	darray_for_each(w->inodes, i) {
		if (i->inode.bi_nlink == i->count)
			continue;

		/* Re-count from the btree in case the walked count is stale */
		count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->inode.bi_snapshot);
		if (count2 < 0)
			return count2;

		if (i->count != count2) {
			bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
					    w->last_pos.inode, i->inode.bi_snapshot, i->count, count2);
			i->count = count2;
			if (i->inode.bi_nlink == i->count)
				continue;
		}

		if (i->inode.bi_nlink != i->count) {
			CLASS(printbuf, buf)();

			lockrestart_do(trans,
				       bch2_inum_snapshot_to_path(trans, w->last_pos.inode,
								  i->inode.bi_snapshot, NULL, &buf));

			/*
			 * NOTE(review): the condition repeats the enclosing
			 * if; it is redundant but harmless, kept so the fsck
			 * error counter/ratelimiting machinery sees it
			 */
			if (fsck_err_on(i->inode.bi_nlink != i->count,
					trans, inode_dir_wrong_nlink,
					"directory with wrong i_nlink: got %u, should be %llu\n%s",
					i->inode.bi_nlink, i->count, buf.buf)) {
				i->inode.bi_nlink = i->count;
				ret = bch2_fsck_write_inode(trans, &i->inode);
				if (ret)
					break;
			}
		}
	}
fsck_err:
	bch_err_fn(c, ret);
	return ret;
}
2084 
check_subdir_dirents_count(struct btree_trans * trans,struct inode_walker * w)2085 static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_walker *w)
2086 {
2087 	u32 restart_count = trans->restart_count;
2088 	return check_subdir_count_notnested(trans, w) ?:
2089 		trans_was_restarted(trans, restart_count);
2090 }
2091 
2092 /* find a subvolume that's a descendent of @snapshot: */
find_snapshot_subvol(struct btree_trans * trans,u32 snapshot,u32 * subvolid)2093 static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
2094 {
2095 	struct btree_iter iter;
2096 	struct bkey_s_c k;
2097 	int ret;
2098 
2099 	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) {
2100 		if (k.k->type != KEY_TYPE_subvolume)
2101 			continue;
2102 
2103 		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
2104 		if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) {
2105 			bch2_trans_iter_exit(trans, &iter);
2106 			*subvolid = k.k->p.offset;
2107 			goto found;
2108 		}
2109 	}
2110 	if (!ret)
2111 		ret = -ENOENT;
2112 found:
2113 	bch2_trans_iter_exit(trans, &iter);
2114 	return ret;
2115 }
2116 
2117 noinline_for_stack
check_dirent_to_subvol(struct btree_trans * trans,struct btree_iter * iter,struct bkey_s_c_dirent d)2118 static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
2119 				  struct bkey_s_c_dirent d)
2120 {
2121 	struct bch_fs *c = trans->c;
2122 	struct btree_iter subvol_iter = {};
2123 	struct bch_inode_unpacked subvol_root;
2124 	u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
2125 	u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
2126 	u32 parent_snapshot;
2127 	u32 new_parent_subvol = 0;
2128 	u64 parent_inum;
2129 	struct printbuf buf = PRINTBUF;
2130 	int ret = 0;
2131 
2132 	ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum);
2133 	if (ret && !bch2_err_matches(ret, ENOENT))
2134 		return ret;
2135 
2136 	if (ret ||
2137 	    (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) {
2138 		int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
2139 		if (ret2 && !bch2_err_matches(ret, ENOENT))
2140 			return ret2;
2141 	}
2142 
2143 	if (ret &&
2144 	    !new_parent_subvol &&
2145 	    (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
2146 		/*
2147 		 * Couldn't find a subvol for dirent's snapshot - but we lost
2148 		 * subvols, so we need to reconstruct:
2149 		 */
2150 		ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0);
2151 		if (ret)
2152 			return ret;
2153 
2154 		parent_snapshot = d.k->p.snapshot;
2155 	}
2156 
2157 	if (fsck_err_on(ret,
2158 			trans, dirent_to_missing_parent_subvol,
2159 			"dirent parent_subvol points to missing subvolume\n%s",
2160 			(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
2161 	    fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
2162 			trans, dirent_not_visible_in_parent_subvol,
2163 			"dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
2164 			parent_snapshot,
2165 			(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
2166 		if (!new_parent_subvol) {
2167 			bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
2168 			return bch_err_throw(c, fsck_repair_unimplemented);
2169 		}
2170 
2171 		struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
2172 		ret = PTR_ERR_OR_ZERO(new_dirent);
2173 		if (ret)
2174 			goto err;
2175 
2176 		new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol);
2177 	}
2178 
2179 	struct bkey_s_c_subvolume s =
2180 		bch2_bkey_get_iter_typed(trans, &subvol_iter,
2181 					 BTREE_ID_subvolumes, POS(0, target_subvol),
2182 					 0, subvolume);
2183 	ret = bkey_err(s.s_c);
2184 	if (ret && !bch2_err_matches(ret, ENOENT))
2185 		goto err;
2186 
2187 	if (ret) {
2188 		if (fsck_err(trans, dirent_to_missing_subvol,
2189 			     "dirent points to missing subvolume\n%s",
2190 			     (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
2191 			return bch2_fsck_remove_dirent(trans, d.k->p);
2192 		ret = 0;
2193 		goto out;
2194 	}
2195 
2196 	if (le32_to_cpu(s.v->fs_path_parent) != parent_subvol) {
2197 		printbuf_reset(&buf);
2198 
2199 		prt_printf(&buf, "subvol with wrong fs_path_parent, should be be %u\n",
2200 			   parent_subvol);
2201 
2202 		ret = bch2_inum_to_path(trans, (subvol_inum) { s.k->p.offset,
2203 					le64_to_cpu(s.v->inode) }, &buf);
2204 		if (ret)
2205 			goto err;
2206 		prt_newline(&buf);
2207 		bch2_bkey_val_to_text(&buf, c, s.s_c);
2208 
2209 		if (fsck_err(trans, subvol_fs_path_parent_wrong, "%s", buf.buf)) {
2210 			struct bkey_i_subvolume *n =
2211 				bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
2212 			ret = PTR_ERR_OR_ZERO(n);
2213 			if (ret)
2214 				goto err;
2215 
2216 			n->v.fs_path_parent = cpu_to_le32(parent_subvol);
2217 		}
2218 	}
2219 
2220 	u64 target_inum = le64_to_cpu(s.v->inode);
2221 	u32 target_snapshot = le32_to_cpu(s.v->snapshot);
2222 
2223 	ret = bch2_inode_find_by_inum_snapshot(trans, target_inum, target_snapshot,
2224 					       &subvol_root, 0);
2225 	if (ret && !bch2_err_matches(ret, ENOENT))
2226 		goto err;
2227 
2228 	if (ret) {
2229 		bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
2230 		ret = bch_err_throw(c, fsck_repair_unimplemented);
2231 		goto err;
2232 	}
2233 
2234 	if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol,
2235 			trans, inode_bi_parent_wrong,
2236 			"subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
2237 			target_inum,
2238 			subvol_root.bi_parent_subvol, parent_subvol)) {
2239 		subvol_root.bi_parent_subvol = parent_subvol;
2240 		subvol_root.bi_snapshot = le32_to_cpu(s.v->snapshot);
2241 		ret = __bch2_fsck_write_inode(trans, &subvol_root);
2242 		if (ret)
2243 			goto err;
2244 	}
2245 
2246 	ret = bch2_check_dirent_target(trans, iter, d, &subvol_root, true);
2247 	if (ret)
2248 		goto err;
2249 out:
2250 err:
2251 fsck_err:
2252 	bch2_trans_iter_exit(trans, &subvol_iter);
2253 	printbuf_exit(&buf);
2254 	return ret;
2255 }
2256 
/*
 * Check a single dirent key: validate its snapshot and owning directory
 * inode, check/repair its hash, casefold flag, and target (inode or
 * subvolume), and accumulate subdirectory/dirent counts into @dir for the
 * later i_nlink check.
 *
 * @hash_info is (re)initialized per directory inode; @need_second_pass is set
 * by the hash-repair machinery when a full re-scan is required.
 */
static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
			struct bkey_s_c k,
			struct bch_hash_info *hash_info,
			struct inode_walker *dir,
			struct inode_walker *target,
			struct snapshots_seen *s,
			bool *need_second_pass)
{
	struct bch_fs *c = trans->c;
	struct inode_walker_entry *i;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret) {
		/* positive return: key was handled/deleted, not an error */
		ret = ret < 0 ? ret : 0;
		goto out;
	}

	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
	if (ret)
		goto err;

	if (k.k->type == KEY_TYPE_whiteout)
		goto out;

	/* Crossed into a new directory: flush counts for the previous one */
	if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) {
		ret = check_subdir_dirents_count(trans, dir);
		if (ret)
			goto err;
	}

	i = walk_inode(trans, dir, k);
	ret = PTR_ERR_OR_ZERO(i);
	if (ret < 0)
		goto err;

	ret = check_key_has_inode(trans, iter, dir, i, k);
	if (ret)
		goto err;

	if (!i || i->whiteout)
		goto out;

	/* hash seed/type come from the directory inode */
	if (dir->first_this_inode)
		*hash_info = bch2_hash_info_init(c, &i->inode);
	dir->first_this_inode = false;

#ifdef CONFIG_UNICODE
	hash_info->cf_encoding = bch2_inode_casefold(c, &i->inode) ? c->cf_encoding : NULL;
#endif

	ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info,
				      iter, k, need_second_pass);
	if (ret < 0)
		goto err;
	if (ret) {
		/* dirent has been deleted */
		ret = 0;
		goto out;
	}

	if (k.k->type != KEY_TYPE_dirent)
		goto out;

	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);

	/* check casefold */
	if (fsck_err_on(d.v->d_casefold != !!hash_info->cf_encoding,
			trans, dirent_casefold_mismatch,
			"dirent casefold does not match dir casefold\n%s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k),
			 buf.buf))) {
		/*
		 * Repair: recreate the dirent with the directory's casefold
		 * setting, then delete the old one and insert the new at its
		 * correctly hashed position.
		 */
		subvol_inum dir_inum = { .subvol = d.v->d_type == DT_SUBVOL
				? le32_to_cpu(d.v->d_parent_subvol)
				: 0,
		};
		u64 target = d.v->d_type == DT_SUBVOL
			? le32_to_cpu(d.v->d_child_subvol)
			: le64_to_cpu(d.v->d_inum);
		struct qstr name = bch2_dirent_get_name(d);

		struct bkey_i_dirent *new_d =
			bch2_dirent_create_key(trans, hash_info, dir_inum,
					       d.v->d_type, &name, NULL, target);
		ret = PTR_ERR_OR_ZERO(new_d);
		if (ret)
			goto out;

		new_d->k.p.inode	= d.k->p.inode;
		new_d->k.p.snapshot	= d.k->p.snapshot;

		struct btree_iter dup_iter = {};
		ret =	bch2_hash_delete_at(trans,
					    bch2_dirent_hash_desc, hash_info, iter,
					    BTREE_UPDATE_internal_snapshot_node) ?:
			bch2_str_hash_repair_key(trans, s,
						 &bch2_dirent_hash_desc, hash_info,
						 iter, bkey_i_to_s_c(&new_d->k_i),
						 &dup_iter, bkey_s_c_null,
						 need_second_pass);
		goto out;
	}

	if (d.v->d_type == DT_SUBVOL) {
		ret = check_dirent_to_subvol(trans, iter, d);
		if (ret)
			goto err;
	} else {
		/* collect all inode versions visible from this dirent */
		ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
		if (ret)
			goto err;

		if (fsck_err_on(!target->inodes.nr,
				trans, dirent_to_missing_inode,
				"dirent points to missing inode:\n%s",
				(printbuf_reset(&buf),
				 bch2_bkey_val_to_text(&buf, c, k),
				 buf.buf))) {
			ret = bch2_fsck_remove_dirent(trans, d.k->p);
			if (ret)
				goto err;
		}

		darray_for_each(target->inodes, i) {
			ret = bch2_check_dirent_target(trans, iter, d, &i->inode, true);
			if (ret)
				goto err;
		}

		/* dirents pointing at inodes deleted in a descendant snapshot */
		darray_for_each(target->deletes, i)
			if (fsck_err_on(!snapshot_list_has_id(&s->ids, *i),
					trans, dirent_to_overwritten_inode,
					"dirent points to inode overwritten in snapshot %u:\n%s",
					*i,
					(printbuf_reset(&buf),
					 bch2_bkey_val_to_text(&buf, c, k),
					 buf.buf))) {
				struct btree_iter delete_iter;
				bch2_trans_iter_init(trans, &delete_iter,
						     BTREE_ID_dirents,
						     SPOS(k.k->p.inode, k.k->p.offset, *i),
						     BTREE_ITER_intent);
				ret =   bch2_btree_iter_traverse(trans, &delete_iter) ?:
					bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
							  hash_info,
							  &delete_iter,
							  BTREE_UPDATE_internal_snapshot_node);
				bch2_trans_iter_exit(trans, &delete_iter);
				if (ret)
					goto err;

			}
	}

	ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
	if (ret)
		goto err;

	/* accumulate subdir and size counts for the i_nlink check */
	for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) {
		if (d.v->d_type == DT_DIR)
			i->count++;
		i->i_size += bkey_bytes(d.k);
	}
out:
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}
2428 
/*
 * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
 * validate d_type
 */
int bch2_check_dirents(struct bch_fs *c)
{
	struct inode_walker dir = inode_walker_init();
	struct inode_walker target = inode_walker_init();
	struct snapshots_seen s;
	struct bch_hash_info hash_info;
	bool need_second_pass = false, did_second_pass = false;
	int ret;

	snapshots_seen_init(&s);
again:
	ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
				POS(BCACHEFS_ROOT_INO, 0),
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s,
				     &need_second_pass)) ?:
		check_subdir_count_notnested(trans, &dir));

	/*
	 * Hash repair may have moved keys behind the iterator; do at most one
	 * extra full pass to pick those up.
	 */
	if (!ret && need_second_pass && !did_second_pass) {
		bch_info(c, "check_dirents requires second pass");
		swap(did_second_pass, need_second_pass);
		goto again;
	}

	/* still not converged after the second pass: give up */
	if (!ret && need_second_pass) {
		bch_err(c, "dirents not repairing");
		ret = -EINVAL;
	}

	snapshots_seen_exit(&s);
	inode_walker_exit(&dir);
	inode_walker_exit(&target);
	bch_err_fn(c, ret);
	return ret;
}
2470 
/*
 * Check a single xattr key: validate its snapshot, verify it belongs to an
 * existing inode, and check/repair its hash against the owning inode's hash
 * seed.
 */
static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
		       struct bkey_s_c k,
		       struct bch_hash_info *hash_info,
		       struct inode_walker *inode)
{
	struct bch_fs *c = trans->c;

	int ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret < 0)
		return ret;
	if (ret)
		return 0;	/* positive return: key was handled/deleted */

	struct inode_walker_entry *i = walk_inode(trans, inode, k);
	ret = PTR_ERR_OR_ZERO(i);
	if (ret)
		return ret;

	ret = check_key_has_inode(trans, iter, inode, i, k);
	if (ret)
		return ret;

	if (!i || i->whiteout)
		return 0;

	/* hash seed/type come from the owning inode */
	if (inode->first_this_inode)
		*hash_info = bch2_hash_info_init(c, &i->inode);
	inode->first_this_inode = false;

	/*
	 * Unlike dirents, xattr hash repair never needs a second pass, so the
	 * flag is local and discarded.
	 */
	bool need_second_pass = false;
	return bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info,
				      iter, k, &need_second_pass);
}
2504 
/*
 * Walk xattrs: verify that they all have a corresponding inode
 */
int bch2_check_xattrs(struct bch_fs *c)
{
	struct inode_walker inode = inode_walker_init();
	struct bch_hash_info hash_info;
	int ret = 0;

	ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
			POS(BCACHEFS_ROOT_INO, 0),
			BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
			k,
			NULL, NULL,
			BCH_TRANS_COMMIT_no_enospc,
		check_xattr(trans, &iter, k, &hash_info, &inode)));

	inode_walker_exit(&inode);
	bch_err_fn(c, ret);
	return ret;
}
2527 
/*
 * Verify (and if necessary recreate) the root subvolume and the root
 * directory inode; runs inside a single transaction commit.
 */
static int check_root_trans(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct bch_inode_unpacked root_inode;
	u32 snapshot;
	u64 inum;
	int ret;

	ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (mustfix_fsck_err_on(ret, trans, root_subvol_missing,
				"root subvol missing")) {
		struct bkey_i_subvolume *root_subvol =
			bch2_trans_kmalloc(trans, sizeof(*root_subvol));
		ret = PTR_ERR_OR_ZERO(root_subvol);
		if (ret)
			goto err;

		/* recreate with well-known defaults */
		snapshot	= U32_MAX;
		inum		= BCACHEFS_ROOT_INO;

		bkey_subvolume_init(&root_subvol->k_i);
		root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL;
		root_subvol->v.flags	= 0;
		root_subvol->v.snapshot	= cpu_to_le32(snapshot);
		root_subvol->v.inode	= cpu_to_le64(inum);
		ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0);
		bch_err_msg(c, ret, "writing root subvol");
		if (ret)
			goto err;
	}

	ret = bch2_inode_find_by_inum_snapshot(trans, BCACHEFS_ROOT_INO, snapshot,
					       &root_inode, 0);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (mustfix_fsck_err_on(ret,
				trans, root_dir_missing,
				"root directory missing") ||
	    mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
				trans, root_inode_not_dir,
				"root inode not a directory")) {
		/* recreate the root dir as an empty mode-0755 directory */
		bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
				0, NULL);
		root_inode.bi_inum = inum;
		root_inode.bi_snapshot = snapshot;

		ret = __bch2_fsck_write_inode(trans, &root_inode);
		bch_err_msg(c, ret, "writing root inode");
	}
err:
fsck_err:
	return ret;
}
2585 
/* Get root directory, create if it doesn't exist: */
int bch2_check_root(struct bch_fs *c)
{
	int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		check_root_trans(trans));
	bch_err_fn(c, ret);
	return ret;
}
2594 
darray_u32_has(darray_u32 * d,u32 v)2595 static bool darray_u32_has(darray_u32 *d, u32 v)
2596 {
2597 	darray_for_each(*d, i)
2598 		if (*i == v)
2599 			return true;
2600 	return false;
2601 }
2602 
/*
 * Walk from subvolume @k up its fs_path_parent chain to the root subvolume,
 * detecting loops and unreachable subvolumes and reattaching them if the
 * user confirms.
 */
static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct btree_iter parent_iter = {};
	darray_u32 subvol_path = {};	/* subvolids seen so far, for loop detection */
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (k.k->type != KEY_TYPE_subvolume)
		return 0;

	subvol_inum start = {
		.subvol = k.k->p.offset,
		.inum	= le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode),
	};

	while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) {
		ret = darray_push(&subvol_path, k.k->p.offset);
		if (ret)
			goto err;

		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);

		/* verify the subvolume's root inode exists */
		struct bch_inode_unpacked subvol_root;
		ret = bch2_inode_find_by_inum_trans(trans,
					(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
					&subvol_root);
		if (ret)
			break;

		u32 parent = le32_to_cpu(s.v->fs_path_parent);

		/* already visited this parent: fs_path_parent chain loops */
		if (darray_u32_has(&subvol_path, parent)) {
			printbuf_reset(&buf);
			prt_printf(&buf, "subvolume loop: ");

			ret = bch2_inum_to_path(trans, start, &buf);
			if (ret)
				goto err;

			if (fsck_err(trans, subvol_loop, "%s", buf.buf))
				ret = reattach_subvol(trans, s);
			break;
		}

		/* step up to the parent subvolume */
		bch2_trans_iter_exit(trans, &parent_iter);
		bch2_trans_iter_init(trans, &parent_iter,
				     BTREE_ID_subvolumes, POS(0, parent), 0);
		k = bch2_btree_iter_peek_slot(trans, &parent_iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
				trans, subvol_unreachable,
				"unreachable subvolume %s",
				(printbuf_reset(&buf),
				 bch2_bkey_val_to_text(&buf, c, s.s_c),
				 buf.buf))) {
			ret = reattach_subvol(trans, s);
			break;
		}
	}
fsck_err:
err:
	printbuf_exit(&buf);
	darray_exit(&subvol_path);
	bch2_trans_iter_exit(trans, &parent_iter);
	return ret;
}
2673 
/* Walk all subvolumes checking their fs_path_parent chains (loops, reachability). */
int bch2_check_subvolume_structure(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
				BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_subvol_path(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}
2684 
/*
 * Set bi_depth of inode @inum:@snapshot to @new_depth, committing
 * immediately; no-op if it already matches.
 */
static int bch2_bi_depth_renumber_one(struct btree_trans *trans,
				      u64 inum, u32 snapshot,
				      u32 new_depth)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
					       SPOS(0, inum, snapshot), 0);

	struct bch_inode_unpacked inode;
	int ret = bkey_err(k) ?:
		!bkey_is_inode(k.k) ? -BCH_ERR_ENOENT_inode
		: bch2_inode_unpack(k, &inode);
	if (ret)
		goto err;

	if (inode.bi_depth != new_depth) {
		inode.bi_depth = new_depth;
		ret = __bch2_fsck_write_inode(trans, &inode) ?:
			bch2_trans_commit(trans, NULL, NULL, 0);
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
2709 
/*
 * Renumber bi_depth along @path (a chain of inode numbers collected walking
 * towards the root): the deepest-towards-root entry gets @new_bi_depth, each
 * subsequent one a depth one greater.
 */
static int bch2_bi_depth_renumber(struct btree_trans *trans, darray_u64 *path,
				  u32 snapshot, u32 new_bi_depth)
{
	u32 restart_count = trans->restart_count;
	int ret = 0;

	/* path was pushed leaf-to-root, so walk it in reverse */
	darray_for_each_reverse(*path, i) {
		ret = nested_lockrestart_do(trans,
				bch2_bi_depth_renumber_one(trans, *i, snapshot, new_bi_depth));
		bch_err_fn(trans->c, ret);
		if (ret)
			break;

		new_bi_depth++;
	}

	/* report restarts so the caller can re-iterate if needed */
	return ret ?: trans_was_restarted(trans, restart_count);
}
2728 
check_path_loop(struct btree_trans * trans,struct bkey_s_c inode_k)2729 static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k)
2730 {
2731 	struct bch_fs *c = trans->c;
2732 	struct btree_iter inode_iter = {};
2733 	darray_u64 path = {};
2734 	struct printbuf buf = PRINTBUF;
2735 	u32 snapshot = inode_k.k->p.snapshot;
2736 	bool redo_bi_depth = false;
2737 	u32 min_bi_depth = U32_MAX;
2738 	int ret = 0;
2739 
2740 	struct bpos start = inode_k.k->p;
2741 
2742 	struct bch_inode_unpacked inode;
2743 	ret = bch2_inode_unpack(inode_k, &inode);
2744 	if (ret)
2745 		return ret;
2746 
2747 	/*
2748 	 * If we're running full fsck, check_dirents() will have already ran,
2749 	 * and we shouldn't see any missing backpointers here - otherwise that's
2750 	 * handled separately, by check_unreachable_inodes
2751 	 */
2752 	while (!inode.bi_subvol &&
2753 	       bch2_inode_has_backpointer(&inode)) {
2754 		struct btree_iter dirent_iter;
2755 		struct bkey_s_c_dirent d;
2756 
2757 		d = dirent_get_by_pos(trans, &dirent_iter,
2758 				      SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot));
2759 		ret = bkey_err(d.s_c);
2760 		if (ret && !bch2_err_matches(ret, ENOENT))
2761 			goto out;
2762 
2763 		if (!ret && (ret = dirent_points_to_inode(c, d, &inode)))
2764 			bch2_trans_iter_exit(trans, &dirent_iter);
2765 
2766 		if (bch2_err_matches(ret, ENOENT)) {
2767 			printbuf_reset(&buf);
2768 			bch2_bkey_val_to_text(&buf, c, inode_k);
2769 			bch_err(c, "unreachable inode in check_directory_structure: %s\n%s",
2770 				bch2_err_str(ret), buf.buf);
2771 			goto out;
2772 		}
2773 
2774 		bch2_trans_iter_exit(trans, &dirent_iter);
2775 
2776 		ret = darray_push(&path, inode.bi_inum);
2777 		if (ret)
2778 			return ret;
2779 
2780 		bch2_trans_iter_exit(trans, &inode_iter);
2781 		inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
2782 					     SPOS(0, inode.bi_dir, snapshot), 0);
2783 
2784 		struct bch_inode_unpacked parent_inode;
2785 		ret = bkey_err(inode_k) ?:
2786 			!bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
2787 			: bch2_inode_unpack(inode_k, &parent_inode);
2788 		if (ret) {
2789 			/* Should have been caught in dirents pass */
2790 			bch_err_msg(c, ret, "error looking up parent directory");
2791 			goto out;
2792 		}
2793 
2794 		min_bi_depth = parent_inode.bi_depth;
2795 
2796 		if (parent_inode.bi_depth < inode.bi_depth &&
2797 		    min_bi_depth < U16_MAX)
2798 			break;
2799 
2800 		inode = parent_inode;
2801 		redo_bi_depth = true;
2802 
2803 		if (darray_find(path, inode.bi_inum)) {
2804 			printbuf_reset(&buf);
2805 			prt_printf(&buf, "directory structure loop in snapshot %u: ",
2806 				   snapshot);
2807 
2808 			ret = bch2_inum_snapshot_to_path(trans, start.offset, start.snapshot, NULL, &buf);
2809 			if (ret)
2810 				goto out;
2811 
2812 			if (c->opts.verbose) {
2813 				prt_newline(&buf);
2814 				darray_for_each(path, i)
2815 					prt_printf(&buf, "%llu ", *i);
2816 			}
2817 
2818 			if (fsck_err(trans, dir_loop, "%s", buf.buf)) {
2819 				ret = remove_backpointer(trans, &inode);
2820 				bch_err_msg(c, ret, "removing dirent");
2821 				if (ret)
2822 					break;
2823 
2824 				ret = reattach_inode(trans, &inode);
2825 				bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
2826 			}
2827 
2828 			goto out;
2829 		}
2830 	}
2831 
2832 	if (inode.bi_subvol)
2833 		min_bi_depth = 0;
2834 
2835 	if (redo_bi_depth)
2836 		ret = bch2_bi_depth_renumber(trans, &path, snapshot, min_bi_depth);
2837 out:
2838 fsck_err:
2839 	bch2_trans_iter_exit(trans, &inode_iter);
2840 	darray_exit(&path);
2841 	printbuf_exit(&buf);
2842 	bch_err_fn(c, ret);
2843 	return ret;
2844 }
2845 
/*
 * Check for loops in the directory structure: all other connectivity issues
 * have been fixed by prior passes
 */
int bch2_check_directory_structure(struct bch_fs *c)
{
	/*
	 * Reverse iteration: walking newest inodes first; only directories
	 * that are not unlinked need checking.
	 */
	int ret = bch2_trans_run(c,
		for_each_btree_key_reverse_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
					  BTREE_ITER_intent|
					  BTREE_ITER_prefetch|
					  BTREE_ITER_all_snapshots, k,
					  NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
			if (!S_ISDIR(bkey_inode_mode(k)))
				continue;

			if (bch2_inode_flags(k) & BCH_INODE_unlinked)
				continue;

			check_path_loop(trans, k);
		})));

	bch_err_fn(c, ret);
	return ret;
}
2870 
/*
 * Growable array of (inum, snapshot) entries used by the nlink check;
 * kept sorted by inum so inc_link() can bsearch it.
 */
struct nlink_table {
	size_t		nr;	/* entries in use */
	size_t		size;	/* allocated capacity */

	struct nlink {
		u64	inum;
		u32	snapshot;
		u32	count;	/* links counted while walking dirents */
	}		*d;
};
2881 
/*
 * Append an (inum, snapshot) entry to the nlink table, growing the backing
 * array as needed.  The new entry's count starts at zero.
 *
 * Returns 0 on success, -BCH_ERR_ENOMEM_fsck_add_nlink on allocation failure.
 */
static int add_nlink(struct bch_fs *c, struct nlink_table *t,
		     u64 inum, u32 snapshot)
{
	/* Grow by doubling, with a floor of 128 entries: */
	if (t->nr == t->size) {
		size_t new_size = max_t(size_t, 128UL, t->size * 2);
		struct nlink *new_d = kvmalloc_array(new_size, sizeof(*new_d),
						     GFP_KERNEL);

		if (!new_d) {
			bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
				new_size);
			return bch_err_throw(c, ENOMEM_fsck_add_nlink);
		}

		if (t->d)
			memcpy(new_d, t->d, t->size * sizeof(*new_d));
		kvfree(t->d);

		t->d	= new_d;
		t->size	= new_size;
	}

	/* Designated initializer zeroes .count: */
	t->d[t->nr++] = (struct nlink) {
		.inum		= inum,
		.snapshot	= snapshot,
	};

	return 0;
}
2911 
nlink_cmp(const void * _l,const void * _r)2912 static int nlink_cmp(const void *_l, const void *_r)
2913 {
2914 	const struct nlink *l = _l;
2915 	const struct nlink *r = _r;
2916 
2917 	return cmp_int(l->inum, r->inum);
2918 }
2919 
/*
 * Bump the link count for @inum as seen from a dirent in @snapshot, for every
 * table entry whose snapshot makes the reference visible.  Entries outside
 * [range_start, range_end) belong to a different pass iteration and are
 * skipped.
 */
static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
		     struct nlink_table *links,
		     u64 range_start, u64 range_end, u64 inum, u32 snapshot)
{
	struct nlink *link, key = {
		.inum = inum, .snapshot = U32_MAX,
	};

	if (inum < range_start || inum >= range_end)
		return;

	/*
	 * nlink_cmp() compares inum only, so with duplicate inums (one entry
	 * per snapshot) bsearch may land on any of them:
	 */
	link = __inline_bsearch(&key, links->d, links->nr,
				sizeof(links->d[0]), nlink_cmp);
	if (!link)
		return;

	/* Rewind to the first entry for this inum: */
	while (link > links->d && link[0].inum == link[-1].inum)
		--link;

	for (; link < links->d + links->nr && link->inum == inum; link++)
		if (ref_visible(c, s, snapshot, link->snapshot)) {
			link->count++;
			/*
			 * NOTE(review): early break presumably relies on
			 * entries for an inum being ordered by snapshot from
			 * the btree walk, so later entries can't also be
			 * visible — confirm against find_hardlinks ordering.
			 */
			if (link->snapshot >= snapshot)
				break;
		}
}
2946 
/*
 * Pass 1 of the nlink check: collect every (inum, snapshot) that could be a
 * hardlink into @t, starting from inode number @start.  If the table fills
 * up (allocation failure in add_nlink()), *end is set to the first inode not
 * covered so the caller can process the keyspace in ranges.
 */
noinline_for_stack
static int check_nlinks_find_hardlinks(struct bch_fs *c,
				       struct nlink_table *t,
				       u64 start, u64 *end)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_inodes,
				   POS(0, start),
				   BTREE_ITER_intent|
				   BTREE_ITER_prefetch|
				   BTREE_ITER_all_snapshots, k, ({
			if (!bkey_is_inode(k.k))
				continue;

			/* Should never fail, checked by bch2_inode_invalid: */
			struct bch_inode_unpacked u;
			/* _ret3 is the iteration macro's internal return variable: */
			_ret3 = bch2_inode_unpack(k, &u);
			if (_ret3)
				break;

			/*
			 * Backpointer and directory structure checks are sufficient for
			 * directories, since they can't have hardlinks:
			 */
			if (S_ISDIR(u.bi_mode))
				continue;

			/*
			 * Previous passes ensured that bi_nlink is nonzero if
			 * it had multiple hardlinks:
			 */
			if (!u.bi_nlink)
				continue;

			ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
			if (ret) {
				/* Table full: record resume point, not an error: */
				*end = k.k->p.offset;
				ret = 0;
				break;
			}
			0;
		})));

	bch_err_fn(c, ret);
	return ret;
}
2993 
/*
 * Pass 2 of the nlink check: walk every dirent and count references to the
 * inodes collected in @links, restricted to inode numbers in
 * [range_start, range_end).  Directory and subvolume dirents are skipped —
 * directories can't be hardlinked and subvol dirents don't contribute to
 * bi_nlink.
 */
noinline_for_stack
static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
				     u64 range_start, u64 range_end)
{
	struct snapshots_seen s;

	snapshots_seen_init(&s);

	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
				   BTREE_ITER_intent|
				   BTREE_ITER_prefetch|
				   BTREE_ITER_all_snapshots, k, ({
			/* Track snapshot visibility for ref_visible() in inc_link(): */
			ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
			if (ret)
				break;

			if (k.k->type == KEY_TYPE_dirent) {
				struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);

				if (d.v->d_type != DT_DIR &&
				    d.v->d_type != DT_SUBVOL)
					inc_link(c, &s, links, range_start, range_end,
						 le64_to_cpu(d.v->d_inum), d.k->p.snapshot);
			}
			0;
		})));

	snapshots_seen_exit(&s);

	bch_err_fn(c, ret);
	return ret;
}
3027 
/*
 * Pass 3 helper: compare one inode's on-disk bi_nlink against the count
 * accumulated in @links, fixing it (with user confirmation via fsck_err) on
 * mismatch.  @idx is the caller's cursor into the sorted table and advances
 * monotonically in lockstep with the inode walk.
 *
 * Returns 1 to stop iteration once past @range_end, 0 to continue, negative
 * on error.
 */
static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
				     struct bkey_s_c k,
				     struct nlink_table *links,
				     size_t *idx, u64 range_end)
{
	struct bch_inode_unpacked u;
	struct nlink *link = &links->d[*idx];
	int ret = 0;

	if (k.k->p.offset >= range_end)
		return 1;

	if (!bkey_is_inode(k.k))
		return 0;

	ret = bch2_inode_unpack(k, &u);
	if (ret)
		return ret;

	/* Directories and unlinked inodes were never added to the table: */
	if (S_ISDIR(u.bi_mode))
		return 0;

	if (!u.bi_nlink)
		return 0;

	/* Advance the cursor to this inode's (inum, snapshot) entry: */
	while ((cmp_int(link->inum, k.k->p.offset) ?:
		cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
		BUG_ON(*idx == links->nr);
		link = &links->d[++*idx];
	}

	if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
			trans, inode_wrong_nlink,
			"inode %llu type %s has wrong i_nlink (%u, should be %u)",
			u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
			bch2_inode_nlink_get(&u), link->count)) {
		bch2_inode_nlink_set(&u, link->count);
		ret = __bch2_fsck_write_inode(trans, &u);
	}
fsck_err:
	return ret;
}
3070 
/*
 * Pass 3 of the nlink check: re-walk the inodes in [range_start, range_end)
 * and repair any bi_nlink that disagrees with the counted references.
 */
noinline_for_stack
static int check_nlinks_update_hardlinks(struct bch_fs *c,
			       struct nlink_table *links,
			       u64 range_start, u64 range_end)
{
	size_t idx = 0;

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
				POS(0, range_start),
				BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
	/* Positive return (1) just means we hit range_end — not an error: */
	if (ret < 0) {
		bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
		return ret;
	}

	return 0;
}
3091 
bch2_check_nlinks(struct bch_fs * c)3092 int bch2_check_nlinks(struct bch_fs *c)
3093 {
3094 	struct nlink_table links = { 0 };
3095 	u64 this_iter_range_start, next_iter_range_start = 0;
3096 	int ret = 0;
3097 
3098 	do {
3099 		this_iter_range_start = next_iter_range_start;
3100 		next_iter_range_start = U64_MAX;
3101 
3102 		ret = check_nlinks_find_hardlinks(c, &links,
3103 						  this_iter_range_start,
3104 						  &next_iter_range_start);
3105 
3106 		ret = check_nlinks_walk_dirents(c, &links,
3107 					  this_iter_range_start,
3108 					  next_iter_range_start);
3109 		if (ret)
3110 			break;
3111 
3112 		ret = check_nlinks_update_hardlinks(c, &links,
3113 					 this_iter_range_start,
3114 					 next_iter_range_start);
3115 		if (ret)
3116 			break;
3117 
3118 		links.nr = 0;
3119 	} while (next_iter_range_start != U64_MAX);
3120 
3121 	kvfree(links.d);
3122 	bch_err_fn(c, ret);
3123 	return ret;
3124 }
3125 
fix_reflink_p_key(struct btree_trans * trans,struct btree_iter * iter,struct bkey_s_c k)3126 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
3127 			     struct bkey_s_c k)
3128 {
3129 	struct bkey_s_c_reflink_p p;
3130 	struct bkey_i_reflink_p *u;
3131 
3132 	if (k.k->type != KEY_TYPE_reflink_p)
3133 		return 0;
3134 
3135 	p = bkey_s_c_to_reflink_p(k);
3136 
3137 	if (!p.v->front_pad && !p.v->back_pad)
3138 		return 0;
3139 
3140 	u = bch2_trans_kmalloc(trans, sizeof(*u));
3141 	int ret = PTR_ERR_OR_ZERO(u);
3142 	if (ret)
3143 		return ret;
3144 
3145 	bkey_reassemble(&u->k_i, k);
3146 	u->v.front_pad	= 0;
3147 	u->v.back_pad	= 0;
3148 
3149 	return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun);
3150 }
3151 
/*
 * One-time repair pass: walk all extents and zero bogus reflink_p padding.
 * Skipped entirely on filesystems new enough to never have written bad pads.
 */
int bch2_fix_reflink_p(struct bch_fs *c)
{
	if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
		return 0;

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
				BTREE_ID_extents, POS_MIN,
				BTREE_ITER_intent|BTREE_ITER_prefetch|
				BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			fix_reflink_p_key(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}
3167 
3168 #ifndef NO_BCACHEFS_CHARDEV
3169 
/* State for an offline/online fsck kthread driven over a chardev ioctl. */
struct fsck_thread {
	struct thread_with_stdio thr;	/* must be first: freed via container_of */
	struct bch_fs		*c;	/* fs under fsck (may be ERR_PTR for offline) */
	struct bch_opts		opts;	/* mount/fsck options parsed from userspace */
};
3175 
bch2_fsck_thread_exit(struct thread_with_stdio * _thr)3176 static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
3177 {
3178 	struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr);
3179 	kfree(thr);
3180 }
3181 
/*
 * Kthread body for offline fsck: start the (already-opened) filesystem —
 * which runs the fsck passes — then report results over the redirected
 * stdio.  The return value uses fsck(8)-style bits: 1 = errors were
 * corrected, 4 = errors remain.
 */
static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio)
{
	struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
	struct bch_fs *c = thr->c;

	/* bch2_fs_open() may have handed us an ERR_PTR: */
	int ret = PTR_ERR_OR_ZERO(c);
	if (ret)
		return ret;

	ret = bch2_fs_start(thr->c);
	if (ret)
		goto err;

	if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
		bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name);
		ret |= 1;
	}
	if (test_bit(BCH_FS_error, &c->flags)) {
		bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name);
		ret |= 4;
	}
err:
	bch2_fs_stop(c);
	return ret;
}
3207 
/* Thread ops for the offline-fsck kthread spawned by the ioctl below. */
static const struct thread_with_stdio_ops bch2_offline_fsck_ops = {
	.exit		= bch2_fsck_thread_exit,
	.fn		= bch2_fsck_offline_thread_fn,
};
3212 
/*
 * BCH_IOCTL_FSCK_OFFLINE: open a not-mounted filesystem from a userspace-
 * supplied device list and fsck it on a kthread, with the thread's stdio
 * redirected back to the calling process.
 *
 * Ownership: on success, @thr is owned (and eventually freed) by the
 * thread_with_stdio machinery via bch2_fsck_thread_exit(); on the err path
 * we free it ourselves.  The devs strings are always freed here.
 */
long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
{
	struct bch_ioctl_fsck_offline arg;
	struct fsck_thread *thr = NULL;
	darray_const_str devs = {};
	long ret = 0;

	if (copy_from_user(&arg, user_arg, sizeof(arg)))
		return -EFAULT;

	if (arg.flags)
		return -EINVAL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* Copy in the array of device-path pointers, then each string: */
	for (size_t i = 0; i < arg.nr_devs; i++) {
		u64 dev_u64;
		ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64));
		if (ret)
			goto err;

		char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX);
		ret = PTR_ERR_OR_ZERO(dev_str);
		if (ret)
			goto err;

		ret = darray_push(&devs, dev_str);
		if (ret) {
			kfree(dev_str);
			goto err;
		}
	}

	thr = kzalloc(sizeof(*thr), GFP_KERNEL);
	if (!thr) {
		ret = -ENOMEM;
		goto err;
	}

	thr->opts = bch2_opts_empty();

	if (arg.opts) {
		char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
		ret =   PTR_ERR_OR_ZERO(optstr) ?:
			bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr, false);
		if (!IS_ERR(optstr))
			kfree(optstr);

		if (ret)
			goto err;
	}

	/* Route all fs output through the thread's stdio pipe: */
	opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
	opt_set(thr->opts, read_only, 1);
	opt_set(thr->opts, ratelimit_errors, 0);

	/* We need request_key() to be called before we punt to kthread: */
	opt_set(thr->opts, nostart, true);

	bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops);

	/*
	 * May return ERR_PTR; the open error is handled in the thread fn so
	 * it can be reported over the stdio channel:
	 */
	thr->c = bch2_fs_open(&devs, &thr->opts);

	/* Never panic on errors found while fscking on behalf of userspace: */
	if (!IS_ERR(thr->c) &&
	    thr->c->opts.errors == BCH_ON_ERROR_panic)
		thr->c->opts.errors = BCH_ON_ERROR_ro;

	/* On success this transfers ownership of thr to the kthread: */
	ret = __bch2_run_thread_with_stdio(&thr->thr);
out:
	darray_for_each(devs, i)
		kfree(*i);
	darray_exit(&devs);
	return ret;
err:
	if (thr)
		bch2_fsck_thread_exit(&thr->thr);
	pr_err("ret %s", bch2_err_str(ret));
	goto out;
}
3293 
/*
 * Kthread body for online fsck: run all recovery passes on a live
 * filesystem, temporarily overriding c->opts.fix_errors and redirecting fs
 * output to this thread's stdio.  Releases the recovery run_lock and ro ref
 * taken by bch2_ioctl_fsck_online() before returning.
 */
static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio)
{
	struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
	struct bch_fs *c = thr->c;

	/* Only this thread's output goes to the fsck stdio pipe: */
	c->stdio_filter = current;
	c->stdio = &thr->thr.stdio;

	/*
	 * XXX: can we figure out a way to do this without mucking with c->opts?
	 */
	unsigned old_fix_errors = c->opts.fix_errors;
	if (opt_defined(thr->opts, fix_errors))
		c->opts.fix_errors = thr->opts.fix_errors;
	else
		c->opts.fix_errors = FSCK_FIX_ask;	/* interactive by default */

	c->opts.fsck = true;
	set_bit(BCH_FS_in_fsck, &c->flags);

	/* ~0ULL: run every online recovery pass: */
	int ret = bch2_run_online_recovery_passes(c, ~0ULL);

	clear_bit(BCH_FS_in_fsck, &c->flags);
	bch_err_fn(c, ret);

	/* Restore state before dropping our references: */
	c->stdio = NULL;
	c->stdio_filter = NULL;
	c->opts.fix_errors = old_fix_errors;

	up(&c->recovery.run_lock);
	bch2_ro_ref_put(c);
	return ret;
}
3327 
/* Thread ops for the online-fsck kthread spawned by the ioctl below. */
static const struct thread_with_stdio_ops bch2_online_fsck_ops = {
	.exit		= bch2_fsck_thread_exit,
	.fn		= bch2_fsck_online_thread_fn,
};
3332 
/*
 * BCH_IOCTL_FSCK_ONLINE: fsck a mounted filesystem on a kthread.  Takes a
 * ro ref (so the fs can't go away) and the recovery run_lock (so only one
 * fsck/recovery runs at a time); both are released by the thread fn on
 * success, or here on failure.
 */
long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg)
{
	struct fsck_thread *thr = NULL;
	long ret = 0;

	if (arg.flags)
		return -EINVAL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (!bch2_ro_ref_tryget(c))
		return -EROFS;

	/* Don't block if a recovery pass is already running: */
	if (down_trylock(&c->recovery.run_lock)) {
		bch2_ro_ref_put(c);
		return -EAGAIN;
	}

	thr = kzalloc(sizeof(*thr), GFP_KERNEL);
	if (!thr) {
		ret = -ENOMEM;
		goto err;
	}

	thr->c = c;
	thr->opts = bch2_opts_empty();

	if (arg.opts) {
		char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);

		ret =   PTR_ERR_OR_ZERO(optstr) ?:
			bch2_parse_mount_opts(c, &thr->opts, NULL, optstr, false);
		if (!IS_ERR(optstr))
			kfree(optstr);

		if (ret)
			goto err;
	}

	/* On success, thr, the run_lock and the ro ref pass to the kthread: */
	ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops);
err:
	if (ret < 0) {
		bch_err_fn(c, ret);
		if (thr)
			bch2_fsck_thread_exit(&thr->thr);
		up(&c->recovery.run_lock);
		bch2_ro_ref_put(c);
	}
	return ret;
}
3384 
3385 #endif /* NO_BCACHEFS_CHARDEV */
3386