xref: /linux/fs/bcachefs/fsck.c (revision dd83757f6e686a2188997cb58b5975f744bb7786)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "bcachefs.h"
4 #include "bcachefs_ioctl.h"
5 #include "bkey_buf.h"
6 #include "btree_cache.h"
7 #include "btree_update.h"
8 #include "buckets.h"
9 #include "darray.h"
10 #include "dirent.h"
11 #include "error.h"
12 #include "fs.h"
13 #include "fs-common.h"
14 #include "fsck.h"
15 #include "inode.h"
16 #include "keylist.h"
17 #include "recovery_passes.h"
18 #include "snapshot.h"
19 #include "super.h"
20 #include "thread_with_file.h"
21 #include "xattr.h"
22 
23 #include <linux/bsearch.h>
24 #include <linux/dcache.h> /* struct qstr */
25 
inode_points_to_dirent(struct bch_inode_unpacked * inode,struct bkey_s_c_dirent d)26 static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
27 				   struct bkey_s_c_dirent d)
28 {
29 	return  inode->bi_dir		== d.k->p.inode &&
30 		inode->bi_dir_offset	== d.k->p.offset;
31 }
32 
dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d,struct bch_inode_unpacked * inode)33 static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d,
34 					 struct bch_inode_unpacked *inode)
35 {
36 	if (d.v->d_type == DT_SUBVOL
37 	    ? le32_to_cpu(d.v->d_child_subvol)	== inode->bi_subvol
38 	    : le64_to_cpu(d.v->d_inum)		== inode->bi_inum)
39 		return 0;
40 	return -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
41 }
42 
/*
 * Format the "inode points to dirent that does not point back" message:
 * the offending dirent followed by the inode, one per line.
 */
static void dirent_inode_mismatch_msg(struct printbuf *out,
				      struct bch_fs *c,
				      struct bkey_s_c_dirent dirent,
				      struct bch_inode_unpacked *inode)
{
	prt_str(out, "inode points to dirent that does not point back:");
	prt_newline(out);
	bch2_bkey_val_to_text(out, c, dirent.s_c);
	prt_newline(out);
	bch2_inode_unpacked_to_text(out, inode);
}
54 
dirent_points_to_inode(struct bch_fs * c,struct bkey_s_c_dirent dirent,struct bch_inode_unpacked * inode)55 static int dirent_points_to_inode(struct bch_fs *c,
56 				  struct bkey_s_c_dirent dirent,
57 				  struct bch_inode_unpacked *inode)
58 {
59 	int ret = dirent_points_to_inode_nowarn(dirent, inode);
60 	if (ret) {
61 		struct printbuf buf = PRINTBUF;
62 		dirent_inode_mismatch_msg(&buf, c, dirent, inode);
63 		bch_warn(c, "%s", buf.buf);
64 		printbuf_exit(&buf);
65 	}
66 	return ret;
67 }
68 
/*
 * XXX: this is handling transaction restarts without returning
 * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
 */
/*
 * Sum the sizes of all allocating extents of inode @inum in @snapshot.
 *
 * Returns the sector count, or a negative error code (s64 return lets the
 * error share the value channel; all callers must check for < 0).
 */
static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
				    u32 snapshot)
{
	u64 sectors = 0;

	int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
				SPOS(inum, 0, snapshot),
				POS(inum, U64_MAX),
				0, k, ({
		if (bkey_extent_is_allocation(k.k))
			sectors += k.k->size;
		0;	/* statement-expression value: 0 = keep iterating */
	}));

	return ret ?: sectors;
}
89 
/*
 * Count subdirectory dirents of directory @inum in @snapshot, for checking
 * i_nlink. Returns the count, or a negative error code.
 */
static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
				    u32 snapshot)
{
	u64 subdirs = 0;

	int ret = for_each_btree_key_max(trans, iter, BTREE_ID_dirents,
				    SPOS(inum, 0, snapshot),
				    POS(inum, U64_MAX),
				    0, k, ({
		if (k.k->type == KEY_TYPE_dirent &&
		    bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
			subdirs++;
		0;	/* statement-expression value: 0 = keep iterating */
	}));

	return ret ?: subdirs;
}
107 
/*
 * Look up subvolume @subvol and return its snapshot ID and root inode number.
 *
 * Fix: the original read s.snapshot/s.inode before checking the return of
 * bch2_subvolume_get() - on failure @s may be uninitialized, so that read was
 * undefined behavior and handed garbage to the caller's out-parameters.
 * Outputs are now only written on success.
 */
static int subvol_lookup(struct btree_trans *trans, u32 subvol,
			 u32 *snapshot, u64 *inum)
{
	struct bch_subvolume s;
	int ret = bch2_subvolume_get(trans, subvol, false, &s);
	if (ret)
		return ret;

	*snapshot = le32_to_cpu(s.snapshot);
	*inum = le64_to_cpu(s.inode);
	return 0;
}
118 
/*
 * Find the first (any-snapshot) version of inode @inode_nr and unpack it
 * into @inode.
 *
 * Fix: on normal loop exit the original unconditionally overwrote @ret with
 * -BCH_ERR_ENOENT_inode, clobbering a real iteration error (notably a
 * transaction restart from the _norestart iterator), which the caller would
 * then mistake for a missing inode. Only report ENOENT when the scan itself
 * succeeded.
 */
static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
			      struct bch_inode_unpacked *inode)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr),
				     BTREE_ITER_all_snapshots, k, ret) {
		if (k.k->p.offset != inode_nr)
			break;
		if (!bkey_is_inode(k.k))
			continue;
		ret = bch2_inode_unpack(k, inode);
		goto found;
	}
	if (!ret)
		ret = -BCH_ERR_ENOENT_inode;
found:
	bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
141 
/*
 * Look up and unpack inode @inode_nr in a specific @snapshot.
 *
 * Returns 0 on success, -BCH_ERR_ENOENT_inode if no inode exists there, or
 * a btree lookup error.
 */
static int lookup_inode(struct btree_trans *trans, u64 inode_nr, u32 snapshot,
			struct bch_inode_unpacked *inode)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
					       SPOS(0, inode_nr, snapshot), 0);
	int ret = bkey_err(k);

	if (!ret)
		ret = bkey_is_inode(k.k)
			? bch2_inode_unpack(k, inode)
			: -BCH_ERR_ENOENT_inode;

	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
162 
/*
 * Look up dirent @name in directory @dir within @snapshot; on success fill
 * in the target inode number and dirent type.
 *
 * NOTE(review): for DT_SUBVOL dirents this returns d_inum rather than
 * d_child_subvol - callers appear to only use this for lost+found (a plain
 * directory); confirm if reused elsewhere.
 */
static int lookup_dirent_in_snapshot(struct btree_trans *trans,
			   struct bch_hash_info hash_info,
			   subvol_inum dir, struct qstr *name,
			   u64 *target, unsigned *type, u32 snapshot)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
							 &hash_info, dir, name, 0, snapshot);
	int ret = bkey_err(k);
	if (ret)
		return ret;

	/* read out the value before dropping the iterator that pins it: */
	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
	*target = le64_to_cpu(d.v->d_inum);
	*type = d.v->d_type;
	bch2_trans_iter_exit(trans, &iter);
	return 0;
}
181 
/*
 * Delete the dirent at @pos, doing the hash-table deletion with the parent
 * directory's hash info (needed so str_hash can rehash/whiteout correctly).
 */
static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bch_inode_unpacked dir_inode;
	struct bch_hash_info dir_hash_info;
	int ret;

	/* any snapshot version of the dir will do - hash seed is shared: */
	ret = lookup_first_inode(trans, pos.inode, &dir_inode);
	if (ret)
		goto err;

	dir_hash_info = bch2_hash_info_init(c, &dir_inode);

	bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent);

	ret =   bch2_btree_iter_traverse(&iter) ?:
		bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
				    &dir_hash_info, &iter,
				    BTREE_UPDATE_internal_snapshot_node);
	bch2_trans_iter_exit(trans, &iter);
err:
	bch_err_fn(c, ret);
	return ret;
}
207 
208 /*
209  * Find any subvolume associated with a tree of snapshots
210  * We can't rely on master_subvol - it might have been deleted.
211  */
find_snapshot_tree_subvol(struct btree_trans * trans,u32 tree_id,u32 * subvol)212 static int find_snapshot_tree_subvol(struct btree_trans *trans,
213 				     u32 tree_id, u32 *subvol)
214 {
215 	struct btree_iter iter;
216 	struct bkey_s_c k;
217 	int ret;
218 
219 	for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) {
220 		if (k.k->type != KEY_TYPE_snapshot)
221 			continue;
222 
223 		struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
224 		if (le32_to_cpu(s.v->tree) != tree_id)
225 			continue;
226 
227 		if (s.v->subvol) {
228 			*subvol = le32_to_cpu(s.v->subvol);
229 			goto found;
230 		}
231 	}
232 	ret = -BCH_ERR_ENOENT_no_snapshot_tree_subvol;
233 found:
234 	bch2_trans_iter_exit(trans, &iter);
235 	return ret;
236 }
237 
/* Get lost+found, create if it doesn't exist: */
static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
			    struct bch_inode_unpacked *lostfound,
			    u64 reattaching_inum)
{
	struct bch_fs *c = trans->c;
	struct qstr lostfound_str = QSTR("lost+found");
	struct btree_iter lostfound_iter = { NULL };
	u64 inum = 0;
	unsigned d_type = 0;
	int ret;

	/* resolve snapshot -> snapshot tree: all branches share one lost+found */
	struct bch_snapshot_tree st;
	ret = bch2_snapshot_tree_lookup(trans,
			bch2_snapshot_tree(c, snapshot), &st);
	if (ret)
		return ret;

	/* master_subvol may have been deleted; find any subvol in the tree */
	u32 subvolid;
	ret = find_snapshot_tree_subvol(trans,
				bch2_snapshot_tree(c, snapshot), &subvolid);
	bch_err_msg(c, ret, "finding subvol associated with snapshot tree %u",
		    bch2_snapshot_tree(c, snapshot));
	if (ret)
		return ret;

	struct bch_subvolume subvol;
	ret = bch2_subvolume_get(trans, subvolid, false, &subvol);
	bch_err_msg(c, ret, "looking up subvol %u for snapshot %u", subvolid, snapshot);
	if (ret)
		return ret;

	/*
	 * Subvolume lost its root inode pointer: repair it using the inode
	 * being reattached.
	 *
	 * NOTE(review): only the btree key is updated here - the on-stack
	 * copy `subvol.inode` stays 0, so root_inum.inum below will be 0 on
	 * this path; confirm whether that's intended.
	 */
	if (!subvol.inode) {
		struct btree_iter iter;
		struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter,
				BTREE_ID_subvolumes, POS(0, subvolid),
				0, subvolume);
		ret = PTR_ERR_OR_ZERO(subvol);
		if (ret)
			return ret;

		subvol->v.inode = cpu_to_le64(reattaching_inum);
		bch2_trans_iter_exit(trans, &iter);
	}

	subvol_inum root_inum = {
		.subvol = subvolid,
		.inum = le64_to_cpu(subvol.inode)
	};

	struct bch_inode_unpacked root_inode;
	struct bch_hash_info root_hash_info;
	ret = lookup_inode(trans, root_inum.inum, snapshot, &root_inode);
	bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
		    root_inum.inum, subvolid);
	if (ret)
		return ret;

	root_hash_info = bch2_hash_info_init(c, &root_inode);

	/* does lost+found already exist in the root directory? */
	ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
			      &lostfound_str, &inum, &d_type, snapshot);
	if (bch2_err_matches(ret, ENOENT))
		goto create_lostfound;

	bch_err_fn(c, ret);
	if (ret)
		return ret;

	if (d_type != DT_DIR) {
		bch_err(c, "error looking up lost+found: not a directory");
		return -BCH_ERR_ENOENT_not_directory;
	}

	/*
	 * The bch2_check_dirents pass has already run, dangling dirents
	 * shouldn't exist here:
	 */
	ret = lookup_inode(trans, inum, snapshot, lostfound);
	bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
		    inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
	return ret;

create_lostfound:
	/*
	 * we always create lost+found in the root snapshot; we don't want
	 * different branches of the snapshot tree to have different lost+found
	 */
	snapshot = le32_to_cpu(st.root_snapshot);
	/*
	 * XXX: we could have a nicer log message here  if we had a nice way to
	 * walk backpointers to print a path
	 */
	struct printbuf path = PRINTBUF;
	ret = bch2_inum_to_path(trans, root_inum, &path);
	if (ret)
		goto err;

	bch_notice(c, "creating %s/lost+found in subvol %llu snapshot %u",
		   path.buf, root_inum.subvol, snapshot);
	printbuf_exit(&path);

	u64 now = bch2_current_time(c);
	u64 cpu = raw_smp_processor_id();

	/* new directory inode, 0700, parented to the subvolume root: */
	bch2_inode_init_early(c, lostfound);
	bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
	lostfound->bi_dir = root_inode.bi_inum;
	lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot);

	root_inode.bi_nlink++;

	ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
	if (ret)
		goto err;

	bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot);
	ret = bch2_btree_iter_traverse(&lostfound_iter);
	if (ret)
		goto err;

	/* create the dirent, then write the inode with the backpointer set: */
	ret =   bch2_dirent_create_snapshot(trans,
				0, root_inode.bi_inum, snapshot, &root_hash_info,
				mode_to_type(lostfound->bi_mode),
				&lostfound_str,
				lostfound->bi_inum,
				&lostfound->bi_dir_offset,
				STR_HASH_must_create) ?:
		bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
				       BTREE_UPDATE_internal_snapshot_node);
err:
	bch_err_msg(c, ret, "creating lost+found");
	bch2_trans_iter_exit(trans, &lostfound_iter);
	return ret;
}
373 
inode_should_reattach(struct bch_inode_unpacked * inode)374 static inline bool inode_should_reattach(struct bch_inode_unpacked *inode)
375 {
376 	if (inode->bi_inum == BCACHEFS_ROOT_INO &&
377 	    inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)
378 		return false;
379 
380 	return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked);
381 }
382 
/*
 * Whiteout the dirent at @d_pos in snapshot @snapshot, but only if a key
 * actually exists at exactly that position (the lookup may land on a key in
 * an ancestor snapshot, in which case there's nothing to delete).
 */
static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents,
					SPOS(d_pos.inode, d_pos.offset, snapshot),
					BTREE_ITER_intent|
					BTREE_ITER_with_updates);
	int ret = bkey_err(k);
	if (ret)
		return ret;

	if (bpos_eq(k.k->p, d_pos)) {
		/*
		 * bch2_btree_delete_at() doesn't work because the update path
		 * doesn't internally use BTREE_ITER_with_updates yet - so
		 * build the whiteout key by hand:
		 */
		struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
		ret = PTR_ERR_OR_ZERO(k);
		if (ret)
			goto err;

		bkey_init(&k->k);
		k->k.type = KEY_TYPE_whiteout;
		k->k.p = iter.pos;
		ret = bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node);
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
413 
/*
 * Reattach an orphaned inode: create a dirent for it in lost+found (named
 * after its inode number, or "subvol-N" for a subvolume root), update its
 * backpointer fields, and fix up versions of the inode in child snapshots.
 */
static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
{
	struct bch_fs *c = trans->c;
	struct bch_inode_unpacked lostfound;
	char name_buf[20];
	int ret;

	u32 dirent_snapshot = inode->bi_snapshot;
	if (inode->bi_subvol) {
		/* subvolume roots get reparented to the root subvolume: */
		inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;

		u64 root_inum;
		ret = subvol_lookup(trans, inode->bi_parent_subvol,
				    &dirent_snapshot, &root_inum);
		if (ret)
			return ret;

		snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
	} else {
		snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
	}

	ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
	if (ret)
		return ret;

	/* reattaching a directory adds a ".." link to lost+found: */
	lostfound.bi_nlink += S_ISDIR(inode->bi_mode);

	/* ensure lost+found inode is also present in inode snapshot */
	if (!inode->bi_subvol) {
		BUG_ON(!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, lostfound.bi_snapshot));
		lostfound.bi_snapshot = inode->bi_snapshot;
	}

	ret = __bch2_fsck_write_inode(trans, &lostfound);
	if (ret)
		return ret;

	struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound);
	struct qstr name = QSTR(name_buf);

	inode->bi_dir = lostfound.bi_inum;

	/* bi_dir_offset is filled in by dirent creation: */
	ret = bch2_dirent_create_snapshot(trans,
				inode->bi_parent_subvol, lostfound.bi_inum,
				dirent_snapshot,
				&dir_hash,
				inode_d_type(inode),
				&name,
				inode->bi_subvol ?: inode->bi_inum,
				&inode->bi_dir_offset,
				STR_HASH_must_create);
	if (ret) {
		bch_err_msg(c, ret, "error creating dirent");
		return ret;
	}

	ret = __bch2_fsck_write_inode(trans, inode);
	if (ret)
		return ret;

	/*
	 * Fix up inodes in child snapshots: if they should also be reattached
	 * update the backpointer field, if they should not be we need to emit
	 * whiteouts for the dirent we just created.
	 */
	if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) {
		snapshot_id_list whiteouts_done;
		struct btree_iter iter;
		struct bkey_s_c k;

		darray_init(&whiteouts_done);

		/* walk descendants: snapshot IDs decrease from parent to child */
		for_each_btree_key_reverse_norestart(trans, iter,
				BTREE_ID_inodes, SPOS(0, inode->bi_inum, inode->bi_snapshot - 1),
				BTREE_ITER_all_snapshots|BTREE_ITER_intent, k, ret) {
			if (k.k->p.offset != inode->bi_inum)
				break;

			if (!bkey_is_inode(k.k) ||
			    !bch2_snapshot_is_ancestor(c, k.k->p.snapshot, inode->bi_snapshot) ||
			    snapshot_list_has_ancestor(c, &whiteouts_done, k.k->p.snapshot))
				continue;

			struct bch_inode_unpacked child_inode;
			ret = bch2_inode_unpack(k, &child_inode);
			if (ret)
				break;

			if (!inode_should_reattach(&child_inode)) {
				/* child already has a parent: whiteout our new dirent there */
				ret = maybe_delete_dirent(trans,
							  SPOS(lostfound.bi_inum, inode->bi_dir_offset,
							       dirent_snapshot),
							  k.k->p.snapshot);
				if (ret)
					break;

				ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot);
				if (ret)
					break;
			} else {
				/* child is also orphaned: point it at the new dirent */
				iter.snapshot = k.k->p.snapshot;
				child_inode.bi_dir = inode->bi_dir;
				child_inode.bi_dir_offset = inode->bi_dir_offset;

				ret = bch2_inode_write_flags(trans, &iter, &child_inode,
							     BTREE_UPDATE_internal_snapshot_node);
				if (ret)
					break;
			}
		}
		darray_exit(&whiteouts_done);
		bch2_trans_iter_exit(trans, &iter);
	}

	return ret;
}
531 
/*
 * Fetch the dirent at @pos; @iter pins the returned key and must be exited
 * by the caller.
 */
static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
						struct btree_iter *iter,
						struct bpos pos)
{
	return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
}
538 
/*
 * Delete the dirent that @inode's backpointer (bi_dir/bi_dir_offset) refers
 * to, after verifying the dirent really points back at this inode.  No-op if
 * the inode has no backpointer.
 */
static int remove_backpointer(struct btree_trans *trans,
			      struct bch_inode_unpacked *inode)
{
	if (!inode->bi_dir)
		return 0;

	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c_dirent d = dirent_get_by_pos(trans, &iter,
				     SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot));
	int ret = bkey_err(d) ?:
		  dirent_points_to_inode(c, d, inode) ?:
		  __remove_dirent(trans, d.k->p);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
555 
/*
 * Reattach a subvolume whose root inode is orphaned: drop any stale
 * backpointer dirent, then put it in lost+found as "subvol-N".
 */
static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s)
{
	struct bch_fs *c = trans->c;

	struct bch_inode_unpacked inode;
	int ret = bch2_inode_find_by_inum_trans(trans,
				(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
				&inode);
	if (ret)
		return ret;

	/* ENOENT (no dirent to remove) is logged but not fatal here: */
	ret = remove_backpointer(trans, &inode);
	if (!bch2_err_matches(ret, ENOENT))
		bch_err_msg(c, ret, "removing dirent");
	if (ret)
		return ret;

	ret = reattach_inode(trans, &inode);
	bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
	return ret;
}
577 
/*
 * Recreate a missing subvolume key for @snapshotid/@subvolid.  If @inum is 0
 * (called from check_dirents - the subvolume's root inode doesn't exist), a
 * new root directory inode is created first.  Also repairs the snapshot and
 * snapshot_tree keys to point back at the reconstructed subvolume.
 */
static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
{
	struct bch_fs *c = trans->c;

	if (!bch2_snapshot_is_leaf(c, snapshotid)) {
		bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
		return -BCH_ERR_fsck_repair_unimplemented;
	}

	/*
	 * If inum isn't set, that means we're being called from check_dirents,
	 * not check_inodes - the root of this subvolume doesn't exist or we
	 * would have found it there:
	 */
	if (!inum) {
		struct btree_iter inode_iter = {};
		struct bch_inode_unpacked new_inode;
		u64 cpu = raw_smp_processor_id();

		bch2_inode_init_early(c, &new_inode);
		bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);

		new_inode.bi_subvol = subvolid;

		int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
			  bch2_btree_iter_traverse(&inode_iter) ?:
			  bch2_inode_write(trans, &inode_iter, &new_inode);
		bch2_trans_iter_exit(trans, &inode_iter);
		if (ret)
			return ret;

		inum = new_inode.bi_inum;
	}

	bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);

	/* insert the new subvolume key: */
	struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
	int ret = PTR_ERR_OR_ZERO(new_subvol);
	if (ret)
		return ret;

	bkey_subvolume_init(&new_subvol->k_i);
	new_subvol->k.p.offset	= subvolid;
	new_subvol->v.snapshot	= cpu_to_le32(snapshotid);
	new_subvol->v.inode	= cpu_to_le64(inum);
	ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
	if (ret)
		return ret;

	/* point the snapshot key back at this subvolume: */
	struct btree_iter iter;
	struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_snapshots, POS(0, snapshotid),
			0, snapshot);
	ret = PTR_ERR_OR_ZERO(s);
	bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
	if (ret)
		return ret;

	u32 snapshot_tree = le32_to_cpu(s->v.tree);

	s->v.subvol = cpu_to_le32(subvolid);
	SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
	bch2_trans_iter_exit(trans, &iter);

	/* and give the snapshot tree a master subvol if it lacks one: */
	struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
			0, snapshot_tree);
	ret = PTR_ERR_OR_ZERO(st);
	bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
	if (ret)
		return ret;

	if (!st->v.master_subvol)
		st->v.master_subvol = cpu_to_le32(subvolid);

	bch2_trans_iter_exit(trans, &iter);
	return 0;
}
656 
/*
 * Recreate a missing inode @inum in @snapshot, inferring its type from the
 * btree where its keys were found: extents -> regular file (with i_size set
 * from the last extent), dirents -> directory, xattrs -> regular file.
 */
static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum)
{
	struct bch_fs *c = trans->c;
	unsigned i_mode = S_IFREG;
	u64 i_size = 0;

	switch (btree) {
	case BTREE_ID_extents: {
		struct btree_iter iter = {};

		/* find the last extent to recover the file size: */
		bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
		struct bkey_s_c k = bch2_btree_iter_peek_prev_min(&iter, POS(inum, 0));
		bch2_trans_iter_exit(trans, &iter);
		int ret = bkey_err(k);
		if (ret)
			return ret;

		/* extent offsets are in 512-byte sectors: */
		i_size = k.k->p.offset << 9;
		break;
	}
	case BTREE_ID_dirents:
		i_mode = S_IFDIR;
		break;
	case BTREE_ID_xattrs:
		break;
	default:
		BUG();
	}

	struct bch_inode_unpacked new_inode;
	bch2_inode_init_early(c, &new_inode);
	bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL);
	new_inode.bi_size = i_size;
	new_inode.bi_inum = inum;
	new_inode.bi_snapshot = snapshot;

	return __bch2_fsck_write_inode(trans, &new_inode);
}
695 
/*
 * Snapshot IDs already seen at the current key position @pos; @ids is kept
 * sorted (see snapshots_seen_add_inorder()) and is used by
 * key_visible_in_snapshot() to decide visibility between snapshots.
 */
struct snapshots_seen {
	struct bpos			pos;
	snapshot_id_list		ids;
};
700 
/* Free the snapshot ID list. */
static inline void snapshots_seen_exit(struct snapshots_seen *s)
{
	darray_exit(&s->ids);
}
705 
snapshots_seen_init(struct snapshots_seen * s)706 static inline void snapshots_seen_init(struct snapshots_seen *s)
707 {
708 	memset(s, 0, sizeof(*s));
709 }
710 
/*
 * Insert @id into the sorted seen-list, keeping it sorted; no-op if already
 * present.  Returns 0 or -ENOMEM (logged).
 */
static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
{
	u32 *i;
	/* find insertion point; note @i is used after the loop */
	__darray_for_each(s->ids, i) {
		if (*i == id)
			return 0;
		if (*i > id)
			break;
	}

	int ret = darray_insert_item(&s->ids, i - s->ids.data, id);
	if (ret)
		bch_err(c, "error reallocating snapshots_seen table (size %zu)",
			s->ids.size);
	return ret;
}
727 
snapshots_seen_update(struct bch_fs * c,struct snapshots_seen * s,enum btree_id btree_id,struct bpos pos)728 static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
729 				 enum btree_id btree_id, struct bpos pos)
730 {
731 	if (!bkey_eq(s->pos, pos))
732 		s->ids.nr = 0;
733 	s->pos = pos;
734 
735 	return snapshot_list_add_nodup(c, &s->ids, pos.snapshot);
736 }
737 
/**
 * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
 * and @ancestor hasn't been overwritten in @seen
 *
 * @c:		filesystem handle
 * @seen:	list of snapshot ids already seen at current position
 * @id:		descendent snapshot id
 * @ancestor:	ancestor snapshot id
 *
 * Returns:	whether key in @ancestor snapshot is visible in @id snapshot
 */
static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
				    u32 id, u32 ancestor)
{
	ssize_t i;

	EBUG_ON(id > ancestor);

	/* @ancestor should be the snapshot most recently added to @seen */
	EBUG_ON(ancestor != seen->pos.snapshot);
	EBUG_ON(ancestor != darray_last(seen->ids));

	if (id == ancestor)
		return true;

	if (!bch2_snapshot_is_ancestor(c, id, ancestor))
		return false;

	/*
	 * We know that @id is a descendant of @ancestor, we're checking if
	 * we've seen a key that overwrote @ancestor - i.e. also a descendent of
	 * @ancestor and with @id as a descendent.
	 *
	 * But we already know that we're scanning IDs between @id and @ancestor
	 * numerically, since snapshot ID lists are kept sorted, so if we find
	 * an id that's an ancestor of @id we're done:
	 */

	/* skip the last entry (== @ancestor), scan downward toward @id */
	for (i = seen->ids.nr - 2;
	     i >= 0 && seen->ids.data[i] >= id;
	     --i)
		if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]))
			return false;

	return true;
}
784 
785 /**
786  * ref_visible - given a key with snapshot id @src that points to a key with
787  * snapshot id @dst, test whether there is some snapshot in which @dst is
788  * visible.
789  *
790  * @c:		filesystem handle
791  * @s:		list of snapshot IDs already seen at @src
792  * @src:	snapshot ID of src key
793  * @dst:	snapshot ID of dst key
794  * Returns:	true if there is some snapshot in which @dst is visible
795  *
796  * Assumes we're visiting @src keys in natural key order
797  */
ref_visible(struct bch_fs * c,struct snapshots_seen * s,u32 src,u32 dst)798 static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
799 			u32 src, u32 dst)
800 {
801 	return dst <= src
802 		? key_visible_in_snapshot(c, s, dst, src)
803 		: bch2_snapshot_is_ancestor(c, src, dst);
804 }
805 
/*
 * Two-sided visibility check: like ref_visible(), but with seen-lists for
 * both keys; orients the pair so the numerically larger (descendent) id is
 * passed as the ancestor argument.
 */
static int ref_visible2(struct bch_fs *c,
			u32 src, struct snapshots_seen *src_seen,
			u32 dst, struct snapshots_seen *dst_seen)
{
	if (dst > src)
		return key_visible_in_snapshot(c, dst_seen, src, dst);

	return key_visible_in_snapshot(c, src_seen, dst, src);
}
816 
/*
 * Iterate @_i over the inode walker entries in @_w whose snapshot is visible
 * in @_snapshot (per the keys recorded in @_s); entries are sorted by
 * snapshot ID, so iteration stops at the first entry past @_snapshot.
 */
#define for_each_visible_inode(_c, _s, _w, _snapshot, _i)				\
	for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr &&	\
	     (_i)->snapshot <= (_snapshot); _i++)					\
		if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
821 
/* One snapshot version of the inode currently being walked: */
struct inode_walker_entry {
	struct bch_inode_unpacked inode;
	u32			snapshot;	/* snapshot this version lives in */
	u64			count;		/* accumulator for per-snapshot checks */
	u64			i_size;
};

/*
 * Cache of all snapshot versions of a single inode number, refilled by
 * get_inodes_all_snapshots() whenever the walk crosses to a new inode:
 */
struct inode_walker {
	bool				first_this_inode;
	bool				have_inodes;
	bool				recalculate_sums;
	struct bpos			last_pos;	/* last key position walked */

	DARRAY(struct inode_walker_entry) inodes;
	snapshot_id_list		deletes;
};
838 
inode_walker_exit(struct inode_walker * w)839 static void inode_walker_exit(struct inode_walker *w)
840 {
841 	darray_exit(&w->inodes);
842 	darray_exit(&w->deletes);
843 }
844 
inode_walker_init(void)845 static struct inode_walker inode_walker_init(void)
846 {
847 	return (struct inode_walker) { 0, };
848 }
849 
add_inode(struct bch_fs * c,struct inode_walker * w,struct bkey_s_c inode)850 static int add_inode(struct bch_fs *c, struct inode_walker *w,
851 		     struct bkey_s_c inode)
852 {
853 	struct bch_inode_unpacked u;
854 
855 	return bch2_inode_unpack(inode, &u) ?:
856 		darray_push(&w->inodes, ((struct inode_walker_entry) {
857 		.inode		= u,
858 		.snapshot	= inode.k->p.snapshot,
859 	}));
860 }
861 
/*
 * Refill walker @w with every snapshot version of inode @inum.
 *
 * Fix: the return value of add_inode() was ignored - an unpack failure or
 * -ENOMEM from the darray push would leave the walker silently missing
 * entries. Propagate the error instead.
 */
static int get_inodes_all_snapshots(struct btree_trans *trans,
				    struct inode_walker *w, u64 inum)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	/*
	 * We no longer have inodes for w->last_pos; clear this to avoid
	 * screwing up check_i_sectors/check_subdir_count if we take a
	 * transaction restart here:
	 */
	w->have_inodes = false;
	w->recalculate_sums = false;
	w->inodes.nr = 0;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
				     BTREE_ITER_all_snapshots, k, ret) {
		if (k.k->p.offset != inum)
			break;

		if (bkey_is_inode(k.k)) {
			ret = add_inode(c, w, k);
			if (ret)
				break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		return ret;

	w->first_this_inode = true;
	w->have_inodes = true;
	return 0;
}
896 
/*
 * Find the walker entry whose snapshot is visible from @k's snapshot
 * (entries are sorted newest-snapshot first, so the first ancestor match is
 * the closest).  If the key is in a strictly newer snapshot than any cached
 * inode - which shouldn't happen, since updating a key should update the
 * inode too - synthesize and insert a new entry for that snapshot.
 *
 * Returns the matching entry, NULL if none, or an ERR_PTR on allocation
 * failure.
 */
static struct inode_walker_entry *
lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k)
{
	bool is_whiteout = k.k->type == KEY_TYPE_whiteout;

	struct inode_walker_entry *i;
	__darray_for_each(w->inodes, i)
		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot))
			goto found;

	return NULL;
found:
	BUG_ON(k.k->p.snapshot > i->snapshot);

	if (k.k->p.snapshot != i->snapshot && !is_whiteout) {
		struct inode_walker_entry new = *i;

		new.snapshot	= k.k->p.snapshot;
		new.count	= 0;
		new.i_size	= 0;

		struct printbuf buf = PRINTBUF;
		bch2_bkey_val_to_text(&buf, c, k);

		bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
			 "unexpected because we should always update the inode when we update a key in that inode\n"
			 "%s",
			 w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf);
		printbuf_exit(&buf);

		/* keep the array sorted by descending snapshot ID: */
		while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot)
			--i;

		/* insertion may reallocate; recompute @i from the saved index */
		size_t pos = i - w->inodes.data;
		int ret = darray_insert_item(&w->inodes, pos, new);
		if (ret)
			return ERR_PTR(ret);

		i = w->inodes.data + pos;
	}

	return i;
}
940 
/*
 * Advance the walker to key @k: refill the per-snapshot inode cache when
 * crossing to a new inode number, then return the entry visible from @k's
 * snapshot (NULL if none, ERR_PTR on error).
 */
static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
					     struct inode_walker *w,
					     struct bkey_s_c k)
{
	if (w->last_pos.inode != k.k->p.inode) {
		int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
		if (ret)
			return ERR_PTR(ret);
	}

	w->last_pos = k.k->p;

	return lookup_inode_for_snapshot(trans->c, w, k);
}
955 
/*
 * Collect into @w all versions of inode @inum that are visible from the
 * snapshots recorded in @s; snapshots where the inode has been deleted
 * (non-inode key) are recorded in w->deletes instead.
 */
static int get_visible_inodes(struct btree_trans *trans,
			      struct inode_walker *w,
			      struct snapshots_seen *s,
			      u64 inum)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	w->inodes.nr = 0;
	w->deletes.nr = 0;

	for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot),
			   BTREE_ITER_all_snapshots, k, ret) {
		if (k.k->p.offset != inum)
			break;

		if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot))
			continue;

		/* already deleted in an ancestor of this snapshot: */
		if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot))
			continue;

		ret = bkey_is_inode(k.k)
			? add_inode(c, w, k)
			: snapshot_list_add(c, &w->deletes, k.k->p.snapshot);
		if (ret)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}
990 
991 /*
992  * Prefer to delete the first one, since that will be the one at the wrong
993  * offset:
994  * return value: 0 -> delete k1, 1 -> delete k2
995  */
/*
 * After a dirent has been moved (e.g. by hash-table repair), update the
 * backpointer (bi_dir_offset) of every visible version of the target inode to
 * point at the dirent's new offset.  No-op for non-dirent keys.
 */
int bch2_fsck_update_backpointers(struct btree_trans *trans,
				  struct snapshots_seen *s,
				  const struct bch_hash_desc desc,
				  struct bch_hash_info *hash_info,
				  struct bkey_i *new)
{
	if (new->k.type != KEY_TYPE_dirent)
		return 0;

	struct bkey_i_dirent *d = bkey_i_to_dirent(new);
	struct inode_walker target = inode_walker_init();
	int ret = 0;

	if (d->v.d_type == DT_SUBVOL) {
		/* Subvolume dirents are not handled by this path: */
		BUG();
	} else {
		ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum));
		if (ret)
			goto err;

		darray_for_each(target.inodes, i) {
			i->inode.bi_dir_offset = d->k.p.offset;
			ret = __bch2_fsck_write_inode(trans, &i->inode);
			if (ret)
				goto err;
		}
	}
err:
	inode_walker_exit(&target);
	return ret;
}
1027 
/*
 * Look up the dirent an inode's backpointer (bi_dir/bi_dir_offset) points to.
 *
 * For subvolume roots the parent dirent lives in the parent subvolume, so
 * *snapshot is first replaced with the parent subvol's snapshot.  Errors are
 * returned encoded in the .k pointer.
 */
static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
					       struct btree_iter *iter,
					       struct bch_inode_unpacked *inode,
					       u32 *snapshot)
{
	if (inode->bi_subvol) {
		u64 inum;
		int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
		if (ret)
			return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) });
	}

	return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
}
1042 
check_inode_deleted_list(struct btree_trans * trans,struct bpos p)1043 static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
1044 {
1045 	struct btree_iter iter;
1046 	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
1047 	int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
1048 	bch2_trans_iter_exit(trans, &iter);
1049 	return ret;
1050 }
1051 
/*
 * Verify that the dirent this inode's backpointer points to exists and points
 * back at the inode; if not, clear bi_dir/bi_dir_offset and tell the caller
 * to rewrite the inode (via *write_inode).
 */
static int check_inode_dirent_inode(struct btree_trans *trans,
				    struct bch_inode_unpacked *inode,
				    bool *write_inode)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;

	u32 inode_snapshot = inode->bi_snapshot;
	struct btree_iter dirent_iter = {};
	struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
	int ret = bkey_err(d);
	/* ENOENT (missing dirent) is handled below as a fsck error: */
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (fsck_err_on(ret,
			trans, inode_points_to_missing_dirent,
			"inode points to missing dirent\n%s",
			(bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) ||
	    fsck_err_on(!ret && dirent_points_to_inode_nowarn(d, inode),
			trans, inode_points_to_wrong_dirent,
			"%s",
			(printbuf_reset(&buf),
			 dirent_inode_mismatch_msg(&buf, c, d, inode),
			 buf.buf))) {
		/*
		 * We just clear the backpointer fields for now. If we find a
		 * dirent that points to this inode in check_dirents(), we'll
		 * update it then; then when we get to check_path() if the
		 * backpointer is still 0 we'll reattach it.
		 */
		inode->bi_dir = 0;
		inode->bi_dir_offset = 0;
		*write_inode = true;
	}

	ret = 0;
fsck_err:
	bch2_trans_iter_exit(trans, &dirent_iter);
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}
1094 
/*
 * Find the version of inode @inum closest to the root of the snapshot tree:
 * iterating in reverse from snapshot U32_MAX, the first inode key found is
 * the one with the highest snapshot ID.  BUGs if no inode key exists at all.
 */
static int get_snapshot_root_inode(struct btree_trans *trans,
				   struct bch_inode_unpacked *root,
				   u64 inum)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes,
					     SPOS(0, inum, U32_MAX),
					     BTREE_ITER_all_snapshots, k, ret) {
		if (k.k->p.offset != inum)
			break;
		if (bkey_is_inode(k.k))
			goto found_root;
	}
	if (ret)
		goto err;
	/* Callers only pass inode numbers that are known to exist: */
	BUG();
found_root:
	ret = bch2_inode_unpack(k, root);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
1120 
/*
 * Check and repair a single inode key: hash-info consistency with the
 * snapshot root, backpointer validity, unlinked/deleted-list state,
 * has_child_snapshot flag, subvolume cross-references, and journal seq.
 * Accumulates fixes in the unpacked inode and writes it out once at the end.
 */
static int check_inode(struct btree_trans *trans,
		       struct btree_iter *iter,
		       struct bkey_s_c k,
		       struct bch_inode_unpacked *snapshot_root,
		       struct snapshots_seen *s)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	struct bch_inode_unpacked u;
	bool do_update = false;
	int ret;

	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret < 0)
		goto err;
	if (ret)
		return 0;

	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
	if (ret)
		goto err;

	if (!bkey_is_inode(k.k))
		return 0;

	ret = bch2_inode_unpack(k, &u);
	if (ret)
		goto err;

	/* Cache the snapshot root inode across calls for this inode number: */
	if (snapshot_root->bi_inum != u.bi_inum) {
		ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum);
		if (ret)
			goto err;
	}

	/* Hash seed/type must agree across all snapshot versions: */
	if (fsck_err_on(u.bi_hash_seed		!= snapshot_root->bi_hash_seed ||
			INODE_STR_HASH(&u)	!= INODE_STR_HASH(snapshot_root),
			trans, inode_snapshot_mismatch,
			"inode hash info in different snapshots don't match")) {
		u.bi_hash_seed = snapshot_root->bi_hash_seed;
		SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root));
		do_update = true;
	}

	if (u.bi_dir || u.bi_dir_offset) {
		ret = check_inode_dirent_inode(trans, &u, &do_update);
		if (ret)
			goto err;
	}

	/* An inode with a dirent pointing to it cannot be unlinked: */
	if (fsck_err_on(u.bi_dir && (u.bi_flags & BCH_INODE_unlinked),
			trans, inode_unlinked_but_has_dirent,
			"inode unlinked but has dirent\n%s",
			(printbuf_reset(&buf),
			 bch2_inode_unpacked_to_text(&buf, &u),
			 buf.buf))) {
		u.bi_flags &= ~BCH_INODE_unlinked;
		do_update = true;
	}

	if (S_ISDIR(u.bi_mode) && (u.bi_flags & BCH_INODE_unlinked)) {
		/* Check for this early so that check_unreachable_inode() will reattach it */

		ret = bch2_empty_dir_snapshot(trans, k.k->p.offset, 0, k.k->p.snapshot);
		if (ret && ret != -BCH_ERR_ENOTEMPTY_dir_not_empty)
			goto err;

		fsck_err_on(ret, trans, inode_dir_unlinked_but_not_empty,
			    "dir unlinked but not empty\n%s",
			    (printbuf_reset(&buf),
			     bch2_inode_unpacked_to_text(&buf, &u),
			     buf.buf));
		u.bi_flags &= ~BCH_INODE_unlinked;
		do_update = true;
		ret = 0;
	}

	/* Returns > 0 if child snapshots exist; must match the inode flag: */
	ret = bch2_inode_has_child_snapshots(trans, k.k->p);
	if (ret < 0)
		goto err;

	if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot),
			trans, inode_has_child_snapshots_wrong,
			"inode has_child_snapshots flag wrong (should be %u)\n%s",
			ret,
			(printbuf_reset(&buf),
			 bch2_inode_unpacked_to_text(&buf, &u),
			 buf.buf))) {
		if (ret)
			u.bi_flags |= BCH_INODE_has_child_snapshot;
		else
			u.bi_flags &= ~BCH_INODE_has_child_snapshot;
		do_update = true;
	}
	ret = 0;

	if ((u.bi_flags & BCH_INODE_unlinked) &&
	    !(u.bi_flags & BCH_INODE_has_child_snapshot)) {
		if (!test_bit(BCH_FS_started, &c->flags)) {
			/*
			 * If we're not in online fsck, don't delete unlinked
			 * inodes, just make sure they're on the deleted list.
			 *
			 * They might be referred to by a logged operation -
			 * i.e. we might have crashed in the middle of a
			 * truncate on an unlinked but open file - so we want to
			 * let the delete_dead_inodes kill it after resuming
			 * logged ops.
			 */
			ret = check_inode_deleted_list(trans, k.k->p);
			if (ret < 0)
				goto err_noprint;

			fsck_err_on(!ret,
				    trans, unlinked_inode_not_on_deleted_list,
				    "inode %llu:%u unlinked, but not on deleted list",
				    u.bi_inum, k.k->p.snapshot);

			ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, k.k->p, 1);
			if (ret)
				goto err;
		} else {
			/* Online: safe to delete now, unless the file is open: */
			ret = bch2_inode_or_descendents_is_open(trans, k.k->p);
			if (ret < 0)
				goto err;

			if (fsck_err_on(!ret,
					trans, inode_unlinked_and_not_open,
				      "inode %llu:%u unlinked and not open",
				      u.bi_inum, u.bi_snapshot)) {
				ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
				bch_err_msg(c, ret, "in fsck deleting inode");
				goto err_noprint;
			}
			ret = 0;
		}
	}

	/* Only subvolume root inodes may have a parent subvol: */
	if (fsck_err_on(u.bi_parent_subvol &&
			(u.bi_subvol == 0 ||
			 u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
			trans, inode_bi_parent_nonzero,
			"inode %llu:%u has subvol %u but nonzero parent subvol %u",
			u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
		u.bi_parent_subvol = 0;
		do_update = true;
	}

	if (u.bi_subvol) {
		struct bch_subvolume s;

		ret = bch2_subvolume_get(trans, u.bi_subvol, false, &s);
		if (ret && !bch2_err_matches(ret, ENOENT))
			goto err;

		if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
			/*
			 * NOTE(review): ret from reconstruct_subvol() is carried
			 * to do_update and may be overwritten by the inode
			 * write below - confirm an error here is intentionally
			 * non-fatal.
			 */
			ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum);
			goto do_update;
		}

		if (fsck_err_on(ret,
				trans, inode_bi_subvol_missing,
				"inode %llu:%u bi_subvol points to missing subvolume %u",
				u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
		    fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
				!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
							   k.k->p.snapshot),
				trans, inode_bi_subvol_wrong,
				"inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
				u.bi_inum, k.k->p.snapshot, u.bi_subvol,
				le64_to_cpu(s.inode),
				le32_to_cpu(s.snapshot))) {
			u.bi_subvol = 0;
			u.bi_parent_subvol = 0;
			do_update = true;
		}
	}

	if (fsck_err_on(u.bi_journal_seq > journal_cur_seq(&c->journal),
			trans, inode_journal_seq_in_future,
			"inode journal seq in future (currently at %llu)\n%s",
			journal_cur_seq(&c->journal),
			(printbuf_reset(&buf),
			 bch2_inode_unpacked_to_text(&buf, &u),
			buf.buf))) {
		u.bi_journal_seq = journal_cur_seq(&c->journal);
		do_update = true;
	}
do_update:
	if (do_update) {
		ret = __bch2_fsck_write_inode(trans, &u);
		bch_err_msg(c, ret, "in fsck updating inode");
		if (ret)
			goto err_noprint;
	}
err:
fsck_err:
	bch_err_fn(c, ret);
err_noprint:
	printbuf_exit(&buf);
	return ret;
}
1323 
/*
 * Walk every inode key in every snapshot, checking and repairing each via
 * check_inode(); snapshot_root and the snapshots_seen state persist across
 * iterations as caches.
 */
int bch2_check_inodes(struct bch_fs *c)
{
	struct bch_inode_unpacked snapshot_root = {};
	struct snapshots_seen s;

	snapshots_seen_init(&s);

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
				POS_MIN,
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_inode(trans, &iter, k, &snapshot_root, &s)));

	snapshots_seen_exit(&s);
	bch_err_fn(c, ret);
	return ret;
}
1342 
/*
 * Given an inode that needs reattaching, walk up the snapshot tree (ascending
 * snapshot IDs at the same inode number) and replace *inode with the oldest
 * ancestor version that also needs reattaching.
 */
static int find_oldest_inode_needs_reattach(struct btree_trans *trans,
					    struct bch_inode_unpacked *inode)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	/*
	 * We look for inodes to reattach in natural key order, leaves first,
	 * but we should do the reattach at the oldest version that needs to be
	 * reattached:
	 */
	for_each_btree_key_norestart(trans, iter,
				     BTREE_ID_inodes,
				     SPOS(0, inode->bi_inum, inode->bi_snapshot + 1),
				     BTREE_ITER_all_snapshots, k, ret) {
		if (k.k->p.offset != inode->bi_inum)
			break;

		/* Only versions in our snapshot lineage count: */
		if (!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, k.k->p.snapshot))
			continue;

		if (!bkey_is_inode(k.k))
			break;

		struct bch_inode_unpacked parent_inode;
		ret = bch2_inode_unpack(k, &parent_inode);
		if (ret)
			break;

		if (!inode_should_reattach(&parent_inode))
			break;

		*inode = parent_inode;
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}
1383 
/*
 * If this inode key has no dirent backpointer (and should have one), reattach
 * it - at the oldest snapshot version that needs reattaching.
 */
static int check_unreachable_inode(struct btree_trans *trans,
				   struct btree_iter *iter,
				   struct bkey_s_c k)
{
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (!bkey_is_inode(k.k))
		return 0;

	struct bch_inode_unpacked inode;
	ret = bch2_inode_unpack(k, &inode);
	if (ret)
		return ret;

	if (!inode_should_reattach(&inode))
		return 0;

	ret = find_oldest_inode_needs_reattach(trans, &inode);
	if (ret)
		return ret;

	if (fsck_err(trans, inode_unreachable,
		     "unreachable inode:\n%s",
		     (bch2_inode_unpacked_to_text(&buf, &inode),
		      buf.buf)))
		ret = reattach_inode(trans, &inode);
fsck_err:
	printbuf_exit(&buf);
	return ret;
}
1415 
1416 /*
1417  * Reattach unreachable (but not unlinked) inodes
1418  *
1419  * Run after check_inodes() and check_dirents(), so we node that inode
1420  * backpointer fields point to valid dirents, and every inode that has a dirent
1421  * that points to it has its backpointer field set - so we're just looking for
1422  * non-unlinked inodes without backpointers:
1423  *
1424  * XXX: this is racy w.r.t. hardlink removal in online fsck
1425  */
int bch2_check_unreachable_inodes(struct bch_fs *c)
{
	/* Walk all inode keys in all snapshots, reattaching as needed: */
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
				POS_MIN,
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_unreachable_inode(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}
1437 
btree_matches_i_mode(enum btree_id btree,unsigned mode)1438 static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode)
1439 {
1440 	switch (btree) {
1441 	case BTREE_ID_extents:
1442 		return S_ISREG(mode) || S_ISLNK(mode);
1443 	case BTREE_ID_dirents:
1444 		return S_ISDIR(mode);
1445 	case BTREE_ID_xattrs:
1446 		return true;
1447 	default:
1448 		BUG();
1449 	}
1450 }
1451 
/*
 * Verify that key @k belongs to an existing inode of a compatible mode;
 * reconstruct the inode if the inodes btree was lost, otherwise delete
 * orphaned or wrong-mode keys.
 */
static int check_key_has_inode(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct inode_walker *inode,
			       struct inode_walker_entry *i,
			       struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	/* @i may carry an error from walk_inode()/lookup: */
	int ret = PTR_ERR_OR_ZERO(i);
	if (ret)
		return ret;

	if (k.k->type == KEY_TYPE_whiteout)
		goto out;

	if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
		ret =   reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?:
			bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
		if (ret)
			goto err;

		/* Force the walker to re-read inodes for this inode number: */
		inode->last_pos.inode--;
		ret = -BCH_ERR_transaction_restart_nested;
		goto err;
	}

	if (fsck_err_on(!i,
			trans, key_in_missing_inode,
			"key in missing inode:\n  %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
		goto delete;

	if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode),
			trans, key_in_wrong_inode_type,
			"key for wrong inode mode %o:\n  %s",
			i->inode.bi_mode,
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
		goto delete;
out:
err:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
delete:
	ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
	goto out;
}
1502 
/*
 * Compare each inode version's bi_sectors against the sector count
 * accumulated while walking extents; recount on mismatch and rewrite the
 * inode if it's genuinely wrong (and not marked i_sectors_dirty).
 */
static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
{
	struct bch_fs *c = trans->c;
	int ret = 0;
	s64 count2;

	darray_for_each(w->inodes, i) {
		if (i->inode.bi_sectors == i->count)
			continue;

		/* Double check: recount from the extents btree: */
		count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);

		if (w->recalculate_sums)
			i->count = count2;

		if (i->count != count2) {
			bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
					    w->last_pos.inode, i->snapshot, i->count, count2);
			i->count = count2;
		}

		if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
				trans, inode_i_sectors_wrong,
				"inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
				w->last_pos.inode, i->snapshot,
				i->inode.bi_sectors, i->count)) {
			i->inode.bi_sectors = i->count;
			ret = bch2_fsck_write_inode(trans, &i->inode);
			if (ret)
				break;
		}
	}
fsck_err:
	bch_err_fn(c, ret);
	return ret;
}
1539 
check_i_sectors(struct btree_trans * trans,struct inode_walker * w)1540 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
1541 {
1542 	u32 restart_count = trans->restart_count;
1543 	return check_i_sectors_notnested(trans, w) ?:
1544 		trans_was_restarted(trans, restart_count);
1545 }
1546 
/* End position of an extent, plus the snapshots it was visible in: */
struct extent_end {
	u32			snapshot;
	u64			offset;
	struct snapshots_seen	seen;
};

/* Per-snapshot most-recently-seen extent ends, for overlap detection: */
struct extent_ends {
	/* Last key position checked - used to detect transaction restarts */
	struct bpos			last_pos;
	DARRAY(struct extent_end)	e;
};
1557 
/* Free each entry's snapshots_seen copy and empty the list: */
static void extent_ends_reset(struct extent_ends *extent_ends)
{
	darray_for_each(extent_ends->e, i)
		snapshots_seen_exit(&i->seen);
	extent_ends->e.nr = 0;
}
1564 
/* Release all memory held by @extent_ends: */
static void extent_ends_exit(struct extent_ends *extent_ends)
{
	extent_ends_reset(extent_ends);
	darray_exit(&extent_ends->e);
}
1570 
extent_ends_init(struct extent_ends * extent_ends)1571 static void extent_ends_init(struct extent_ends *extent_ends)
1572 {
1573 	memset(extent_ends, 0, sizeof(*extent_ends));
1574 }
1575 
/*
 * Record the end position of extent @k, replacing any previous entry for the
 * same snapshot; the list is kept sorted by snapshot ID.  Takes a private
 * copy of @seen's snapshot list so the stored entry owns its memory.
 */
static int extent_ends_at(struct bch_fs *c,
			  struct extent_ends *extent_ends,
			  struct snapshots_seen *seen,
			  struct bkey_s_c k)
{
	struct extent_end *i, n = (struct extent_end) {
		.offset		= k.k->p.offset,
		.snapshot	= k.k->p.snapshot,
		.seen		= *seen,
	};

	n.seen.ids.data = kmemdup(seen->ids.data,
			      sizeof(seen->ids.data[0]) * seen->ids.size,
			      GFP_KERNEL);
	if (!n.seen.ids.data)
		return -BCH_ERR_ENOMEM_fsck_extent_ends_at;

	__darray_for_each(extent_ends->e, i) {
		/* Same snapshot: replace in place, freeing the old copy: */
		if (i->snapshot == k.k->p.snapshot) {
			snapshots_seen_exit(&i->seen);
			*i = n;
			return 0;
		}

		if (i->snapshot >= k.k->p.snapshot)
			break;
	}

	return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
}
1606 
/*
 * Repair a pair of overlapping extents: re-look up both keys (we only had
 * their positions), then overwrite whichever is in the older snapshot -
 * preferring to trim/delete the first, since that's the one ending at the
 * wrong offset.
 *
 * Fix in this update: the second lookup-failure message read "seconding
 * overlapping extent"; corrected to "second overlapping extent".
 */
static int overlapping_extents_found(struct btree_trans *trans,
				     enum btree_id btree,
				     struct bpos pos1, struct snapshots_seen *pos1_seen,
				     struct bkey pos2,
				     bool *fixed,
				     struct extent_end *extent_end)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	struct btree_iter iter1, iter2 = { NULL };
	struct bkey_s_c k1, k2;
	int ret;

	BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));

	bch2_trans_iter_init(trans, &iter1, btree, pos1,
			     BTREE_ITER_all_snapshots|
			     BTREE_ITER_not_extents);
	k1 = bch2_btree_iter_peek_max(&iter1, POS(pos1.inode, U64_MAX));
	ret = bkey_err(k1);
	if (ret)
		goto err;

	prt_str(&buf, "\n  ");
	bch2_bkey_val_to_text(&buf, c, k1);

	/* The first extent must still be exactly where we recorded it: */
	if (!bpos_eq(pos1, k1.k->p)) {
		prt_str(&buf, "\n  wanted\n  ");
		bch2_bpos_to_text(&buf, pos1);
		prt_str(&buf, "\n  ");
		bch2_bkey_to_text(&buf, &pos2);

		bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
			__func__, buf.buf);
		ret = -BCH_ERR_internal_fsck_err;
		goto err;
	}

	bch2_trans_copy_iter(&iter2, &iter1);

	/* Advance past the first extent to find the second: */
	while (1) {
		bch2_btree_iter_advance(&iter2);

		k2 = bch2_btree_iter_peek_max(&iter2, POS(pos1.inode, U64_MAX));
		ret = bkey_err(k2);
		if (ret)
			goto err;

		if (bpos_ge(k2.k->p, pos2.p))
			break;
	}

	prt_str(&buf, "\n  ");
	bch2_bkey_val_to_text(&buf, c, k2);

	if (bpos_gt(k2.k->p, pos2.p) ||
	    pos2.size != k2.k->size) {
		bch_err(c, "%s: error finding second overlapping extent when repairing%s",
			__func__, buf.buf);
		ret = -BCH_ERR_internal_fsck_err;
		goto err;
	}

	prt_printf(&buf, "\n  overwriting %s extent",
		   pos1.snapshot >= pos2.p.snapshot ? "first" : "second");

	if (fsck_err(trans, extent_overlapping,
		     "overlapping extents%s", buf.buf)) {
		struct btree_iter *old_iter = &iter1;
		struct disk_reservation res = { 0 };

		/* Overwrite the extent in the older (higher-ID) snapshot: */
		if (pos1.snapshot < pos2.p.snapshot) {
			old_iter = &iter2;
			swap(k1, k2);
		}

		trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);

		ret =   bch2_trans_update_extent_overwrite(trans, old_iter,
				BTREE_UPDATE_internal_snapshot_node,
				k1, k2) ?:
			bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
		bch2_disk_reservation_put(c, &res);

		if (ret)
			goto err;

		*fixed = true;

		if (pos1.snapshot == pos2.p.snapshot) {
			/*
			 * We overwrote the first extent, and did the overwrite
			 * in the same snapshot:
			 */
			extent_end->offset = bkey_start_offset(&pos2);
		} else if (pos1.snapshot > pos2.p.snapshot) {
			/*
			 * We overwrote the first extent in pos2's snapshot:
			 */
			ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
		} else {
			/*
			 * We overwrote the second extent - restart
			 * check_extent() from the top:
			 */
			ret = -BCH_ERR_transaction_restart_nested;
		}
	}
fsck_err:
err:
	bch2_trans_iter_exit(trans, &iter2);
	bch2_trans_iter_exit(trans, &iter1);
	printbuf_exit(&buf);
	return ret;
}
1722 
/*
 * Check extent @k against all previously-recorded extent ends for this inode:
 * any recorded end past k's start, in a snapshot visible from k's, is an
 * overlap to be repaired.
 */
static int check_overlapping_extents(struct btree_trans *trans,
			      struct snapshots_seen *seen,
			      struct extent_ends *extent_ends,
			      struct bkey_s_c k,
			      struct btree_iter *iter,
			      bool *fixed)
{
	struct bch_fs *c = trans->c;
	int ret = 0;

	/* transaction restart, running again */
	if (bpos_eq(extent_ends->last_pos, k.k->p))
		return 0;

	/* Recorded ends only apply within one inode: */
	if (extent_ends->last_pos.inode != k.k->p.inode)
		extent_ends_reset(extent_ends);

	darray_for_each(extent_ends->e, i) {
		if (i->offset <= bkey_start_offset(k.k))
			continue;

		if (!ref_visible2(c,
				  k.k->p.snapshot, seen,
				  i->snapshot, &i->seen))
			continue;

		ret = overlapping_extents_found(trans, iter->btree_id,
						SPOS(iter->pos.inode,
						     i->offset,
						     i->snapshot),
						&i->seen,
						*k.k, fixed, i);
		if (ret)
			goto err;
	}

	extent_ends->last_pos = k.k->p;
err:
	return ret;
}
1763 
/*
 * Report (but don't repair - always returns 0) extents with an encoded
 * (checksummed/compressed) size exceeding encoded_extent_max.
 */
static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
				struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	struct bch_extent_crc_unpacked crc;
	const union bch_extent_entry *i;
	/* opts.encoded_extent_max is in bytes; convert to sectors: */
	unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;

	bkey_for_each_crc(k.k, ptrs, crc, i)
		if (crc_is_encoded(crc) &&
		    crc.uncompressed_size > encoded_extent_max_sectors) {
			struct printbuf buf = PRINTBUF;

			bch2_bkey_val_to_text(&buf, c, k);
			bch_err(c, "overbig encoded extent, please report this:\n  %s", buf.buf);
			printbuf_exit(&buf);
		}

	return 0;
}
1785 
/*
 * Check a single extent key: valid snapshot, owning inode exists and is the
 * right type, no overlaps, not past i_size; accumulate i_sectors counts in
 * the inode walker and record this extent's end for later overlap checks.
 */
static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
			struct bkey_s_c k,
			struct inode_walker *inode,
			struct snapshots_seen *s,
			struct extent_ends *extent_ends,
			struct disk_reservation *res)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret) {
		/* > 0 means the key was deleted - not an error: */
		ret = ret < 0 ? ret : 0;
		goto out;
	}

	/* Moving to a new inode: settle i_sectors for the previous one: */
	if (inode->last_pos.inode != k.k->p.inode && inode->have_inodes) {
		ret = check_i_sectors(trans, inode);
		if (ret)
			goto err;
	}

	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
	if (ret)
		goto err;

	struct inode_walker_entry *extent_i = walk_inode(trans, inode, k);
	ret = PTR_ERR_OR_ZERO(extent_i);
	if (ret)
		goto err;

	ret = check_key_has_inode(trans, iter, inode, extent_i, k);
	if (ret)
		goto err;

	if (k.k->type != KEY_TYPE_whiteout) {
		ret = check_overlapping_extents(trans, s, extent_ends, k, iter,
						&inode->recalculate_sums);
		if (ret)
			goto err;

		/*
		 * Check inodes in reverse order, from oldest snapshots to
		 * newest, starting from the inode that matches this extent's
		 * snapshot. If we didn't have one, iterate over all inodes:
		 */
		for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
		     inode->inodes.data && i >= inode->inodes.data;
		     --i) {
			if (i->snapshot > k.k->p.snapshot ||
			    !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
				continue;

			/* Extents past i_size (except reservations) get deleted: */
			if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
					!bkey_extent_is_reservation(k),
					trans, extent_past_end_of_inode,
					"extent type past end of inode %llu:%u, i_size %llu\n  %s",
					i->inode.bi_inum, i->snapshot, i->inode.bi_size,
					(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
				struct btree_iter iter2;

				bch2_trans_copy_iter(&iter2, iter);
				bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
				ret =   bch2_btree_iter_traverse(&iter2) ?:
					bch2_btree_delete_at(trans, &iter2,
						BTREE_UPDATE_internal_snapshot_node);
				bch2_trans_iter_exit(trans, &iter2);
				if (ret)
					goto err;

				iter->k.type = KEY_TYPE_whiteout;
				break;
			}
		}
	}

	ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc);
	if (ret)
		goto err;

	/* Accumulate sectors into every inode version this extent counts for: */
	if (bkey_extent_is_allocation(k.k)) {
		for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
		     inode->inodes.data && i >= inode->inodes.data;
		     --i) {
			if (i->snapshot > k.k->p.snapshot ||
			    !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
				continue;

			i->count += k.k->size;
		}
	}

	if (k.k->type != KEY_TYPE_whiteout) {
		ret = extent_ends_at(c, extent_ends, s, k);
		if (ret)
			goto err;
	}
out:
err:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}
1891 
1892 /*
1893  * Walk extents: verify that extents have a corresponding S_ISREG inode, and
1894  * that i_size an i_sectors are consistent
1895  */
int bch2_check_extents(struct bch_fs *c)
{
	struct inode_walker w = inode_walker_init();
	struct snapshots_seen s;
	struct extent_ends extent_ends;
	struct disk_reservation res = { 0 };

	snapshots_seen_init(&s);
	extent_ends_init(&extent_ends);

	/* Release the previous iteration's reservation before each key: */
	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_extents,
				POS(BCACHEFS_ROOT_INO, 0),
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
			bch2_disk_reservation_put(c, &res);
			check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?:
			check_extent_overbig(trans, &iter, k);
		})) ?:
		/* Settle i_sectors for the final inode walked: */
		check_i_sectors_notnested(trans, &w));

	bch2_disk_reservation_put(c, &res);
	extent_ends_exit(&extent_ends);
	inode_walker_exit(&w);
	snapshots_seen_exit(&s);

	bch_err_fn(c, ret);
	return ret;
}
1924 
/* Walk the reflink btree, checking only for oversized (overbig) extents: */
int bch2_check_indirect_extents(struct bch_fs *c)
{
	struct disk_reservation res = { 0 };

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
				POS_MIN,
				BTREE_ITER_prefetch, k,
				&res, NULL,
				BCH_TRANS_COMMIT_no_enospc, ({
			/* drop the reservation taken by the previous iteration */
			bch2_disk_reservation_put(c, &res);
			check_extent_overbig(trans, &iter, k);
		})));

	bch2_disk_reservation_put(c, &res);
	bch_err_fn(c, ret);
	return ret;
}
1943 
/*
 * Verify each walked directory inode's i_nlink against the subdirectory
 * count accumulated while walking dirents; recount on mismatch and repair
 * i_nlink if it's still wrong.
 */
static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w)
{
	struct bch_fs *c = trans->c;
	int ret = 0;
	s64 count2;

	darray_for_each(w->inodes, i) {
		if (i->inode.bi_nlink == i->count)
			continue;

		/* Double-check fsck's accumulated count with a fresh recount: */
		count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot);
		if (count2 < 0)
			return count2;

		if (i->count != count2) {
			bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
					    w->last_pos.inode, i->snapshot, i->count, count2);
			i->count = count2;
			if (i->inode.bi_nlink == i->count)
				continue;
		}

		if (fsck_err_on(i->inode.bi_nlink != i->count,
				trans, inode_dir_wrong_nlink,
				"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
				w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
			i->inode.bi_nlink = i->count;
			ret = bch2_fsck_write_inode(trans, &i->inode);
			if (ret)
				break;
		}
	}
fsck_err:
	bch_err_fn(c, ret);
	return ret;
}
1980 
check_subdir_dirents_count(struct btree_trans * trans,struct inode_walker * w)1981 static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_walker *w)
1982 {
1983 	u32 restart_count = trans->restart_count;
1984 	return check_subdir_count_notnested(trans, w) ?:
1985 		trans_was_restarted(trans, restart_count);
1986 }
1987 
/*
 * Check that inode @target, which dirent @d points to, has a backpointer
 * (bi_dir/bi_dir_offset) pointing back at @d; repair missing or stale
 * backpointers, and flag illegal multiple links to directories/subvolumes.
 */
noinline_for_stack
static int check_dirent_inode_dirent(struct btree_trans *trans,
				   struct btree_iter *iter,
				   struct bkey_s_c_dirent d,
				   struct bch_inode_unpacked *target)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	struct btree_iter bp_iter = { NULL };
	int ret = 0;

	/* Nothing to do if the backpointer already matches this dirent: */
	if (inode_points_to_dirent(target, d))
		return 0;

	if (!target->bi_dir &&
	    !target->bi_dir_offset) {
		/* Inode has no backpointer at all - adopt this dirent: */
		fsck_err_on(S_ISDIR(target->bi_mode),
			    trans, inode_dir_missing_backpointer,
			    "directory with missing backpointer\n%s",
			    (printbuf_reset(&buf),
			     bch2_bkey_val_to_text(&buf, c, d.s_c),
			     prt_printf(&buf, "\n"),
			     bch2_inode_unpacked_to_text(&buf, target),
			     buf.buf));

		fsck_err_on(target->bi_flags & BCH_INODE_unlinked,
			    trans, inode_unlinked_but_has_dirent,
			    "inode unlinked but has dirent\n%s",
			    (printbuf_reset(&buf),
			     bch2_bkey_val_to_text(&buf, c, d.s_c),
			     prt_printf(&buf, "\n"),
			     bch2_inode_unpacked_to_text(&buf, target),
			     buf.buf));

		target->bi_flags &= ~BCH_INODE_unlinked;
		target->bi_dir		= d.k->p.inode;
		target->bi_dir_offset	= d.k->p.offset;
		return __bch2_fsck_write_inode(trans, target);
	}

	if (bch2_inode_should_have_single_bp(target) &&
	    !fsck_err(trans, inode_wrong_backpointer,
		      "dirent points to inode that does not point back:\n  %s",
		      (bch2_bkey_val_to_text(&buf, c, d.s_c),
		       prt_printf(&buf, "\n  "),
		       bch2_inode_unpacked_to_text(&buf, target),
		       buf.buf)))
		goto err;

	/* Look up the dirent the inode's existing backpointer refers to: */
	struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
			      SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot));
	ret = bkey_err(bp_dirent);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	bool backpointer_exists = !ret;
	ret = 0;

	if (fsck_err_on(!backpointer_exists,
			trans, inode_wrong_backpointer,
			"inode %llu:%u has wrong backpointer:\n"
			"got       %llu:%llu\n"
			"should be %llu:%llu",
			target->bi_inum, target->bi_snapshot,
			target->bi_dir,
			target->bi_dir_offset,
			d.k->p.inode,
			d.k->p.offset)) {
		/* Backpointer is dangling: repoint it at this dirent */
		target->bi_dir		= d.k->p.inode;
		target->bi_dir_offset	= d.k->p.offset;
		ret = __bch2_fsck_write_inode(trans, target);
		goto out;
	}

	bch2_bkey_val_to_text(&buf, c, d.s_c);
	prt_newline(&buf);
	if (backpointer_exists)
		bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);

	/* Directories and subvolumes may only have a single link: */
	if (fsck_err_on(backpointer_exists &&
			(S_ISDIR(target->bi_mode) ||
			 target->bi_subvol),
			trans, inode_dir_multiple_links,
			"%s %llu:%u with multiple links\n%s",
			S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
			target->bi_inum, target->bi_snapshot, buf.buf)) {
		ret = __remove_dirent(trans, d.k->p);
		goto out;
	}

	/*
	 * hardlinked file with nlink 0:
	 * We're just adjusting nlink here so check_nlinks() will pick
	 * it up, it ignores inodes with nlink 0
	 */
	if (fsck_err_on(backpointer_exists && !target->bi_nlink,
			trans, inode_multiple_links_but_nlink_0,
			"inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
			target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
		target->bi_nlink++;
		target->bi_flags &= ~BCH_INODE_unlinked;
		ret = __bch2_fsck_write_inode(trans, target);
		if (ret)
			goto err;
	}
out:
err:
fsck_err:
	bch2_trans_iter_exit(trans, &bp_iter);
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}
2101 
/*
 * Check dirent @d against inode @target: ensure the inode's backpointer
 * matches @d, and that @d's d_type matches the inode's mode, rewriting the
 * dirent if it doesn't.
 */
noinline_for_stack
static int check_dirent_target(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct bkey_s_c_dirent d,
			       struct bch_inode_unpacked *target)
{
	struct bch_fs *c = trans->c;
	struct bkey_i_dirent *n;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	ret = check_dirent_inode_dirent(trans, iter, d, target);
	if (ret)
		goto err;

	if (fsck_err_on(d.v->d_type != inode_d_type(target),
			trans, dirent_d_type_wrong,
			"incorrect d_type: got %s, should be %s:\n%s",
			bch2_d_type_str(d.v->d_type),
			bch2_d_type_str(inode_d_type(target)),
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
		n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
		ret = PTR_ERR_OR_ZERO(n);
		if (ret)
			goto err;

		bkey_reassemble(&n->k_i, d.s_c);
		n->v.d_type = inode_d_type(target);
		/* Subvol dirents carry subvol IDs instead of an inode number: */
		if (n->v.d_type == DT_SUBVOL) {
			n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
			n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
		} else {
			n->v.d_inum = cpu_to_le64(target->bi_inum);
		}

		ret = bch2_trans_update(trans, iter, &n->k_i, 0);
		if (ret)
			goto err;

		/* d now refers to the updated dirent */
		d = dirent_i_to_s_c(n);
	}
err:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}
2150 
2151 /* find a subvolume that's a descendent of @snapshot: */
/*
 * Scan the subvolumes btree for a subvolume whose snapshot is a descendent
 * of @snapshot, returning its ID in @subvolid; -ENOENT if none exists.
 *
 * Fixes vs. previous version: the result was read from @k *after*
 * bch2_trans_iter_exit() had released the btree path @k points into, and the
 * iterator was exited a second time at the @found label.  Read the result
 * first, and exit the iterator exactly once.
 */
static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) {
		if (k.k->type != KEY_TYPE_subvolume)
			continue;

		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
		if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) {
			/* Grab the result before the iterator is exited: */
			*subvolid = k.k->p.offset;
			goto found;
		}
	}
	if (!ret)
		ret = -ENOENT;
found:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
2175 
2176 noinline_for_stack
check_dirent_to_subvol(struct btree_trans * trans,struct btree_iter * iter,struct bkey_s_c_dirent d)2177 static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
2178 				  struct bkey_s_c_dirent d)
2179 {
2180 	struct bch_fs *c = trans->c;
2181 	struct btree_iter subvol_iter = {};
2182 	struct bch_inode_unpacked subvol_root;
2183 	u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
2184 	u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
2185 	u32 parent_snapshot;
2186 	u32 new_parent_subvol = 0;
2187 	u64 parent_inum;
2188 	struct printbuf buf = PRINTBUF;
2189 	int ret = 0;
2190 
2191 	ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum);
2192 	if (ret && !bch2_err_matches(ret, ENOENT))
2193 		return ret;
2194 
2195 	if (ret ||
2196 	    (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) {
2197 		int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
2198 		if (ret2 && !bch2_err_matches(ret, ENOENT))
2199 			return ret2;
2200 	}
2201 
2202 	if (ret &&
2203 	    !new_parent_subvol &&
2204 	    (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
2205 		/*
2206 		 * Couldn't find a subvol for dirent's snapshot - but we lost
2207 		 * subvols, so we need to reconstruct:
2208 		 */
2209 		ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0);
2210 		if (ret)
2211 			return ret;
2212 
2213 		parent_snapshot = d.k->p.snapshot;
2214 	}
2215 
2216 	if (fsck_err_on(ret,
2217 			trans, dirent_to_missing_parent_subvol,
2218 			"dirent parent_subvol points to missing subvolume\n%s",
2219 			(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
2220 	    fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
2221 			trans, dirent_not_visible_in_parent_subvol,
2222 			"dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
2223 			parent_snapshot,
2224 			(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
2225 		if (!new_parent_subvol) {
2226 			bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
2227 			return -BCH_ERR_fsck_repair_unimplemented;
2228 		}
2229 
2230 		struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
2231 		ret = PTR_ERR_OR_ZERO(new_dirent);
2232 		if (ret)
2233 			goto err;
2234 
2235 		new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol);
2236 	}
2237 
2238 	struct bkey_s_c_subvolume s =
2239 		bch2_bkey_get_iter_typed(trans, &subvol_iter,
2240 					 BTREE_ID_subvolumes, POS(0, target_subvol),
2241 					 0, subvolume);
2242 	ret = bkey_err(s.s_c);
2243 	if (ret && !bch2_err_matches(ret, ENOENT))
2244 		return ret;
2245 
2246 	if (ret) {
2247 		if (fsck_err(trans, dirent_to_missing_subvol,
2248 			     "dirent points to missing subvolume\n%s",
2249 			     (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
2250 			return __remove_dirent(trans, d.k->p);
2251 		ret = 0;
2252 		goto out;
2253 	}
2254 
2255 	if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol,
2256 			trans, subvol_fs_path_parent_wrong,
2257 			"subvol with wrong fs_path_parent, should be be %u\n%s",
2258 			parent_subvol,
2259 			(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
2260 		struct bkey_i_subvolume *n =
2261 			bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
2262 		ret = PTR_ERR_OR_ZERO(n);
2263 		if (ret)
2264 			goto err;
2265 
2266 		n->v.fs_path_parent = cpu_to_le32(parent_subvol);
2267 	}
2268 
2269 	u64 target_inum = le64_to_cpu(s.v->inode);
2270 	u32 target_snapshot = le32_to_cpu(s.v->snapshot);
2271 
2272 	ret = lookup_inode(trans, target_inum, target_snapshot, &subvol_root);
2273 	if (ret && !bch2_err_matches(ret, ENOENT))
2274 		goto err;
2275 
2276 	if (ret) {
2277 		bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
2278 		ret = -BCH_ERR_fsck_repair_unimplemented;
2279 		goto err;
2280 	}
2281 
2282 	if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol,
2283 			trans, inode_bi_parent_wrong,
2284 			"subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
2285 			target_inum,
2286 			subvol_root.bi_parent_subvol, parent_subvol)) {
2287 		subvol_root.bi_parent_subvol = parent_subvol;
2288 		subvol_root.bi_snapshot = le32_to_cpu(s.v->snapshot);
2289 		ret = __bch2_fsck_write_inode(trans, &subvol_root);
2290 		if (ret)
2291 			goto err;
2292 	}
2293 
2294 	ret = check_dirent_target(trans, iter, d, &subvol_root);
2295 	if (ret)
2296 		goto err;
2297 out:
2298 err:
2299 fsck_err:
2300 	bch2_trans_iter_exit(trans, &subvol_iter);
2301 	printbuf_exit(&buf);
2302 	return ret;
2303 }
2304 
/*
 * Check a single dirent key: verify it has a valid snapshot and parent
 * directory inode, check its hash, then verify its target - either a
 * subvolume, or the set of target inodes visible in this snapshot.
 */
static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
			struct bkey_s_c k,
			struct bch_hash_info *hash_info,
			struct inode_walker *dir,
			struct inode_walker *target,
			struct snapshots_seen *s)
{
	struct bch_fs *c = trans->c;
	struct inode_walker_entry *i;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret) {
		/* positive return: key was handled/deleted, not an error */
		ret = ret < 0 ? ret : 0;
		goto out;
	}

	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
	if (ret)
		goto err;

	if (k.k->type == KEY_TYPE_whiteout)
		goto out;

	/* Moved to a new directory: verify the previous one's subdir counts */
	if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) {
		ret = check_subdir_dirents_count(trans, dir);
		if (ret)
			goto err;
	}

	i = walk_inode(trans, dir, k);
	ret = PTR_ERR_OR_ZERO(i);
	if (ret < 0)
		goto err;

	ret = check_key_has_inode(trans, iter, dir, i, k);
	if (ret)
		goto err;

	if (!i)
		goto out;

	/* Reinitialize hash_info on the first key of each directory: */
	if (dir->first_this_inode)
		*hash_info = bch2_hash_info_init(c, &i->inode);
	dir->first_this_inode = false;

	ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, iter, k);
	if (ret < 0)
		goto err;
	if (ret) {
		/* dirent has been deleted */
		ret = 0;
		goto out;
	}

	if (k.k->type != KEY_TYPE_dirent)
		goto out;

	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);

	if (d.v->d_type == DT_SUBVOL) {
		ret = check_dirent_to_subvol(trans, iter, d);
		if (ret)
			goto err;
	} else {
		ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
		if (ret)
			goto err;

		if (fsck_err_on(!target->inodes.nr,
				trans, dirent_to_missing_inode,
				"dirent points to missing inode:\n%s",
				(printbuf_reset(&buf),
				 bch2_bkey_val_to_text(&buf, c, k),
				 buf.buf))) {
			ret = __remove_dirent(trans, d.k->p);
			if (ret)
				goto err;
		}

		/* Check the dirent against every visible target inode: */
		darray_for_each(target->inodes, i) {
			ret = check_dirent_target(trans, iter, d, &i->inode);
			if (ret)
				goto err;
		}

		/* Delete dirents pointing at inodes overwritten in a snapshot: */
		darray_for_each(target->deletes, i)
			if (fsck_err_on(!snapshot_list_has_id(&s->ids, *i),
					trans, dirent_to_overwritten_inode,
					"dirent points to inode overwritten in snapshot %u:\n%s",
					*i,
					(printbuf_reset(&buf),
					 bch2_bkey_val_to_text(&buf, c, k),
					 buf.buf))) {
				struct btree_iter delete_iter;
				bch2_trans_iter_init(trans, &delete_iter,
						     BTREE_ID_dirents,
						     SPOS(k.k->p.inode, k.k->p.offset, *i),
						     BTREE_ITER_intent);
				ret =   bch2_btree_iter_traverse(&delete_iter) ?:
					bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
							  hash_info,
							  &delete_iter,
							  BTREE_UPDATE_internal_snapshot_node);
				bch2_trans_iter_exit(trans, &delete_iter);
				if (ret)
					goto err;

			}
	}

	ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
	if (ret)
		goto err;

	/* Accumulate subdir count and dirent bytes for the parent directory: */
	for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) {
		if (d.v->d_type == DT_DIR)
			i->count++;
		i->i_size += bkey_bytes(d.k);
	}
out:
err:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}
2433 
2434 /*
2435  * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
2436  * validate d_type
2437  */
int bch2_check_dirents(struct bch_fs *c)
{
	struct inode_walker dir = inode_walker_init();
	struct inode_walker target = inode_walker_init();
	struct snapshots_seen s;
	struct bch_hash_info hash_info;

	snapshots_seen_init(&s);

	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_dirents,
				POS(BCACHEFS_ROOT_INO, 0),
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
			check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
		/* flush subdir counts for the final directory walked: */
		check_subdir_count_notnested(trans, &dir));

	snapshots_seen_exit(&s);
	inode_walker_exit(&dir);
	inode_walker_exit(&target);
	bch_err_fn(c, ret);
	return ret;
}
2460 
/*
 * Check a single xattr key: verify it has a valid snapshot and a
 * corresponding inode, then check its hash.
 */
static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
		       struct bkey_s_c k,
		       struct bch_hash_info *hash_info,
		       struct inode_walker *inode)
{
	struct bch_fs *c = trans->c;
	struct inode_walker_entry *i;
	int ret;

	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret < 0)
		return ret;
	/* positive return: key was handled/deleted, not an error */
	if (ret)
		return 0;

	i = walk_inode(trans, inode, k);
	ret = PTR_ERR_OR_ZERO(i);
	if (ret)
		return ret;

	ret = check_key_has_inode(trans, iter, inode, i, k);
	if (ret)
		return ret;

	if (!i)
		return 0;

	/* Reinitialize hash_info on the first key of each inode: */
	if (inode->first_this_inode)
		*hash_info = bch2_hash_info_init(c, &i->inode);
	inode->first_this_inode = false;

	ret = bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, iter, k);
	bch_err_fn(c, ret);
	return ret;
}
2496 
2497 /*
2498  * Walk xattrs: verify that they all have a corresponding inode
2499  */
int bch2_check_xattrs(struct bch_fs *c)
{
	struct inode_walker inode = inode_walker_init();
	/* (re)initialized by check_xattr() whenever we move to a new inode */
	struct bch_hash_info hash_info;
	int ret = 0;

	ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
			POS(BCACHEFS_ROOT_INO, 0),
			BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
			k,
			NULL, NULL,
			BCH_TRANS_COMMIT_no_enospc,
		check_xattr(trans, &iter, k, &hash_info, &inode)));

	inode_walker_exit(&inode);
	bch_err_fn(c, ret);
	return ret;
}
2519 
/*
 * Verify that the root subvolume and the root directory inode exist,
 * recreating them if they're missing or invalid.
 */
static int check_root_trans(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct bch_inode_unpacked root_inode;
	u32 snapshot;
	u64 inum;
	int ret;

	ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (mustfix_fsck_err_on(ret, trans, root_subvol_missing,
				"root subvol missing")) {
		struct bkey_i_subvolume *root_subvol =
			bch2_trans_kmalloc(trans, sizeof(*root_subvol));
		ret = PTR_ERR_OR_ZERO(root_subvol);
		if (ret)
			goto err;

		snapshot	= U32_MAX;
		inum		= BCACHEFS_ROOT_INO;

		bkey_subvolume_init(&root_subvol->k_i);
		root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL;
		root_subvol->v.flags	= 0;
		root_subvol->v.snapshot	= cpu_to_le32(snapshot);
		root_subvol->v.inode	= cpu_to_le64(inum);
		ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0);
		bch_err_msg(c, ret, "writing root subvol");
		if (ret)
			goto err;
	}

	/*
	 * NOTE(review): if subvol_lookup() failed and the repair above was
	 * declined, @snapshot/@inum are used here uninitialized - confirm
	 * mustfix repairs can't be skipped on this path.
	 */
	ret = lookup_inode(trans, BCACHEFS_ROOT_INO, snapshot, &root_inode);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (mustfix_fsck_err_on(ret,
				trans, root_dir_missing,
				"root directory missing") ||
	    mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
				trans, root_inode_not_dir,
				"root inode not a directory")) {
		bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
				0, NULL);
		root_inode.bi_inum = inum;
		root_inode.bi_snapshot = snapshot;

		ret = __bch2_fsck_write_inode(trans, &root_inode);
		bch_err_msg(c, ret, "writing root inode");
	}
err:
fsck_err:
	return ret;
}
2576 
2577 /* Get root directory, create if it doesn't exist: */
int bch2_check_root(struct bch_fs *c)
{
	/* check_root_trans() runs inside a single transaction commit loop */
	int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		check_root_trans(trans));
	bch_err_fn(c, ret);
	return ret;
}
2585 
2586 typedef DARRAY(u32) darray_u32;
2587 
darray_u32_has(darray_u32 * d,u32 v)2588 static bool darray_u32_has(darray_u32 *d, u32 v)
2589 {
2590 	darray_for_each(*d, i)
2591 		if (*i == v)
2592 			return true;
2593 	return false;
2594 }
2595 
check_subvol_path(struct btree_trans * trans,struct btree_iter * iter,struct bkey_s_c k)2596 static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
2597 {
2598 	struct bch_fs *c = trans->c;
2599 	struct btree_iter parent_iter = {};
2600 	darray_u32 subvol_path = {};
2601 	struct printbuf buf = PRINTBUF;
2602 	int ret = 0;
2603 
2604 	if (k.k->type != KEY_TYPE_subvolume)
2605 		return 0;
2606 
2607 	while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) {
2608 		ret = darray_push(&subvol_path, k.k->p.offset);
2609 		if (ret)
2610 			goto err;
2611 
2612 		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
2613 
2614 		struct bch_inode_unpacked subvol_root;
2615 		ret = bch2_inode_find_by_inum_trans(trans,
2616 					(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
2617 					&subvol_root);
2618 		if (ret)
2619 			break;
2620 
2621 		u32 parent = le32_to_cpu(s.v->fs_path_parent);
2622 
2623 		if (darray_u32_has(&subvol_path, parent)) {
2624 			if (fsck_err(c, subvol_loop, "subvolume loop"))
2625 				ret = reattach_subvol(trans, s);
2626 			break;
2627 		}
2628 
2629 		bch2_trans_iter_exit(trans, &parent_iter);
2630 		bch2_trans_iter_init(trans, &parent_iter,
2631 				     BTREE_ID_subvolumes, POS(0, parent), 0);
2632 		k = bch2_btree_iter_peek_slot(&parent_iter);
2633 		ret = bkey_err(k);
2634 		if (ret)
2635 			goto err;
2636 
2637 		if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
2638 				trans, subvol_unreachable,
2639 				"unreachable subvolume %s",
2640 				(bch2_bkey_val_to_text(&buf, c, s.s_c),
2641 				 buf.buf))) {
2642 			ret = reattach_subvol(trans, s);
2643 			break;
2644 		}
2645 	}
2646 fsck_err:
2647 err:
2648 	printbuf_exit(&buf);
2649 	darray_exit(&subvol_path);
2650 	bch2_trans_iter_exit(trans, &parent_iter);
2651 	return ret;
2652 }
2653 
/* Walk all subvolumes, checking each one's path to the root subvolume: */
int bch2_check_subvolume_structure(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
				BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_subvol_path(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}
2664 
/* One step on the path from an inode up towards the subvolume root: */
struct pathbuf_entry {
	u64	inum;
	u32	snapshot;
};

typedef DARRAY(struct pathbuf_entry) pathbuf;
2671 
/*
 * Set one inode's bi_depth to @new_depth, committing the update
 * immediately; no-op if it already matches.
 */
static int bch2_bi_depth_renumber_one(struct btree_trans *trans, struct pathbuf_entry *p,
				      u32 new_depth)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
					       SPOS(0, p->inum, p->snapshot), 0);

	struct bch_inode_unpacked inode;
	int ret = bkey_err(k) ?:
		!bkey_is_inode(k.k) ? -BCH_ERR_ENOENT_inode
		: bch2_inode_unpack(k, &inode);
	if (ret)
		goto err;

	if (inode.bi_depth != new_depth) {
		inode.bi_depth = new_depth;
		ret = __bch2_fsck_write_inode(trans, &inode) ?:
			bch2_trans_commit(trans, NULL, NULL, 0);
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
2695 
/*
 * Renumber bi_depth for each inode on @path: entries were pushed leaf-first
 * while walking towards the root, so iterate in reverse (root first),
 * assigning @new_bi_depth and incrementing as we descend.
 */
static int bch2_bi_depth_renumber(struct btree_trans *trans, pathbuf *path, u32 new_bi_depth)
{
	u32 restart_count = trans->restart_count;
	int ret = 0;

	darray_for_each_reverse(*path, i) {
		ret = nested_lockrestart_do(trans,
				bch2_bi_depth_renumber_one(trans, i, new_bi_depth));
		bch_err_fn(trans->c, ret);
		if (ret)
			break;

		new_bi_depth++;
	}

	return ret ?: trans_was_restarted(trans, restart_count);
}
2713 
path_is_dup(pathbuf * p,u64 inum,u32 snapshot)2714 static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
2715 {
2716 	darray_for_each(*p, i)
2717 		if (i->inum	== inum &&
2718 		    i->snapshot	== snapshot)
2719 			return true;
2720 	return false;
2721 }
2722 
check_path_loop(struct btree_trans * trans,struct bkey_s_c inode_k)2723 static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k)
2724 {
2725 	struct bch_fs *c = trans->c;
2726 	struct btree_iter inode_iter = {};
2727 	pathbuf path = {};
2728 	struct printbuf buf = PRINTBUF;
2729 	u32 snapshot = inode_k.k->p.snapshot;
2730 	bool redo_bi_depth = false;
2731 	u32 min_bi_depth = U32_MAX;
2732 	int ret = 0;
2733 
2734 	struct bch_inode_unpacked inode;
2735 	ret = bch2_inode_unpack(inode_k, &inode);
2736 	if (ret)
2737 		return ret;
2738 
2739 	while (!inode.bi_subvol) {
2740 		struct btree_iter dirent_iter;
2741 		struct bkey_s_c_dirent d;
2742 		u32 parent_snapshot = snapshot;
2743 
2744 		d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot);
2745 		ret = bkey_err(d.s_c);
2746 		if (ret && !bch2_err_matches(ret, ENOENT))
2747 			goto out;
2748 
2749 		if (!ret && (ret = dirent_points_to_inode(c, d, &inode)))
2750 			bch2_trans_iter_exit(trans, &dirent_iter);
2751 
2752 		if (bch2_err_matches(ret, ENOENT)) {
2753 			printbuf_reset(&buf);
2754 			bch2_bkey_val_to_text(&buf, c, inode_k);
2755 			bch_err(c, "unreachable inode in check_directory_structure: %s\n%s",
2756 				bch2_err_str(ret), buf.buf);
2757 			goto out;
2758 		}
2759 
2760 		bch2_trans_iter_exit(trans, &dirent_iter);
2761 
2762 		ret = darray_push(&path, ((struct pathbuf_entry) {
2763 			.inum		= inode.bi_inum,
2764 			.snapshot	= snapshot,
2765 		}));
2766 		if (ret)
2767 			return ret;
2768 
2769 		snapshot = parent_snapshot;
2770 
2771 		bch2_trans_iter_exit(trans, &inode_iter);
2772 		inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
2773 					     SPOS(0, inode.bi_dir, snapshot), 0);
2774 
2775 		struct bch_inode_unpacked parent_inode;
2776 		ret = bkey_err(inode_k) ?:
2777 			!bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
2778 			: bch2_inode_unpack(inode_k, &parent_inode);
2779 		if (ret) {
2780 			/* Should have been caught in dirents pass */
2781 			bch_err_msg(c, ret, "error looking up parent directory");
2782 			goto out;
2783 		}
2784 
2785 		min_bi_depth = parent_inode.bi_depth;
2786 
2787 		if (parent_inode.bi_depth < inode.bi_depth &&
2788 		    min_bi_depth < U16_MAX)
2789 			break;
2790 
2791 		inode = parent_inode;
2792 		snapshot = inode_k.k->p.snapshot;
2793 		redo_bi_depth = true;
2794 
2795 		if (path_is_dup(&path, inode.bi_inum, snapshot)) {
2796 			/* XXX print path */
2797 			bch_err(c, "directory structure loop");
2798 
2799 			darray_for_each(path, i)
2800 				pr_err("%llu:%u", i->inum, i->snapshot);
2801 			pr_err("%llu:%u", inode.bi_inum, snapshot);
2802 
2803 			if (fsck_err(trans, dir_loop, "directory structure loop")) {
2804 				ret = remove_backpointer(trans, &inode);
2805 				bch_err_msg(c, ret, "removing dirent");
2806 				if (ret)
2807 					break;
2808 
2809 				ret = reattach_inode(trans, &inode);
2810 				bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
2811 			}
2812 
2813 			goto out;
2814 		}
2815 	}
2816 
2817 	if (inode.bi_subvol)
2818 		min_bi_depth = 0;
2819 
2820 	if (redo_bi_depth)
2821 		ret = bch2_bi_depth_renumber(trans, &path, min_bi_depth);
2822 out:
2823 fsck_err:
2824 	bch2_trans_iter_exit(trans, &inode_iter);
2825 	darray_exit(&path);
2826 	printbuf_exit(&buf);
2827 	bch_err_fn(c, ret);
2828 	return ret;
2829 }
2830 
2831 /*
2832  * Check for loops in the directory structure: all other connectivity issues
2833  * have been fixed by prior passes
2834  */
int bch2_check_directory_structure(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
					  BTREE_ITER_intent|
					  BTREE_ITER_prefetch|
					  BTREE_ITER_all_snapshots, k,
					  NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
			/* Only directories can participate in path loops: */
			if (!S_ISDIR(bkey_inode_mode(k)))
				continue;

			/* Unlinked inodes have no path to check: */
			if (bch2_inode_flags(k) & BCH_INODE_unlinked)
				continue;

			check_path_loop(trans, k);
		})));

	bch_err_fn(c, ret);
	return ret;
}
2855 
/*
 * In-memory table of link counts, kept sorted by inode number; built and
 * consumed by the nlink checking passes below.
 */
struct nlink_table {
	size_t		nr;	/* entries in use */
	size_t		size;	/* allocated capacity */

	struct nlink {
		u64	inum;
		u32	snapshot;
		u32	count;	/* dirent references seen (see inc_link()) */
	}		*d;
};
2866 
/*
 * Append an (inum, snapshot) entry to @t, doubling the table's allocation
 * when full; count starts at zero.
 */
static int add_nlink(struct bch_fs *c, struct nlink_table *t,
		     u64 inum, u32 snapshot)
{
	if (t->nr == t->size) {
		size_t new_size = max_t(size_t, 128UL, t->size * 2);
		void *new_d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);

		if (!new_d) {
			bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
				new_size);
			return -BCH_ERR_ENOMEM_fsck_add_nlink;
		}

		if (t->d)
			memcpy(new_d, t->d, t->size * sizeof(t->d[0]));
		kvfree(t->d);

		t->d = new_d;
		t->size = new_size;
	}

	t->d[t->nr++] = (struct nlink) {
		.inum		= inum,
		.snapshot	= snapshot,
	};

	return 0;
}
2896 
nlink_cmp(const void * _l,const void * _r)2897 static int nlink_cmp(const void *_l, const void *_r)
2898 {
2899 	const struct nlink *l = _l;
2900 	const struct nlink *r = _r;
2901 
2902 	return cmp_int(l->inum, r->inum);
2903 }
2904 
/*
 * Credit one dirent reference to @inum (seen in snapshot @snapshot)
 * against the nlink table, if @inum is within the range of inodes
 * currently loaded in @links.
 */
static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
		     struct nlink_table *links,
		     u64 range_start, u64 range_end, u64 inum, u32 snapshot)
{
	struct nlink *link, key = {
		.inum = inum, .snapshot = U32_MAX,
	};

	if (inum < range_start || inum >= range_end)
		return;

	/* nlink_cmp() compares inum only, so this lands on some entry for @inum: */
	link = __inline_bsearch(&key, links->d, links->nr,
				sizeof(links->d[0]), nlink_cmp);
	if (!link)
		return;

	/* rewind to the first per-snapshot entry for this inum: */
	while (link > links->d && link[0].inum == link[-1].inum)
		--link;

	/*
	 * Bump the count on every entry visible from the dirent's snapshot,
	 * stopping once we've credited an entry at or above @snapshot.
	 * NOTE(review): the early break presumes entries within an inum run
	 * are ordered by snapshot (btree iteration order) - confirm against
	 * check_nlinks_find_hardlinks().
	 */
	for (; link < links->d + links->nr && link->inum == inum; link++)
		if (ref_visible(c, s, snapshot, link->snapshot)) {
			link->count++;
			if (link->snapshot >= snapshot)
				break;
		}
}
2931 
/*
 * nlink pass 1: walk the inodes btree from @start, recording every inode
 * that could be hardlinked (non-directory with nonzero bi_nlink) in @t.
 * If the table runs out of memory we stop early, storing the resume
 * position in *end so the caller can process [start, *end) now and
 * continue from *end on the next iteration.
 */
noinline_for_stack
static int check_nlinks_find_hardlinks(struct bch_fs *c,
				       struct nlink_table *t,
				       u64 start, u64 *end)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_inodes,
				   POS(0, start),
				   BTREE_ITER_intent|
				   BTREE_ITER_prefetch|
				   BTREE_ITER_all_snapshots, k, ({
			if (!bkey_is_inode(k.k))
				continue;

			/* Should never fail, checked by bch2_inode_invalid: */
			struct bch_inode_unpacked u;
			/* _ret3 is the iteration macro's per-loop error slot */
			_ret3 = bch2_inode_unpack(k, &u);
			if (_ret3)
				break;

			/*
			 * Backpointer and directory structure checks are sufficient for
			 * directories, since they can't have hardlinks:
			 */
			if (S_ISDIR(u.bi_mode))
				continue;

			/*
			 * Previous passes ensured that bi_nlink is nonzero if
			 * it had multiple hardlinks:
			 */
			if (!u.bi_nlink)
				continue;

			ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
			if (ret) {
				/* table full: record resume point, not an error */
				*end = k.k->p.offset;
				ret = 0;
				break;
			}
			0;
		})));

	bch_err_fn(c, ret);
	return ret;
}
2978 
/*
 * nlink pass 2: walk all dirents, counting in @links every reference to
 * an inode in [range_start, range_end).  Directory and subvolume dirents
 * are skipped - only regular hardlink-capable targets are counted here.
 */
noinline_for_stack
static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
				     u64 range_start, u64 range_end)
{
	struct snapshots_seen s;

	snapshots_seen_init(&s);

	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
				   BTREE_ITER_intent|
				   BTREE_ITER_prefetch|
				   BTREE_ITER_all_snapshots, k, ({
			/* track snapshot visibility for ref_visible() in inc_link(): */
			ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
			if (ret)
				break;

			if (k.k->type == KEY_TYPE_dirent) {
				struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);

				if (d.v->d_type != DT_DIR &&
				    d.v->d_type != DT_SUBVOL)
					inc_link(c, &s, links, range_start, range_end,
						 le64_to_cpu(d.v->d_inum), d.k->p.snapshot);
			}
			0;
		})));

	snapshots_seen_exit(&s);

	bch_err_fn(c, ret);
	return ret;
}
3012 
/*
 * Compare one inode's stored link count against the count collected in
 * @links, prompting to rewrite the inode on mismatch.  *idx is the
 * caller's cursor into @links, advanced as we go.
 *
 * Returns 1 to terminate iteration once past @range_end, otherwise 0 or
 * an error.
 */
static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
				     struct bkey_s_c k,
				     struct nlink_table *links,
				     size_t *idx, u64 range_end)
{
	struct bch_inode_unpacked u;
	struct nlink *link = &links->d[*idx];
	int ret = 0;

	if (k.k->p.offset >= range_end)
		return 1;

	if (!bkey_is_inode(k.k))
		return 0;

	ret = bch2_inode_unpack(k, &u);
	if (ret)
		return ret;

	/* directories and zero-nlink inodes weren't added to the table: */
	if (S_ISDIR(u.bi_mode))
		return 0;

	if (!u.bi_nlink)
		return 0;

	/*
	 * Advance the cursor to this inode's entry; both the table and the
	 * inodes btree are in (inum, snapshot) order, so the entry must be
	 * at or ahead of *idx:
	 */
	while ((cmp_int(link->inum, k.k->p.offset) ?:
		cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
		BUG_ON(*idx == links->nr);
		link = &links->d[++*idx];
	}

	if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
			trans, inode_wrong_nlink,
			"inode %llu type %s has wrong i_nlink (%u, should be %u)",
			u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
			bch2_inode_nlink_get(&u), link->count)) {
		bch2_inode_nlink_set(&u, link->count);
		ret = __bch2_fsck_write_inode(trans, &u);
	}
fsck_err:
	return ret;
}
3055 
/*
 * nlink pass 3: re-walk the inodes in [range_start, range_end) and fix
 * any whose i_nlink disagrees with the counts gathered in @links.
 */
noinline_for_stack
static int check_nlinks_update_hardlinks(struct bch_fs *c,
			       struct nlink_table *links,
			       u64 range_start, u64 range_end)
{
	size_t idx = 0;

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
				POS(0, range_start),
				BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
	/* ret > 0 is check_nlinks_update_inode()'s end-of-range signal */
	if (ret < 0) {
		bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
		return ret;
	}

	return 0;
}
3076 
bch2_check_nlinks(struct bch_fs * c)3077 int bch2_check_nlinks(struct bch_fs *c)
3078 {
3079 	struct nlink_table links = { 0 };
3080 	u64 this_iter_range_start, next_iter_range_start = 0;
3081 	int ret = 0;
3082 
3083 	do {
3084 		this_iter_range_start = next_iter_range_start;
3085 		next_iter_range_start = U64_MAX;
3086 
3087 		ret = check_nlinks_find_hardlinks(c, &links,
3088 						  this_iter_range_start,
3089 						  &next_iter_range_start);
3090 
3091 		ret = check_nlinks_walk_dirents(c, &links,
3092 					  this_iter_range_start,
3093 					  next_iter_range_start);
3094 		if (ret)
3095 			break;
3096 
3097 		ret = check_nlinks_update_hardlinks(c, &links,
3098 					 this_iter_range_start,
3099 					 next_iter_range_start);
3100 		if (ret)
3101 			break;
3102 
3103 		links.nr = 0;
3104 	} while (next_iter_range_start != U64_MAX);
3105 
3106 	kvfree(links.d);
3107 	bch_err_fn(c, ret);
3108 	return ret;
3109 }
3110 
fix_reflink_p_key(struct btree_trans * trans,struct btree_iter * iter,struct bkey_s_c k)3111 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
3112 			     struct bkey_s_c k)
3113 {
3114 	struct bkey_s_c_reflink_p p;
3115 	struct bkey_i_reflink_p *u;
3116 
3117 	if (k.k->type != KEY_TYPE_reflink_p)
3118 		return 0;
3119 
3120 	p = bkey_s_c_to_reflink_p(k);
3121 
3122 	if (!p.v->front_pad && !p.v->back_pad)
3123 		return 0;
3124 
3125 	u = bch2_trans_kmalloc(trans, sizeof(*u));
3126 	int ret = PTR_ERR_OR_ZERO(u);
3127 	if (ret)
3128 		return ret;
3129 
3130 	bkey_reassemble(&u->k_i, k);
3131 	u->v.front_pad	= 0;
3132 	u->v.back_pad	= 0;
3133 
3134 	return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun);
3135 }
3136 
/*
 * One-time repair pass: strip stale front_pad/back_pad from all reflink
 * pointers in the extents btree.  No-op on filesystems already at or
 * past the reflink_p_fix metadata version.
 */
int bch2_fix_reflink_p(struct bch_fs *c)
{
	if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
		return 0;

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
				BTREE_ID_extents, POS_MIN,
				BTREE_ITER_intent|BTREE_ITER_prefetch|
				BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			fix_reflink_p_key(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}
3152 
3153 #ifndef NO_BCACHEFS_CHARDEV
3154 
/* Per-invocation state for an ioctl-driven fsck thread (offline or online). */
struct fsck_thread {
	struct thread_with_stdio thr;	/* must be first for container_of() users */
	struct bch_fs		*c;
	struct bch_opts		opts;
};
3160 
bch2_fsck_thread_exit(struct thread_with_stdio * _thr)3161 static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
3162 {
3163 	struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr);
3164 	kfree(thr);
3165 }
3166 
/*
 * Offline fsck worker: bring up the filesystem opened by
 * bch2_ioctl_fsck_offline() (thr->c may be an ERR_PTR if the open
 * failed - handled by the PTR_ERR_OR_ZERO check), then report results
 * via fsck(8)-style exit bits: 1 = errors were fixed, 4 = errors remain.
 */
static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio)
{
	struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
	struct bch_fs *c = thr->c;

	int ret = PTR_ERR_OR_ZERO(c);
	if (ret)
		return ret;

	ret = bch2_fs_start(thr->c);
	if (ret)
		goto err;

	if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
		bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name);
		ret |= 1;
	}
	if (test_bit(BCH_FS_error, &c->flags)) {
		bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name);
		ret |= 4;
	}
err:
	bch2_fs_stop(c);
	return ret;
}
3192 
/* ops for the offline-fsck kthread spawned by bch2_ioctl_fsck_offline() */
static const struct thread_with_stdio_ops bch2_offline_fsck_ops = {
	.exit		= bch2_fsck_thread_exit,
	.fn		= bch2_fsck_offline_thread_fn,
};
3197 
/*
 * BCH_IOCTL_FSCK_OFFLINE: open the given (unmounted) devices as a new
 * filesystem instance and run fsck in a kthread, with the fs's stdio
 * redirected to the ioctl caller.  Requires CAP_SYS_ADMIN.
 */
long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
{
	struct bch_ioctl_fsck_offline arg;
	struct fsck_thread *thr = NULL;
	darray_str(devs) = {};
	long ret = 0;

	if (copy_from_user(&arg, user_arg, sizeof(arg)))
		return -EFAULT;

	if (arg.flags)
		return -EINVAL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* copy in the userspace array of device path string pointers: */
	for (size_t i = 0; i < arg.nr_devs; i++) {
		u64 dev_u64;
		ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64));
		if (ret)
			goto err;

		char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX);
		ret = PTR_ERR_OR_ZERO(dev_str);
		if (ret)
			goto err;

		ret = darray_push(&devs, dev_str);
		if (ret) {
			/* not yet owned by devs, free it ourselves */
			kfree(dev_str);
			goto err;
		}
	}

	thr = kzalloc(sizeof(*thr), GFP_KERNEL);
	if (!thr) {
		ret = -ENOMEM;
		goto err;
	}

	thr->opts = bch2_opts_empty();

	if (arg.opts) {
		/* caller-supplied mount-style option string, bounded at 64k */
		char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
		ret =   PTR_ERR_OR_ZERO(optstr) ?:
			bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr);
		if (!IS_ERR(optstr))
			kfree(optstr);

		if (ret)
			goto err;
	}

	opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
	opt_set(thr->opts, read_only, 1);
	opt_set(thr->opts, ratelimit_errors, 0);

	/* We need request_key() to be called before we punt to kthread: */
	opt_set(thr->opts, nostart, true);

	bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops);

	/* on failure thr->c is an ERR_PTR; the thread fn checks it */
	thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts);

	/* don't let a panic errors= policy take down the host during fsck: */
	if (!IS_ERR(thr->c) &&
	    thr->c->opts.errors == BCH_ON_ERROR_panic)
		thr->c->opts.errors = BCH_ON_ERROR_ro;

	/* thr ownership passes to the thread; freed via ->exit hook */
	ret = __bch2_run_thread_with_stdio(&thr->thr);
out:
	darray_for_each(devs, i)
		kfree(*i);
	darray_exit(&devs);
	return ret;
err:
	if (thr)
		bch2_fsck_thread_exit(&thr->thr);
	pr_err("ret %s", bch2_err_str(ret));
	goto out;
}
3278 
/*
 * Online fsck worker: temporarily redirect the fs's stdio to the ioctl
 * caller, override fix_errors (defaulting to interactive "ask"), and run
 * the recovery passes from check_alloc_info onward.  Restores the saved
 * settings and drops the ref/semaphore taken by bch2_ioctl_fsck_online().
 */
static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio)
{
	struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
	struct bch_fs *c = thr->c;

	c->stdio_filter = current;
	c->stdio = &thr->thr.stdio;

	/*
	 * XXX: can we figure out a way to do this without mucking with c->opts?
	 */
	unsigned old_fix_errors = c->opts.fix_errors;
	if (opt_defined(thr->opts, fix_errors))
		c->opts.fix_errors = thr->opts.fix_errors;
	else
		c->opts.fix_errors = FSCK_FIX_ask;

	/* NOTE(review): c->opts.fsck is set but never restored - confirm intended */
	c->opts.fsck = true;
	set_bit(BCH_FS_fsck_running, &c->flags);

	c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
	int ret = bch2_run_online_recovery_passes(c);

	clear_bit(BCH_FS_fsck_running, &c->flags);
	bch_err_fn(c, ret);

	c->stdio = NULL;
	c->stdio_filter = NULL;
	c->opts.fix_errors = old_fix_errors;

	/* release what bch2_ioctl_fsck_online() acquired on our behalf: */
	up(&c->online_fsck_mutex);
	bch2_ro_ref_put(c);
	return ret;
}
3313 
/* ops for the online-fsck kthread spawned by bch2_ioctl_fsck_online() */
static const struct thread_with_stdio_ops bch2_online_fsck_ops = {
	.exit		= bch2_fsck_thread_exit,
	.fn		= bch2_fsck_online_thread_fn,
};
3318 
/*
 * BCH_IOCTL_FSCK_ONLINE: run fsck on a mounted filesystem in a kthread,
 * with stdio redirected to the ioctl caller.  Takes a ro ref to keep @c
 * alive and the online_fsck_mutex to serialize concurrent invocations;
 * both are released by the thread fn on success, or here on failure.
 * Requires CAP_SYS_ADMIN.
 */
long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg)
{
	struct fsck_thread *thr = NULL;
	long ret = 0;

	if (arg.flags)
		return -EINVAL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (!bch2_ro_ref_tryget(c))
		return -EROFS;

	/* only one online fsck at a time: */
	if (down_trylock(&c->online_fsck_mutex)) {
		bch2_ro_ref_put(c);
		return -EAGAIN;
	}

	thr = kzalloc(sizeof(*thr), GFP_KERNEL);
	if (!thr) {
		ret = -ENOMEM;
		goto err;
	}

	thr->c = c;
	thr->opts = bch2_opts_empty();

	if (arg.opts) {
		/* caller-supplied mount-style option string, bounded at 64k */
		char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);

		ret =   PTR_ERR_OR_ZERO(optstr) ?:
			bch2_parse_mount_opts(c, &thr->opts, NULL, optstr);
		if (!IS_ERR(optstr))
			kfree(optstr);

		if (ret)
			goto err;
	}

	ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops);
err:
	if (ret < 0) {
		bch_err_fn(c, ret);
		if (thr)
			bch2_fsck_thread_exit(&thr->thr);
		up(&c->online_fsck_mutex);
		bch2_ro_ref_put(c);
	}
	return ret;
}
3370 
3371 #endif /* NO_BCACHEFS_CHARDEV */
3372