1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bcachefs_ioctl.h"
5 #include "bkey_buf.h"
6 #include "btree_cache.h"
7 #include "btree_update.h"
8 #include "buckets.h"
9 #include "darray.h"
10 #include "dirent.h"
11 #include "error.h"
12 #include "fs.h"
13 #include "fsck.h"
14 #include "inode.h"
15 #include "keylist.h"
16 #include "namei.h"
17 #include "recovery_passes.h"
18 #include "snapshot.h"
19 #include "super.h"
20 #include "thread_with_file.h"
21 #include "xattr.h"
22
23 #include <linux/bsearch.h>
24 #include <linux/dcache.h> /* struct qstr */
25
dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d,struct bch_inode_unpacked * inode)26 static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d,
27 struct bch_inode_unpacked *inode)
28 {
29 if (d.v->d_type == DT_SUBVOL
30 ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
31 : le64_to_cpu(d.v->d_inum) == inode->bi_inum)
32 return 0;
33 return -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
34 }
35
/*
 * Format a report of a dirent/inode backpointer mismatch into @out: a
 * header line, the dirent key, then the unpacked inode, each on its own
 * line so the inconsistency can be inspected in the log.
 */
static void dirent_inode_mismatch_msg(struct printbuf *out,
				      struct bch_fs *c,
				      struct bkey_s_c_dirent dirent,
				      struct bch_inode_unpacked *inode)
{
	prt_str(out, "inode points to dirent that does not point back:");
	prt_newline(out);
	bch2_bkey_val_to_text(out, c, dirent.s_c);
	prt_newline(out);
	bch2_inode_unpacked_to_text(out, inode);
}
47
dirent_points_to_inode(struct bch_fs * c,struct bkey_s_c_dirent dirent,struct bch_inode_unpacked * inode)48 static int dirent_points_to_inode(struct bch_fs *c,
49 struct bkey_s_c_dirent dirent,
50 struct bch_inode_unpacked *inode)
51 {
52 int ret = dirent_points_to_inode_nowarn(dirent, inode);
53 if (ret) {
54 struct printbuf buf = PRINTBUF;
55 dirent_inode_mismatch_msg(&buf, c, dirent, inode);
56 bch_warn(c, "%s", buf.buf);
57 printbuf_exit(&buf);
58 }
59 return ret;
60 }
61
62 /*
63 * XXX: this is handling transaction restarts without returning
64 * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
65 */
bch2_count_inode_sectors(struct btree_trans * trans,u64 inum,u32 snapshot)66 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
67 u32 snapshot)
68 {
69 u64 sectors = 0;
70
71 int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
72 SPOS(inum, 0, snapshot),
73 POS(inum, U64_MAX),
74 0, k, ({
75 if (bkey_extent_is_allocation(k.k))
76 sectors += k.k->size;
77 0;
78 }));
79
80 return ret ?: sectors;
81 }
82
/*
 * Count the subdirectory dirents of directory @inum in @snapshot, for
 * validating i_nlink.
 *
 * Returns the subdirectory count, or a negative error from the iteration.
 */
static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
			      u32 snapshot)
{
	u64 subdirs = 0;

	int ret = for_each_btree_key_max(trans, iter, BTREE_ID_dirents,
				SPOS(inum, 0, snapshot),
				POS(inum, U64_MAX),
				0, k, ({
		if (k.k->type == KEY_TYPE_dirent &&
		    bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
			subdirs++;
		0;	/* loop body "returns" 0: keep iterating */
	}));

	return ret ?: subdirs;
}
100
/*
 * Look up subvolume @subvol, returning its snapshot id and root inode
 * number.
 *
 * Returns 0 on success; on error from bch2_subvolume_get() the out
 * parameters are left untouched.  (Previously we copied fields out of a
 * possibly-uninitialized struct bch_subvolume on the error path.)
 */
static int subvol_lookup(struct btree_trans *trans, u32 subvol,
			 u32 *snapshot, u64 *inum)
{
	struct bch_subvolume s;
	int ret = bch2_subvolume_get(trans, subvol, false, &s);
	if (ret)
		return ret;

	*snapshot = le32_to_cpu(s.snapshot);
	*inum = le64_to_cpu(s.inode);
	return 0;
}
111
/*
 * Fetch and unpack the inode @inode_nr in @snapshot.
 *
 * Returns 0 and fills *inode on success, -BCH_ERR_ENOENT_inode if the key
 * isn't an inode, or an error from the btree lookup.
 */
static int lookup_inode(struct btree_trans *trans, u64 inode_nr, u32 snapshot,
			struct bch_inode_unpacked *inode)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
					       SPOS(0, inode_nr, snapshot), 0);
	int ret = bkey_err(k);
	if (!ret)
		ret = bkey_is_inode(k.k)
			? bch2_inode_unpack(k, inode)
			: -BCH_ERR_ENOENT_inode;

	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
132
/*
 * Hash-lookup dirent @name in directory @dir at @snapshot, returning the
 * target inode number and d_type.
 *
 * NOTE(review): on the error path we return without calling
 * bch2_trans_iter_exit() - this assumes bch2_hash_lookup_in_snapshot()
 * leaves no live iterator on failure; confirm against the hash lookup
 * implementation.
 */
static int lookup_dirent_in_snapshot(struct btree_trans *trans,
			   struct bch_hash_info hash_info,
			   subvol_inum dir, struct qstr *name,
			   u64 *target, unsigned *type, u32 snapshot)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
							 &hash_info, dir, name, 0, snapshot);
	int ret = bkey_err(k);
	if (ret)
		return ret;

	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
	*target = le64_to_cpu(d.v->d_inum);
	*type = d.v->d_type;
	bch2_trans_iter_exit(trans, &iter);
	return 0;
}
151
152 /*
153 * Find any subvolume associated with a tree of snapshots
154 * We can't rely on master_subvol - it might have been deleted.
155 */
find_snapshot_tree_subvol(struct btree_trans * trans,u32 tree_id,u32 * subvol)156 static int find_snapshot_tree_subvol(struct btree_trans *trans,
157 u32 tree_id, u32 *subvol)
158 {
159 struct btree_iter iter;
160 struct bkey_s_c k;
161 int ret;
162
163 for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) {
164 if (k.k->type != KEY_TYPE_snapshot)
165 continue;
166
167 struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
168 if (le32_to_cpu(s.v->tree) != tree_id)
169 continue;
170
171 if (s.v->subvol) {
172 *subvol = le32_to_cpu(s.v->subvol);
173 goto found;
174 }
175 }
176 ret = -BCH_ERR_ENOENT_no_snapshot_tree_subvol;
177 found:
178 bch2_trans_iter_exit(trans, &iter);
179 return ret;
180 }
181
/* Get lost+found, create if it doesn't exist: */
/*
 * Resolve (or create) the lost+found directory for the snapshot tree
 * containing @snapshot.  On success *lostfound holds the unpacked
 * lost+found inode.  @reattaching_inum is used to repair a subvolume whose
 * root inode field is unset.
 */
static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
			    struct bch_inode_unpacked *lostfound,
			    u64 reattaching_inum)
{
	struct bch_fs *c = trans->c;
	struct qstr lostfound_str = QSTR("lost+found");
	struct btree_iter lostfound_iter = { NULL };
	u64 inum = 0;
	unsigned d_type = 0;
	int ret;

	/* Find the snapshot tree @snapshot belongs to: */
	struct bch_snapshot_tree st;
	ret = bch2_snapshot_tree_lookup(trans,
					bch2_snapshot_tree(c, snapshot), &st);
	if (ret)
		return ret;

	/* Any subvolume in the tree will do - master_subvol may be gone: */
	u32 subvolid;
	ret = find_snapshot_tree_subvol(trans,
				bch2_snapshot_tree(c, snapshot), &subvolid);
	bch_err_msg(c, ret, "finding subvol associated with snapshot tree %u",
		    bch2_snapshot_tree(c, snapshot));
	if (ret)
		return ret;

	struct bch_subvolume subvol;
	ret = bch2_subvolume_get(trans, subvolid, false, &subvol);
	bch_err_msg(c, ret, "looking up subvol %u for snapshot %u", subvolid, snapshot);
	if (ret)
		return ret;

	/*
	 * Subvolume with no root inode recorded: repair it to point at the
	 * inode we're reattaching.
	 */
	if (!subvol.inode) {
		struct btree_iter iter;
		struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter,
				BTREE_ID_subvolumes, POS(0, subvolid),
				0, subvolume);
		ret = PTR_ERR_OR_ZERO(subvol);
		if (ret)
			return ret;

		subvol->v.inode = cpu_to_le64(reattaching_inum);
		bch2_trans_iter_exit(trans, &iter);
	}

	subvol_inum root_inum = {
		.subvol = subvolid,
		.inum = le64_to_cpu(subvol.inode)
	};

	struct bch_inode_unpacked root_inode;
	struct bch_hash_info root_hash_info;
	ret = lookup_inode(trans, root_inum.inum, snapshot, &root_inode);
	bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
		    root_inum.inum, subvolid);
	if (ret)
		return ret;

	root_hash_info = bch2_hash_info_init(c, &root_inode);

	/* Does lost+found already exist in the subvolume root? */
	ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
					&lostfound_str, &inum, &d_type, snapshot);
	if (bch2_err_matches(ret, ENOENT))
		goto create_lostfound;

	bch_err_fn(c, ret);
	if (ret)
		return ret;

	if (d_type != DT_DIR) {
		bch_err(c, "error looking up lost+found: not a directory");
		return -BCH_ERR_ENOENT_not_directory;
	}

	/*
	 * The bch2_check_dirents pass has already run, dangling dirents
	 * shouldn't exist here:
	 */
	ret = lookup_inode(trans, inum, snapshot, lostfound);
	bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
		    inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
	return ret;

create_lostfound:
	/*
	 * we always create lost+found in the root snapshot; we don't want
	 * different branches of the snapshot tree to have different lost+found
	 */
	snapshot = le32_to_cpu(st.root_snapshot);
	/*
	 * XXX: we could have a nicer log message here if we had a nice way to
	 * walk backpointers to print a path
	 */
	struct printbuf path = PRINTBUF;
	ret = bch2_inum_to_path(trans, root_inum, &path);
	if (ret)
		goto err;

	bch_notice(c, "creating %s/lost+found in subvol %llu snapshot %u",
		   path.buf, root_inum.subvol, snapshot);
	printbuf_exit(&path);

	u64 now = bch2_current_time(c);
	u64 cpu = raw_smp_processor_id();

	bch2_inode_init_early(c, lostfound);
	bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
	lostfound->bi_dir = root_inode.bi_inum;
	lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot);

	/* lost+found is a new subdirectory of the root: */
	root_inode.bi_nlink++;

	ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
	if (ret)
		goto err;

	bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot);
	ret = bch2_btree_iter_traverse(&lostfound_iter);
	if (ret)
		goto err;

	/* Create the dirent, then write the new lost+found inode: */
	ret = bch2_dirent_create_snapshot(trans,
			0, root_inode.bi_inum, snapshot, &root_hash_info,
			mode_to_type(lostfound->bi_mode),
			&lostfound_str,
			lostfound->bi_inum,
			&lostfound->bi_dir_offset,
			STR_HASH_must_create) ?:
		bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
				       BTREE_UPDATE_internal_snapshot_node);
err:
	bch_err_msg(c, ret, "creating lost+found");
	bch2_trans_iter_exit(trans, &lostfound_iter);
	return ret;
}
317
inode_should_reattach(struct bch_inode_unpacked * inode)318 static inline bool inode_should_reattach(struct bch_inode_unpacked *inode)
319 {
320 if (inode->bi_inum == BCACHEFS_ROOT_INO &&
321 inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)
322 return false;
323
324 return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked);
325 }
326
/*
 * Whiteout the dirent at @d_pos in @snapshot, but only if a key actually
 * exists at exactly that position (it may only exist in an ancestor
 * snapshot, in which case there's nothing to delete here).
 */
static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents,
			SPOS(d_pos.inode, d_pos.offset, snapshot),
			BTREE_ITER_intent|
			BTREE_ITER_with_updates);
	int ret = bkey_err(k);
	if (ret)
		return ret;

	if (bpos_eq(k.k->p, d_pos)) {
		/*
		 * delete_at() doesn't work because the update path doesn't
		 * internally use BTREE_ITER_with_updates yet
		 */
		struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
		ret = PTR_ERR_OR_ZERO(k);
		if (ret)
			goto err;

		/* Emit an explicit whiteout at the iterator position: */
		bkey_init(&k->k);
		k->k.type = KEY_TYPE_whiteout;
		k->k.p = iter.pos;
		ret = bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node);
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
357
/*
 * Reattach an orphaned inode to lost+found: create a dirent named after the
 * inode number (or "subvol-N" for subvolume roots), update the inode's
 * backpointer fields, and fix up copies of the inode in child snapshots.
 */
static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
{
	struct bch_fs *c = trans->c;
	struct bch_inode_unpacked lostfound;
	char name_buf[20];
	int ret;

	u32 dirent_snapshot = inode->bi_snapshot;
	if (inode->bi_subvol) {
		/* Subvolume roots get reattached under the root subvolume: */
		inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;

		u64 root_inum;
		ret = subvol_lookup(trans, inode->bi_parent_subvol,
				    &dirent_snapshot, &root_inum);
		if (ret)
			return ret;

		snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
	} else {
		snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
	}

	ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
	if (ret)
		return ret;

	/* If reattaching a directory, lost+found gains a subdirectory link: */
	lostfound.bi_nlink += S_ISDIR(inode->bi_mode);

	/* ensure lost+found inode is also present in inode snapshot */
	if (!inode->bi_subvol) {
		BUG_ON(!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, lostfound.bi_snapshot));
		lostfound.bi_snapshot = inode->bi_snapshot;
	}

	ret = __bch2_fsck_write_inode(trans, &lostfound);
	if (ret)
		return ret;

	struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound);
	struct qstr name = QSTR(name_buf);

	inode->bi_dir = lostfound.bi_inum;

	ret = bch2_dirent_create_snapshot(trans,
				inode->bi_parent_subvol, lostfound.bi_inum,
				dirent_snapshot,
				&dir_hash,
				inode_d_type(inode),
				&name,
				inode->bi_subvol ?: inode->bi_inum,
				&inode->bi_dir_offset,
				STR_HASH_must_create);
	if (ret) {
		bch_err_msg(c, ret, "error creating dirent");
		return ret;
	}

	ret = __bch2_fsck_write_inode(trans, inode);
	if (ret)
		return ret;

	/*
	 * Fix up inodes in child snapshots: if they should also be reattached
	 * update the backpointer field, if they should not be we need to emit
	 * whiteouts for the dirent we just created.
	 */
	if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) {
		snapshot_id_list whiteouts_done;
		struct btree_iter iter;
		struct bkey_s_c k;

		darray_init(&whiteouts_done);

		/* Walk descendants: snapshot ids of children are numerically smaller: */
		for_each_btree_key_reverse_norestart(trans, iter,
				BTREE_ID_inodes, SPOS(0, inode->bi_inum, inode->bi_snapshot - 1),
				BTREE_ITER_all_snapshots|BTREE_ITER_intent, k, ret) {
			if (k.k->p.offset != inode->bi_inum)
				break;

			if (!bkey_is_inode(k.k) ||
			    !bch2_snapshot_is_ancestor(c, k.k->p.snapshot, inode->bi_snapshot) ||
			    snapshot_list_has_ancestor(c, &whiteouts_done, k.k->p.snapshot))
				continue;

			struct bch_inode_unpacked child_inode;
			ret = bch2_inode_unpack(k, &child_inode);
			if (ret)
				break;

			if (!inode_should_reattach(&child_inode)) {
				/* Hide the new dirent from this child snapshot: */
				ret = maybe_delete_dirent(trans,
							  SPOS(lostfound.bi_inum, inode->bi_dir_offset,
							       dirent_snapshot),
							  k.k->p.snapshot);
				if (ret)
					break;

				ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot);
				if (ret)
					break;
			} else {
				/* Point the child's backpointer at the new dirent: */
				iter.snapshot = k.k->p.snapshot;
				child_inode.bi_dir = inode->bi_dir;
				child_inode.bi_dir_offset = inode->bi_dir_offset;

				ret = bch2_inode_write_flags(trans, &iter, &child_inode,
							     BTREE_UPDATE_internal_snapshot_node);
				if (ret)
					break;
			}
		}
		darray_exit(&whiteouts_done);
		bch2_trans_iter_exit(trans, &iter);
	}

	return ret;
}
475
/*
 * Fetch the dirent at @pos; caller must exit @iter (and may keep using it
 * while the returned key is live).
 */
static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
						struct btree_iter *iter,
						struct bpos pos)
{
	return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
}
482
/*
 * Remove the dirent that @inode's backpointer (bi_dir/bi_dir_offset) points
 * at, after verifying it actually points back at the inode.  No-op if the
 * inode has no backpointer.
 */
static int remove_backpointer(struct btree_trans *trans,
			      struct bch_inode_unpacked *inode)
{
	if (!inode->bi_dir)
		return 0;

	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c_dirent d = dirent_get_by_pos(trans, &iter,
				SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot));
	int ret = bkey_err(d) ?:
		dirent_points_to_inode(c, d, inode) ?:
		bch2_fsck_remove_dirent(trans, d.k->p);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
499
/*
 * Reattach subvolume @s to lost+found: look up its root inode, remove any
 * stale backpointer dirent, then reattach the root inode.
 */
static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s)
{
	struct bch_fs *c = trans->c;

	struct bch_inode_unpacked inode;
	int ret = bch2_inode_find_by_inum_trans(trans,
				(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
				&inode);
	if (ret)
		return ret;

	/* ENOENT just means there was no dirent to remove - not an error: */
	ret = remove_backpointer(trans, &inode);
	if (!bch2_err_matches(ret, ENOENT))
		bch_err_msg(c, ret, "removing dirent");
	if (ret)
		return ret;

	ret = reattach_inode(trans, &inode);
	bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
	return ret;
}
521
/*
 * Recreate a missing subvolume for leaf snapshot @snapshotid with root inode
 * @inum (0 means "create a new root inode too").  Also repairs the snapshot
 * key's subvol field and the snapshot tree's master_subvol if unset.
 */
static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
{
	struct bch_fs *c = trans->c;

	/*
	 * NOTE(review): bch2_snapshot_is_leaf() may return a negative error
	 * elsewhere in this file (see the `<= 0` check in reattach_inode());
	 * here an error would be treated the same as "not a leaf" - confirm
	 * that's intended.
	 */
	if (!bch2_snapshot_is_leaf(c, snapshotid)) {
		bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
		return -BCH_ERR_fsck_repair_unimplemented;
	}

	/*
	 * If inum isn't set, that means we're being called from check_dirents,
	 * not check_inodes - the root of this subvolume doesn't exist or we
	 * would have found it there:
	 */
	if (!inum) {
		struct btree_iter inode_iter = {};
		struct bch_inode_unpacked new_inode;
		u64 cpu = raw_smp_processor_id();

		bch2_inode_init_early(c, &new_inode);
		bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);

		new_inode.bi_subvol = subvolid;

		int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
			  bch2_btree_iter_traverse(&inode_iter) ?:
			  bch2_inode_write(trans, &inode_iter, &new_inode);
		bch2_trans_iter_exit(trans, &inode_iter);
		if (ret)
			return ret;

		inum = new_inode.bi_inum;
	}

	bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);

	/* Insert the new subvolume key: */
	struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
	int ret = PTR_ERR_OR_ZERO(new_subvol);
	if (ret)
		return ret;

	bkey_subvolume_init(&new_subvol->k_i);
	new_subvol->k.p.offset	= subvolid;
	new_subvol->v.snapshot	= cpu_to_le32(snapshotid);
	new_subvol->v.inode	= cpu_to_le64(inum);
	ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
	if (ret)
		return ret;

	/* Point the snapshot back at the new subvolume: */
	struct btree_iter iter;
	struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_snapshots, POS(0, snapshotid),
			0, snapshot);
	ret = PTR_ERR_OR_ZERO(s);
	bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
	if (ret)
		return ret;

	u32 snapshot_tree = le32_to_cpu(s->v.tree);

	s->v.subvol = cpu_to_le32(subvolid);
	SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
	bch2_trans_iter_exit(trans, &iter);

	/* If the snapshot tree lost its master subvol, use the new one: */
	struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
			0, snapshot_tree);
	ret = PTR_ERR_OR_ZERO(st);
	bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
	if (ret)
		return ret;

	if (!st->v.master_subvol)
		st->v.master_subvol = cpu_to_le32(subvolid);

	bch2_trans_iter_exit(trans, &iter);
	return 0;
}
600
/*
 * Recreate a missing inode @inum in @snapshot, inferring its type from the
 * btree we found keys in: extents -> regular file (with i_size from the
 * last extent), dirents -> directory, xattrs -> regular file.
 */
static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum)
{
	struct bch_fs *c = trans->c;
	unsigned i_mode = S_IFREG;
	u64 i_size = 0;

	switch (btree) {
	case BTREE_ID_extents: {
		struct btree_iter iter = {};

		bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
		struct bkey_s_c k = bch2_btree_iter_peek_prev_min(&iter, POS(inum, 0));
		bch2_trans_iter_exit(trans, &iter);
		int ret = bkey_err(k);
		if (ret)
			return ret;

		/*
		 * peek_prev_min() returns a NULL key (with bkey_err() == 0)
		 * when no extent is visible in this snapshot; don't
		 * dereference it - just leave i_size at 0:
		 */
		if (k.k)
			i_size = k.k->p.offset << 9;
		break;
	}
	case BTREE_ID_dirents:
		i_mode = S_IFDIR;
		break;
	case BTREE_ID_xattrs:
		break;
	default:
		BUG();
	}

	struct bch_inode_unpacked new_inode;
	bch2_inode_init_early(c, &new_inode);
	bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL);
	new_inode.bi_size	= i_size;
	new_inode.bi_inum	= inum;
	new_inode.bi_snapshot	= snapshot;

	return __bch2_fsck_write_inode(trans, &new_inode);
}
639
/*
 * Set of snapshot IDs seen while scanning keys at a given position; used
 * for key visibility checks (see key_visible_in_snapshot()).
 */
struct snapshots_seen {
	struct bpos		pos;	/* position the id list was collected at */
	snapshot_id_list	ids;	/* snapshot ids, kept sorted */
};
644
/* Free the snapshot id list. */
static inline void snapshots_seen_exit(struct snapshots_seen *s)
{
	darray_exit(&s->ids);
}
649
/* Zero-initialize: empty id list, pos at all-zeroes. */
static inline void snapshots_seen_init(struct snapshots_seen *s)
{
	memset(s, 0, sizeof(*s));
}
654
/*
 * Insert @id into the sorted snapshot id list, keeping it sorted and
 * duplicate-free.  Returns 0 on success (including when @id was already
 * present) or a reallocation error.
 */
static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
{
	size_t pos = 0;

	/* Find insertion point; bail early if @id is already present: */
	while (pos < s->ids.nr) {
		u32 cur = s->ids.data[pos];

		if (cur == id)
			return 0;
		if (cur > id)
			break;
		pos++;
	}

	int ret = darray_insert_item(&s->ids, pos, id);
	if (ret)
		bch_err(c, "error reallocating snapshots_seen table (size %zu)",
			s->ids.size);
	return ret;
}
671
/*
 * Record that we've seen a key at @pos: reset the seen-list when moving to
 * a new position, then add the key's snapshot id (no duplicates).
 */
static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
				 enum btree_id btree_id, struct bpos pos)
{
	if (!bkey_eq(s->pos, pos))
		s->ids.nr = 0;
	s->pos = pos;

	return snapshot_list_add_nodup(c, &s->ids, pos.snapshot);
}
681
682 /**
683 * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
684 * and @ancestor hasn't been overwritten in @seen
685 *
686 * @c: filesystem handle
687 * @seen: list of snapshot ids already seen at current position
688 * @id: descendent snapshot id
689 * @ancestor: ancestor snapshot id
690 *
691 * Returns: whether key in @ancestor snapshot is visible in @id snapshot
692 */
key_visible_in_snapshot(struct bch_fs * c,struct snapshots_seen * seen,u32 id,u32 ancestor)693 static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
694 u32 id, u32 ancestor)
695 {
696 ssize_t i;
697
698 EBUG_ON(id > ancestor);
699
700 /* @ancestor should be the snapshot most recently added to @seen */
701 EBUG_ON(ancestor != seen->pos.snapshot);
702 EBUG_ON(ancestor != darray_last(seen->ids));
703
704 if (id == ancestor)
705 return true;
706
707 if (!bch2_snapshot_is_ancestor(c, id, ancestor))
708 return false;
709
710 /*
711 * We know that @id is a descendant of @ancestor, we're checking if
712 * we've seen a key that overwrote @ancestor - i.e. also a descendent of
713 * @ascestor and with @id as a descendent.
714 *
715 * But we already know that we're scanning IDs between @id and @ancestor
716 * numerically, since snapshot ID lists are kept sorted, so if we find
717 * an id that's an ancestor of @id we're done:
718 */
719
720 for (i = seen->ids.nr - 2;
721 i >= 0 && seen->ids.data[i] >= id;
722 --i)
723 if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]))
724 return false;
725
726 return true;
727 }
728
729 /**
730 * ref_visible - given a key with snapshot id @src that points to a key with
731 * snapshot id @dst, test whether there is some snapshot in which @dst is
732 * visible.
733 *
734 * @c: filesystem handle
735 * @s: list of snapshot IDs already seen at @src
736 * @src: snapshot ID of src key
737 * @dst: snapshot ID of dst key
738 * Returns: true if there is some snapshot in which @dst is visible
739 *
740 * Assumes we're visiting @src keys in natural key order
741 */
ref_visible(struct bch_fs * c,struct snapshots_seen * s,u32 src,u32 dst)742 static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
743 u32 src, u32 dst)
744 {
745 return dst <= src
746 ? key_visible_in_snapshot(c, s, dst, src)
747 : bch2_snapshot_is_ancestor(c, src, dst);
748 }
749
/*
 * Two-sided visibility check: like ref_visible(), but with a seen-list
 * available for both endpoints, so either direction can be checked with
 * key_visible_in_snapshot() against the numerically larger id.
 */
static int ref_visible2(struct bch_fs *c,
			u32 src, struct snapshots_seen *src_seen,
			u32 dst, struct snapshots_seen *dst_seen)
{
	return dst <= src
		? key_visible_in_snapshot(c, src_seen, dst, src)
		: key_visible_in_snapshot(c, dst_seen, src, dst);
}
760
/*
 * Iterate @_i over the inode_walker entries in @_w whose snapshot is
 * visible from @_snapshot; entries with snapshot > @_snapshot are skipped
 * (the walker list is maintained in snapshot-id order - see
 * lookup_inode_for_snapshot()).
 */
#define for_each_visible_inode(_c, _s, _w, _snapshot, _i)				\
	for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr &&	\
	     (_i)->snapshot <= (_snapshot); _i++)					\
		if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
765
/* One inode version (per snapshot) tracked by an inode_walker: */
struct inode_walker_entry {
	struct bch_inode_unpacked inode;
	u32			snapshot;
	u64			count;		/* accumulator (e.g. sectors/subdirs) */
	u64			i_size;
};

/*
 * Walks all snapshot versions of each inode as keys are scanned in order;
 * caches the versions of the current inode number.
 */
struct inode_walker {
	bool				first_this_inode;
	bool				have_inodes;
	bool				recalculate_sums;
	struct bpos			last_pos;

	DARRAY(struct inode_walker_entry) inodes;
	snapshot_id_list		deletes;
};
782
inode_walker_exit(struct inode_walker * w)783 static void inode_walker_exit(struct inode_walker *w)
784 {
785 darray_exit(&w->inodes);
786 darray_exit(&w->deletes);
787 }
788
/* Return a zero-initialized walker (empty lists, flags cleared). */
static struct inode_walker inode_walker_init(void)
{
	return (struct inode_walker) { 0, };
}
793
add_inode(struct bch_fs * c,struct inode_walker * w,struct bkey_s_c inode)794 static int add_inode(struct bch_fs *c, struct inode_walker *w,
795 struct bkey_s_c inode)
796 {
797 struct bch_inode_unpacked u;
798
799 return bch2_inode_unpack(inode, &u) ?:
800 darray_push(&w->inodes, ((struct inode_walker_entry) {
801 .inode = u,
802 .snapshot = inode.k->p.snapshot,
803 }));
804 }
805
/*
 * Refill the walker with every snapshot version of inode @inum, in
 * ascending snapshot-id order (forward scan with all_snapshots).
 */
static int get_inodes_all_snapshots(struct btree_trans *trans,
				    struct inode_walker *w, u64 inum)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	/*
	 * We no longer have inodes for w->last_pos; clear this to avoid
	 * screwing up check_i_sectors/check_subdir_count if we take a
	 * transaction restart here:
	 */
	w->have_inodes = false;
	w->recalculate_sums = false;
	w->inodes.nr = 0;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
				     BTREE_ITER_all_snapshots, k, ret) {
		if (k.k->p.offset != inum)
			break;

		if (bkey_is_inode(k.k)) {
			/*
			 * add_inode() can fail (unpack error, or -ENOMEM from
			 * darray_push()); previously the error was silently
			 * dropped, leaving w->inodes incomplete:
			 */
			ret = add_inode(c, w, k);
			if (ret)
				break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		return ret;

	w->first_this_inode = true;
	w->have_inodes = true;
	return 0;
}
840
/*
 * Find the walker entry whose snapshot is @k's snapshot or its nearest
 * ancestor.  If the key is in a strictly newer snapshot than the inode (and
 * isn't a whiteout), that's unexpected - log it and insert a synthetic
 * entry for the key's snapshot, keeping the list sorted by snapshot id.
 *
 * Returns NULL if no version of the inode is visible from @k's snapshot,
 * or an ERR_PTR on allocation failure.
 */
static struct inode_walker_entry *
lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k)
{
	bool is_whiteout = k.k->type == KEY_TYPE_whiteout;

	struct inode_walker_entry *i;
	__darray_for_each(w->inodes, i)
		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot))
			goto found;

	return NULL;
found:
	BUG_ON(k.k->p.snapshot > i->snapshot);

	if (k.k->p.snapshot != i->snapshot && !is_whiteout) {
		/* Clone the ancestor's inode for the key's own snapshot: */
		struct inode_walker_entry new = *i;

		new.snapshot	= k.k->p.snapshot;
		new.count	= 0;
		new.i_size	= 0;

		struct printbuf buf = PRINTBUF;
		bch2_bkey_val_to_text(&buf, c, k);

		bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
			 "unexpected because we should always update the inode when we update a key in that inode\n"
			 "%s",
			 w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf);
		printbuf_exit(&buf);

		/* Walk back to the sorted insertion point for the new snapshot id: */
		while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot)
			--i;

		size_t pos = i - w->inodes.data;
		int ret = darray_insert_item(&w->inodes, pos, new);
		if (ret)
			return ERR_PTR(ret);

		/* Re-derive the pointer: the insert may have reallocated the array: */
		i = w->inodes.data + pos;
	}

	return i;
}
884
walk_inode(struct btree_trans * trans,struct inode_walker * w,struct bkey_s_c k)885 static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
886 struct inode_walker *w,
887 struct bkey_s_c k)
888 {
889 if (w->last_pos.inode != k.k->p.inode) {
890 int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
891 if (ret)
892 return ERR_PTR(ret);
893 }
894
895 w->last_pos = k.k->p;
896
897 return lookup_inode_for_snapshot(trans->c, w, k);
898 }
899
/*
 * Collect all versions of inode @inum that are visible from the snapshot
 * in @s (reverse scan over all snapshots).  Visible inode keys go to
 * w->inodes; visible non-inode (deleted) positions are recorded in
 * w->deletes so their descendants are skipped.
 */
static int get_visible_inodes(struct btree_trans *trans,
			      struct inode_walker *w,
			      struct snapshots_seen *s,
			      u64 inum)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	w->inodes.nr = 0;
	w->deletes.nr = 0;

	for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot),
			   BTREE_ITER_all_snapshots, k, ret) {
		if (k.k->p.offset != inum)
			break;

		if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot))
			continue;

		/* Shadowed by a deletion in an intervening snapshot: */
		if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot))
			continue;

		ret = bkey_is_inode(k.k)
			? add_inode(c, w, k)
			: snapshot_list_add(c, &w->deletes, k.k->p.snapshot);
		if (ret)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}
934
935 /*
936 * Prefer to delete the first one, since that will be the one at the wrong
937 * offset:
938 * return value: 0 -> delete k1, 1 -> delete k2
939 */
/*
 * After (re)creating a dirent @new, update the backpointer offset of every
 * visible version of the target inode to point at the dirent's position.
 * No-op for non-dirent keys; subvolume dirents are not expected here.
 */
int bch2_fsck_update_backpointers(struct btree_trans *trans,
				  struct snapshots_seen *s,
				  const struct bch_hash_desc desc,
				  struct bch_hash_info *hash_info,
				  struct bkey_i *new)
{
	if (new->k.type != KEY_TYPE_dirent)
		return 0;

	struct bkey_i_dirent *d = bkey_i_to_dirent(new);
	struct inode_walker target = inode_walker_init();
	int ret = 0;

	if (d->v.d_type == DT_SUBVOL) {
		/* Callers never pass subvolume dirents here: */
		BUG();
	} else {
		ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum));
		if (ret)
			goto err;

		darray_for_each(target.inodes, i) {
			i->inode.bi_dir_offset = d->k.p.offset;
			ret = __bch2_fsck_write_inode(trans, &i->inode);
			if (ret)
				goto err;
		}
	}
err:
	inode_walker_exit(&target);
	return ret;
}
971
/*
 * Fetch the dirent that @inode's backpointer refers to.  For subvolume
 * roots the backpointer lives in the parent subvolume, so *snapshot is
 * first translated to the parent subvolume's snapshot.
 *
 * On error, returns a bkey_s_c_dirent whose .k is an ERR_PTR.
 */
static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
					       struct btree_iter *iter,
					       struct bch_inode_unpacked *inode,
					       u32 *snapshot)
{
	if (inode->bi_subvol) {
		u64 inum;
		int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
		if (ret)
			return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) });
	}

	return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
}
986
check_inode_deleted_list(struct btree_trans * trans,struct bpos p)987 static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
988 {
989 struct btree_iter iter;
990 struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
991 int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
992 bch2_trans_iter_exit(trans, &iter);
993 return ret;
994 }
995
/*
 * Verify @inode's dirent backpointer: the dirent it names must exist and
 * must point back at this inode.  On mismatch (with user/ratelimit consent
 * via fsck_err), clear the backpointer fields and set *write_inode so the
 * caller persists the change.
 */
static int check_inode_dirent_inode(struct btree_trans *trans,
				    struct bch_inode_unpacked *inode,
				    bool *write_inode)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;

	u32 inode_snapshot = inode->bi_snapshot;
	struct btree_iter dirent_iter = {};
	struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
	int ret = bkey_err(d);
	/* ENOENT (missing dirent) is handled below; other errors bail out: */
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (fsck_err_on(ret,
			trans, inode_points_to_missing_dirent,
			"inode points to missing dirent\n%s",
			(bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) ||
	    fsck_err_on(!ret && dirent_points_to_inode_nowarn(d, inode),
			trans, inode_points_to_wrong_dirent,
			"%s",
			(printbuf_reset(&buf),
			 dirent_inode_mismatch_msg(&buf, c, d, inode),
			 buf.buf))) {
		/*
		 * We just clear the backpointer fields for now. If we find a
		 * dirent that points to this inode in check_dirents(), we'll
		 * update it then; then when we get to check_path() if the
		 * backpointer is still 0 we'll reattach it.
		 */
		inode->bi_dir = 0;
		inode->bi_dir_offset = 0;
		*write_inode = true;
	}

	ret = 0;
fsck_err:
	bch2_trans_iter_exit(trans, &dirent_iter);
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}
1038
/*
 * Find the snapshot-root version of inode @inum: scan the inodes btree
 * backwards from snapshot U32_MAX, i.e. the version with the largest
 * snapshot ID.
 *
 * Callers only invoke this for inode numbers known to exist, hence the
 * BUG() if no inode key at @inum is found.
 */
static int get_snapshot_root_inode(struct btree_trans *trans,
				   struct bch_inode_unpacked *root,
				   u64 inum)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes,
					     SPOS(0, inum, U32_MAX),
					     BTREE_ITER_all_snapshots, k, ret) {
		/* walked past all versions of @inum */
		if (k.k->p.offset != inum)
			break;
		if (bkey_is_inode(k.k))
			goto found_root;
	}
	if (ret)
		goto err;
	BUG();
found_root:
	ret = bch2_inode_unpack(k, root);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
1064
/*
 * Main per-inode fsck pass, run on every inode key in every snapshot.
 *
 * Checks, in order:
 *  - the key's snapshot exists
 *  - hash seed/type match the snapshot-root version of this inode
 *  - the dirent backpointer is consistent (check_inode_dirent_inode())
 *  - unlinked flag vs. backpointer consistency
 *  - unlinked directories are empty
 *  - the has_child_snapshot flag is correct
 *  - unlinked inodes are deleted (online fsck) or on the deleted list
 *    (offline fsck)
 *  - bi_subvol/bi_parent_subvol point at a valid subvolume
 *  - bi_journal_seq is not in the future
 *
 * @snapshot_root caches the snapshot-root version between calls so
 * consecutive versions of the same inode number don't re-look it up.
 * Repairs are accumulated in the unpacked inode @u and written out once at
 * the end if anything changed.
 */
static int check_inode(struct btree_trans *trans,
		       struct btree_iter *iter,
		       struct bkey_s_c k,
		       struct bch_inode_unpacked *snapshot_root,
		       struct snapshots_seen *s)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	struct bch_inode_unpacked u;
	bool do_update = false;
	int ret;

	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret < 0)
		goto err;
	/* ret > 0: nothing further to check for this key */
	if (ret)
		return 0;

	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
	if (ret)
		goto err;

	if (!bkey_is_inode(k.k))
		return 0;

	ret = bch2_inode_unpack(k, &u);
	if (ret)
		goto err;

	/* moved to a new inode number: refresh the cached snapshot root */
	if (snapshot_root->bi_inum != u.bi_inum) {
		ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum);
		if (ret)
			goto err;
	}

	/* hash info must be identical across all snapshot versions */
	if (fsck_err_on(u.bi_hash_seed != snapshot_root->bi_hash_seed ||
			INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root),
			trans, inode_snapshot_mismatch,
			"inode hash info in different snapshots don't match")) {
		u.bi_hash_seed = snapshot_root->bi_hash_seed;
		SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root));
		do_update = true;
	}

	if (u.bi_dir || u.bi_dir_offset) {
		ret = check_inode_dirent_inode(trans, &u, &do_update);
		if (ret)
			goto err;
	}

	/* an inode with a dirent pointing at it can't be unlinked */
	if (fsck_err_on(u.bi_dir && (u.bi_flags & BCH_INODE_unlinked),
			trans, inode_unlinked_but_has_dirent,
			"inode unlinked but has dirent\n%s",
			(printbuf_reset(&buf),
			 bch2_inode_unpacked_to_text(&buf, &u),
			 buf.buf))) {
		u.bi_flags &= ~BCH_INODE_unlinked;
		do_update = true;
	}

	if (S_ISDIR(u.bi_mode) && (u.bi_flags & BCH_INODE_unlinked)) {
		/* Check for this early so that check_unreachable_inode() will reattach it */

		ret = bch2_empty_dir_snapshot(trans, k.k->p.offset, 0, k.k->p.snapshot);
		if (ret && ret != -BCH_ERR_ENOTEMPTY_dir_not_empty)
			goto err;

		/* clear the unlinked flag whether or not the dir was empty */
		fsck_err_on(ret, trans, inode_dir_unlinked_but_not_empty,
			    "dir unlinked but not empty\n%s",
			    (printbuf_reset(&buf),
			     bch2_inode_unpacked_to_text(&buf, &u),
			     buf.buf));
		u.bi_flags &= ~BCH_INODE_unlinked;
		do_update = true;
		ret = 0;
	}

	ret = bch2_inode_has_child_snapshots(trans, k.k->p);
	if (ret < 0)
		goto err;

	/* ret is now 0/1: whether child-snapshot versions of this inode exist */
	if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot),
			trans, inode_has_child_snapshots_wrong,
			"inode has_child_snapshots flag wrong (should be %u)\n%s",
			ret,
			(printbuf_reset(&buf),
			 bch2_inode_unpacked_to_text(&buf, &u),
			 buf.buf))) {
		if (ret)
			u.bi_flags |= BCH_INODE_has_child_snapshot;
		else
			u.bi_flags &= ~BCH_INODE_has_child_snapshot;
		do_update = true;
	}
	ret = 0;

	if ((u.bi_flags & BCH_INODE_unlinked) &&
	    !(u.bi_flags & BCH_INODE_has_child_snapshot)) {
		if (!test_bit(BCH_FS_started, &c->flags)) {
			/*
			 * If we're not in online fsck, don't delete unlinked
			 * inodes, just make sure they're on the deleted list.
			 *
			 * They might be referred to by a logged operation -
			 * i.e. we might have crashed in the middle of a
			 * truncate on an unlinked but open file - so we want to
			 * let the delete_dead_inodes kill it after resuming
			 * logged ops.
			 */
			ret = check_inode_deleted_list(trans, k.k->p);
			if (ret < 0)
				goto err_noprint;

			fsck_err_on(!ret,
				    trans, unlinked_inode_not_on_deleted_list,
				    "inode %llu:%u unlinked, but not on deleted list",
				    u.bi_inum, k.k->p.snapshot);

			ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, k.k->p, 1);
			if (ret)
				goto err;
		} else {
			ret = bch2_inode_or_descendents_is_open(trans, k.k->p);
			if (ret < 0)
				goto err;

			/* online fsck: safe to delete only if nothing has it open */
			if (fsck_err_on(!ret,
					trans, inode_unlinked_and_not_open,
					"inode %llu:%u unlinked and not open",
					u.bi_inum, u.bi_snapshot)) {
				ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
				bch_err_msg(c, ret, "in fsck deleting inode");
				goto err_noprint;
			}
			ret = 0;
		}
	}

	/* bi_parent_subvol is only meaningful on subvolume root inodes */
	if (fsck_err_on(u.bi_parent_subvol &&
			(u.bi_subvol == 0 ||
			 u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
			trans, inode_bi_parent_nonzero,
			"inode %llu:%u has subvol %u but nonzero parent subvol %u",
			u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
		u.bi_parent_subvol = 0;
		do_update = true;
	}

	if (u.bi_subvol) {
		/* note: shadows the snapshots_seen parameter @s in this scope */
		struct bch_subvolume s;

		ret = bch2_subvolume_get(trans, u.bi_subvol, false, &s);
		if (ret && !bch2_err_matches(ret, ENOENT))
			goto err;

		if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
			/* subvolumes btree was lost - reconstruct this one */
			ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum);
			goto do_update;
		}

		if (fsck_err_on(ret,
				trans, inode_bi_subvol_missing,
				"inode %llu:%u bi_subvol points to missing subvolume %u",
				u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
		    fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
				!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
							   k.k->p.snapshot),
				trans, inode_bi_subvol_wrong,
				"inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
				u.bi_inum, k.k->p.snapshot, u.bi_subvol,
				le64_to_cpu(s.inode),
				le32_to_cpu(s.snapshot))) {
			u.bi_subvol = 0;
			u.bi_parent_subvol = 0;
			do_update = true;
		}
	}

	if (fsck_err_on(u.bi_journal_seq > journal_cur_seq(&c->journal),
			trans, inode_journal_seq_in_future,
			"inode journal seq in future (currently at %llu)\n%s",
			journal_cur_seq(&c->journal),
			(printbuf_reset(&buf),
			 bch2_inode_unpacked_to_text(&buf, &u),
			 buf.buf))) {
		u.bi_journal_seq = journal_cur_seq(&c->journal);
		do_update = true;
	}
do_update:
	if (do_update) {
		ret = __bch2_fsck_write_inode(trans, &u);
		bch_err_msg(c, ret, "in fsck updating inode");
		if (ret)
			goto err_noprint;
	}
err:
fsck_err:
	bch_err_fn(c, ret);
err_noprint:
	printbuf_exit(&buf);
	return ret;
}
1267
/*
 * Walk every inode key in every snapshot and run check_inode() on each,
 * committing after each iteration.
 */
int bch2_check_inodes(struct bch_fs *c)
{
	struct bch_inode_unpacked snapshot_root = {};
	struct snapshots_seen s;

	snapshots_seen_init(&s);

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
				POS_MIN,
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_inode(trans, &iter, k, &snapshot_root, &s)));

	snapshots_seen_exit(&s);
	bch_err_fn(c, ret);
	return ret;
}
1286
/*
 * Walk from @inode's snapshot toward ancestor snapshots and advance *inode
 * to the oldest ancestor version that still needs reattaching; stops at the
 * first version that is not an inode, fails to unpack, or no longer needs
 * reattach.
 */
static int find_oldest_inode_needs_reattach(struct btree_trans *trans,
					    struct bch_inode_unpacked *inode)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	/*
	 * We look for inodes to reattach in natural key order, leaves first,
	 * but we should do the reattach at the oldest version that needs to be
	 * reattached:
	 */
	for_each_btree_key_norestart(trans, iter,
				     BTREE_ID_inodes,
				     SPOS(0, inode->bi_inum, inode->bi_snapshot + 1),
				     BTREE_ITER_all_snapshots, k, ret) {
		if (k.k->p.offset != inode->bi_inum)
			break;

		/* only consider ancestor snapshots of the current version */
		if (!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, k.k->p.snapshot))
			continue;

		if (!bkey_is_inode(k.k))
			break;

		struct bch_inode_unpacked parent_inode;
		ret = bch2_inode_unpack(k, &parent_inode);
		if (ret)
			break;

		if (!inode_should_reattach(&parent_inode))
			break;

		/* this older version also needs reattaching - keep walking */
		*inode = parent_inode;
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}
1327
/*
 * If @k is an inode that should have a dirent backpointer but doesn't
 * (inode_should_reattach()), reattach it to lost+found - at the oldest
 * snapshot version that needs it, found via
 * find_oldest_inode_needs_reattach().
 */
static int check_unreachable_inode(struct btree_trans *trans,
				   struct btree_iter *iter,
				   struct bkey_s_c k)
{
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (!bkey_is_inode(k.k))
		return 0;

	struct bch_inode_unpacked inode;
	ret = bch2_inode_unpack(k, &inode);
	if (ret)
		return ret;

	if (!inode_should_reattach(&inode))
		return 0;

	ret = find_oldest_inode_needs_reattach(trans, &inode);
	if (ret)
		return ret;

	if (fsck_err(trans, inode_unreachable,
		     "unreachable inode:\n%s",
		     (bch2_inode_unpacked_to_text(&buf, &inode),
		      buf.buf)))
		ret = reattach_inode(trans, &inode);
fsck_err:
	printbuf_exit(&buf);
	return ret;
}
1359
1360 /*
1361 * Reattach unreachable (but not unlinked) inodes
1362 *
 * Run after check_inodes() and check_dirents(), so we know that inode
1364 * backpointer fields point to valid dirents, and every inode that has a dirent
1365 * that points to it has its backpointer field set - so we're just looking for
1366 * non-unlinked inodes without backpointers:
1367 *
1368 * XXX: this is racy w.r.t. hardlink removal in online fsck
1369 */
int bch2_check_unreachable_inodes(struct bch_fs *c)
{
	/* walk every inode in every snapshot, committing after each check */
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
				POS_MIN,
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_unreachable_inode(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}
1381
btree_matches_i_mode(enum btree_id btree,unsigned mode)1382 static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode)
1383 {
1384 switch (btree) {
1385 case BTREE_ID_extents:
1386 return S_ISREG(mode) || S_ISLNK(mode);
1387 case BTREE_ID_dirents:
1388 return S_ISDIR(mode);
1389 case BTREE_ID_xattrs:
1390 return true;
1391 default:
1392 BUG();
1393 }
1394 }
1395
/*
 * Verify that a key in an inode-indexed btree (extents/dirents/xattrs) has a
 * matching inode of the right mode.
 *
 * @i is the inode_walker entry for this key's snapshot: possibly an ERR_PTR,
 * or NULL if no inode was found. Keys in missing inodes, or in inodes of the
 * wrong mode, are deleted - unless the inodes btree was lost, in which case
 * the inode is reconstructed from this key instead.
 */
static int check_key_has_inode(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct inode_walker *inode,
			       struct inode_walker_entry *i,
			       struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = PTR_ERR_OR_ZERO(i);
	if (ret)
		return ret;

	/* whiteouts don't need a backing inode */
	if (k.k->type == KEY_TYPE_whiteout)
		goto out;

	if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
		/* inodes btree was lost - reconstruct the inode from this key */
		ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?:
			bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
		if (ret)
			goto err;

		/* rewind the walker so it picks up the new inode, and restart */
		inode->last_pos.inode--;
		ret = -BCH_ERR_transaction_restart_nested;
		goto err;
	}

	if (fsck_err_on(!i,
			trans, key_in_missing_inode,
			"key in missing inode:\n %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
		goto delete;

	if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode),
			trans, key_in_wrong_inode_type,
			"key for wrong inode mode %o:\n %s",
			i->inode.bi_mode,
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
		goto delete;
out:
err:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
delete:
	ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
	goto out;
}
1446
check_i_sectors_notnested(struct btree_trans * trans,struct inode_walker * w)1447 static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
1448 {
1449 struct bch_fs *c = trans->c;
1450 int ret = 0;
1451 s64 count2;
1452
1453 darray_for_each(w->inodes, i) {
1454 if (i->inode.bi_sectors == i->count)
1455 continue;
1456
1457 count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);
1458
1459 if (w->recalculate_sums)
1460 i->count = count2;
1461
1462 if (i->count != count2) {
1463 bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
1464 w->last_pos.inode, i->snapshot, i->count, count2);
1465 i->count = count2;
1466 }
1467
1468 if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
1469 trans, inode_i_sectors_wrong,
1470 "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
1471 w->last_pos.inode, i->snapshot,
1472 i->inode.bi_sectors, i->count)) {
1473 i->inode.bi_sectors = i->count;
1474 ret = bch2_fsck_write_inode(trans, &i->inode);
1475 if (ret)
1476 break;
1477 }
1478 }
1479 fsck_err:
1480 bch_err_fn(c, ret);
1481 return ret;
1482 }
1483
check_i_sectors(struct btree_trans * trans,struct inode_walker * w)1484 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
1485 {
1486 u32 restart_count = trans->restart_count;
1487 return check_i_sectors_notnested(trans, w) ?:
1488 trans_was_restarted(trans, restart_count);
1489 }
1490
/*
 * Record of where an extent ended in a given snapshot - used by
 * check_overlapping_extents() to detect extents that overlap with
 * previously seen extents in visible snapshots.
 */
struct extent_end {
	u32 snapshot;			/* snapshot the extent was in */
	u64 offset;			/* end offset of the extent */
	struct snapshots_seen seen;	/* private copy of snapshots seen at that extent */
};

struct extent_ends {
	/* last extent position processed - used to detect restart re-runs */
	struct bpos last_pos;
	/* at most one entry per snapshot, kept sorted by snapshot id */
	DARRAY(struct extent_end) e;
};
1501
extent_ends_reset(struct extent_ends * extent_ends)1502 static void extent_ends_reset(struct extent_ends *extent_ends)
1503 {
1504 darray_for_each(extent_ends->e, i)
1505 snapshots_seen_exit(&i->seen);
1506 extent_ends->e.nr = 0;
1507 }
1508
extent_ends_exit(struct extent_ends * extent_ends)1509 static void extent_ends_exit(struct extent_ends *extent_ends)
1510 {
1511 extent_ends_reset(extent_ends);
1512 darray_exit(&extent_ends->e);
1513 }
1514
/* Start empty: all fields zeroed, no allocations made yet. */
static void extent_ends_init(struct extent_ends *extent_ends)
{
	memset(extent_ends, 0, sizeof(*extent_ends));
}
1519
/*
 * Record the end position of extent @k in @extent_ends for later overlap
 * checks.
 *
 * Keeps at most one entry per snapshot, sorted by snapshot id; takes a deep
 * copy of @seen's id list so the entry stays valid after the caller's
 * snapshots_seen is updated.
 */
static int extent_ends_at(struct bch_fs *c,
			  struct extent_ends *extent_ends,
			  struct snapshots_seen *seen,
			  struct bkey_s_c k)
{
	struct extent_end *i, n = (struct extent_end) {
		.offset = k.k->p.offset,
		.snapshot = k.k->p.snapshot,
		.seen = *seen,
	};

	/* deep-copy the ids array (note: copies full capacity, not just nr) */
	n.seen.ids.data = kmemdup(seen->ids.data,
				  sizeof(seen->ids.data[0]) * seen->ids.size,
				  GFP_KERNEL);
	if (!n.seen.ids.data)
		return -BCH_ERR_ENOMEM_fsck_extent_ends_at;

	__darray_for_each(extent_ends->e, i) {
		/* existing entry for this snapshot: replace it */
		if (i->snapshot == k.k->p.snapshot) {
			snapshots_seen_exit(&i->seen);
			*i = n;
			return 0;
		}

		if (i->snapshot >= k.k->p.snapshot)
			break;
	}

	/* insert at @i, keeping the array sorted by snapshot */
	return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
}
1550
/*
 * Repair a pair of overlapping extents: re-look up both extents with
 * non-extent iterators, verify they're still where we recorded them, and
 * (with user confirmation via fsck_err) overwrite one with the other.
 *
 * @pos1/@pos1_seen identify the previously recorded extent, @pos2 the one
 * just seen. Which extent gets overwritten depends on the snapshot ids;
 * *@fixed is set so the caller knows to recalculate i_sectors, and
 * @extent_end is adjusted when the repair shrinks the recorded extent.
 */
static int overlapping_extents_found(struct btree_trans *trans,
				     enum btree_id btree,
				     struct bpos pos1, struct snapshots_seen *pos1_seen,
				     struct bkey pos2,
				     bool *fixed,
				     struct extent_end *extent_end)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	struct btree_iter iter1, iter2 = { NULL };
	struct bkey_s_c k1, k2;
	int ret;

	/* caller guarantees pos1 ends after pos2 starts */
	BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));

	bch2_trans_iter_init(trans, &iter1, btree, pos1,
			     BTREE_ITER_all_snapshots|
			     BTREE_ITER_not_extents);
	k1 = bch2_btree_iter_peek_max(&iter1, POS(pos1.inode, U64_MAX));
	ret = bkey_err(k1);
	if (ret)
		goto err;

	prt_str(&buf, "\n ");
	bch2_bkey_val_to_text(&buf, c, k1);

	if (!bpos_eq(pos1, k1.k->p)) {
		/* first extent is no longer where we recorded it - give up */
		prt_str(&buf, "\n wanted\n ");
		bch2_bpos_to_text(&buf, pos1);
		prt_str(&buf, "\n ");
		bch2_bkey_to_text(&buf, &pos2);

		bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
			__func__, buf.buf);
		ret = -BCH_ERR_internal_fsck_err;
		goto err;
	}

	bch2_trans_copy_iter(&iter2, &iter1);

	/* advance the second iterator until we reach the second extent */
	while (1) {
		bch2_btree_iter_advance(&iter2);

		k2 = bch2_btree_iter_peek_max(&iter2, POS(pos1.inode, U64_MAX));
		ret = bkey_err(k2);
		if (ret)
			goto err;

		if (bpos_ge(k2.k->p, pos2.p))
			break;
	}

	prt_str(&buf, "\n ");
	bch2_bkey_val_to_text(&buf, c, k2);

	if (bpos_gt(k2.k->p, pos2.p) ||
	    pos2.size != k2.k->size) {
		bch_err(c, "%s: error finding seconding overlapping extent when repairing%s",
			__func__, buf.buf);
		ret = -BCH_ERR_internal_fsck_err;
		goto err;
	}

	prt_printf(&buf, "\n overwriting %s extent",
		   pos1.snapshot >= pos2.p.snapshot ? "first" : "second");

	if (fsck_err(trans, extent_overlapping,
		     "overlapping extents%s", buf.buf)) {
		/* which extent to overwrite depends on the snapshot ids */
		struct btree_iter *old_iter = &iter1;
		struct disk_reservation res = { 0 };

		if (pos1.snapshot < pos2.p.snapshot) {
			old_iter = &iter2;
			swap(k1, k2);
		}

		trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);

		ret = bch2_trans_update_extent_overwrite(trans, old_iter,
				BTREE_UPDATE_internal_snapshot_node,
				k1, k2) ?:
			bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
		bch2_disk_reservation_put(c, &res);

		if (ret)
			goto err;

		*fixed = true;

		if (pos1.snapshot == pos2.p.snapshot) {
			/*
			 * We overwrote the first extent, and did the overwrite
			 * in the same snapshot:
			 */
			extent_end->offset = bkey_start_offset(&pos2);
		} else if (pos1.snapshot > pos2.p.snapshot) {
			/*
			 * We overwrote the first extent in pos2's snapshot:
			 */
			ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
		} else {
			/*
			 * We overwrote the second extent - restart
			 * check_extent() from the top:
			 */
			ret = -BCH_ERR_transaction_restart_nested;
		}
	}
fsck_err:
err:
	bch2_trans_iter_exit(trans, &iter2);
	bch2_trans_iter_exit(trans, &iter1);
	printbuf_exit(&buf);
	return ret;
}
1666
/*
 * Check extent @k against the recorded ends of previously seen extents in
 * other snapshots; overlapping pairs visible in a common snapshot are
 * repaired via overlapping_extents_found().
 */
static int check_overlapping_extents(struct btree_trans *trans,
				     struct snapshots_seen *seen,
				     struct extent_ends *extent_ends,
				     struct bkey_s_c k,
				     struct btree_iter *iter,
				     bool *fixed)
{
	struct bch_fs *c = trans->c;
	int ret = 0;

	/* transaction restart, running again */
	if (bpos_eq(extent_ends->last_pos, k.k->p))
		return 0;

	/* new inode: previously recorded ends are no longer relevant */
	if (extent_ends->last_pos.inode != k.k->p.inode)
		extent_ends_reset(extent_ends);

	darray_for_each(extent_ends->e, i) {
		/* no overlap if the recorded extent ends before @k starts */
		if (i->offset <= bkey_start_offset(k.k))
			continue;

		/* only an overlap if both versions are visible together */
		if (!ref_visible2(c,
				  k.k->p.snapshot, seen,
				  i->snapshot, &i->seen))
			continue;

		ret = overlapping_extents_found(trans, iter->btree_id,
						SPOS(iter->pos.inode,
						     i->offset,
						     i->snapshot),
						&i->seen,
						*k.k, fixed, i);
		if (ret)
			goto err;
	}

	extent_ends->last_pos = k.k->p;
err:
	return ret;
}
1707
/*
 * Report-only check (no repair): warn about extents with an encoded
 * (checksummed/compressed) crc region larger than opts.encoded_extent_max.
 */
static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
				struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	struct bch_extent_crc_unpacked crc;
	const union bch_extent_entry *i;
	/* opts value is in bytes; crc sizes are in 512-byte sectors */
	unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;

	bkey_for_each_crc(k.k, ptrs, crc, i)
		if (crc_is_encoded(crc) &&
		    crc.uncompressed_size > encoded_extent_max_sectors) {
			struct printbuf buf = PRINTBUF;

			bch2_bkey_val_to_text(&buf, c, k);
			bch_err(c, "overbig encoded extent, please report this:\n %s", buf.buf);
			printbuf_exit(&buf);
		}

	return 0;
}
1729
/*
 * Per-extent fsck check: validates the key's snapshot, its owning inode,
 * overlap with previously seen extents, and position vs. i_size; also
 * accumulates i_sectors counts in @inode for check_i_sectors().
 */
static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
			struct bkey_s_c k,
			struct inode_walker *inode,
			struct snapshots_seen *s,
			struct extent_ends *extent_ends,
			struct disk_reservation *res)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret) {
		/* positive return: key handled; not an error */
		ret = ret < 0 ? ret : 0;
		goto out;
	}

	/* moved to a new inode: finalize i_sectors for the previous one */
	if (inode->last_pos.inode != k.k->p.inode && inode->have_inodes) {
		ret = check_i_sectors(trans, inode);
		if (ret)
			goto err;
	}

	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
	if (ret)
		goto err;

	struct inode_walker_entry *extent_i = walk_inode(trans, inode, k);
	ret = PTR_ERR_OR_ZERO(extent_i);
	if (ret)
		goto err;

	ret = check_key_has_inode(trans, iter, inode, extent_i, k);
	if (ret)
		goto err;

	if (k.k->type != KEY_TYPE_whiteout) {
		ret = check_overlapping_extents(trans, s, extent_ends, k, iter,
						&inode->recalculate_sums);
		if (ret)
			goto err;

		/*
		 * Check inodes in reverse order, from oldest snapshots to
		 * newest, starting from the inode that matches this extent's
		 * snapshot. If we didn't have one, iterate over all inodes:
		 */
		for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
		     inode->inodes.data && i >= inode->inodes.data;
		     --i) {
			if (i->snapshot > k.k->p.snapshot ||
			    !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
				continue;

			if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
					!bkey_extent_is_reservation(k),
					trans, extent_past_end_of_inode,
					"extent type past end of inode %llu:%u, i_size %llu\n %s",
					i->inode.bi_inum, i->snapshot, i->inode.bi_size,
					(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
				struct btree_iter iter2;

				/* delete the extent in this snapshot only */
				bch2_trans_copy_iter(&iter2, iter);
				bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
				ret = bch2_btree_iter_traverse(&iter2) ?:
					bch2_btree_delete_at(trans, &iter2,
						BTREE_UPDATE_internal_snapshot_node);
				bch2_trans_iter_exit(trans, &iter2);
				if (ret)
					goto err;

				iter->k.type = KEY_TYPE_whiteout;
				break;
			}
		}
	}

	ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc);
	if (ret)
		goto err;

	/* accumulate i_sectors in every inode version this extent is visible in */
	if (bkey_extent_is_allocation(k.k)) {
		for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
		     inode->inodes.data && i >= inode->inodes.data;
		     --i) {
			if (i->snapshot > k.k->p.snapshot ||
			    !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
				continue;

			i->count += k.k->size;
		}
	}

	/* record where this extent ends, for overlap checks on later keys */
	if (k.k->type != KEY_TYPE_whiteout) {
		ret = extent_ends_at(c, extent_ends, s, k);
		if (ret)
			goto err;
	}
out:
err:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}
1835
1836 /*
1837 * Walk extents: verify that extents have a corresponding S_ISREG inode, and
 * that i_size and i_sectors are consistent
1839 */
int bch2_check_extents(struct bch_fs *c)
{
	struct inode_walker w = inode_walker_init();
	struct snapshots_seen s;
	struct extent_ends extent_ends;
	struct disk_reservation res = { 0 };

	snapshots_seen_init(&s);
	extent_ends_init(&extent_ends);

	/*
	 * Walk all extents in all snapshots; each iteration drops the
	 * previous iteration's disk reservation before checking the key.
	 * After the walk, run the final i_sectors check for the last inode.
	 */
	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_extents,
				POS(BCACHEFS_ROOT_INO, 0),
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
			bch2_disk_reservation_put(c, &res);
			check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?:
			check_extent_overbig(trans, &iter, k);
		})) ?:
		check_i_sectors_notnested(trans, &w));

	bch2_disk_reservation_put(c, &res);
	extent_ends_exit(&extent_ends);
	inode_walker_exit(&w);
	snapshots_seen_exit(&s);

	bch_err_fn(c, ret);
	return ret;
}
1868
/*
 * Walk the reflink (indirect extent) btree; indirect extents have no owning
 * inode, so only the overbig-encoded-extent check applies here.
 */
int bch2_check_indirect_extents(struct bch_fs *c)
{
	struct disk_reservation res = { 0 };

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
				POS_MIN,
				BTREE_ITER_prefetch, k,
				&res, NULL,
				BCH_TRANS_COMMIT_no_enospc, ({
			bch2_disk_reservation_put(c, &res);
			check_extent_overbig(trans, &iter, k);
		})));

	bch2_disk_reservation_put(c, &res);
	bch_err_fn(c, ret);
	return ret;
}
1887
/*
 * Verify each directory inode version's i_nlink against the subdirectory
 * count accumulated while walking dirents (w->inodes[].count), repairing
 * the inode if they disagree; cross-checks with bch2_count_subdirs()
 * before repairing.
 */
static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w)
{
	struct bch_fs *c = trans->c;
	int ret = 0;
	s64 count2;

	darray_for_each(w->inodes, i) {
		if (i->inode.bi_nlink == i->count)
			continue;

		count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot);
		/* negative return is an error code, not a count */
		if (count2 < 0)
			return count2;

		/* recount disagrees with the walker's running total */
		if (i->count != count2) {
			bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
					    w->last_pos.inode, i->snapshot, i->count, count2);
			i->count = count2;
			if (i->inode.bi_nlink == i->count)
				continue;
		}

		if (fsck_err_on(i->inode.bi_nlink != i->count,
				trans, inode_dir_wrong_nlink,
				"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
				w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
			i->inode.bi_nlink = i->count;
			ret = bch2_fsck_write_inode(trans, &i->inode);
			if (ret)
				break;
		}
	}
fsck_err:
	bch_err_fn(c, ret);
	return ret;
}
1924
check_subdir_dirents_count(struct btree_trans * trans,struct inode_walker * w)1925 static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_walker *w)
1926 {
1927 u32 restart_count = trans->restart_count;
1928 return check_subdir_count_notnested(trans, w) ?:
1929 trans_was_restarted(trans, restart_count);
1930 }
1931
/* find a subvolume that's a descendant of @snapshot: */
/*
 * Find a subvolume whose snapshot has @snapshot as an ancestor, storing its
 * id in *subvolid. Scans the whole subvolumes btree; returns -ENOENT if no
 * such subvolume exists.
 *
 * Fix vs. previous version: the key was read (k.k->p.offset) *after*
 * bch2_trans_iter_exit() had already been called on the iterator that
 * produced it, and the iterator was then exited a second time at found:.
 * Read the offset first and exit the iterator exactly once.
 */
static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) {
		if (k.k->type != KEY_TYPE_subvolume)
			continue;

		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
		if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) {
			*subvolid = k.k->p.offset;
			goto found;
		}
	}
	if (!ret)
		ret = -ENOENT;
found:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
1956
1957 noinline_for_stack
check_dirent_to_subvol(struct btree_trans * trans,struct btree_iter * iter,struct bkey_s_c_dirent d)1958 static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
1959 struct bkey_s_c_dirent d)
1960 {
1961 struct bch_fs *c = trans->c;
1962 struct btree_iter subvol_iter = {};
1963 struct bch_inode_unpacked subvol_root;
1964 u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
1965 u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
1966 u32 parent_snapshot;
1967 u32 new_parent_subvol = 0;
1968 u64 parent_inum;
1969 struct printbuf buf = PRINTBUF;
1970 int ret = 0;
1971
1972 ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum);
1973 if (ret && !bch2_err_matches(ret, ENOENT))
1974 return ret;
1975
1976 if (ret ||
1977 (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) {
1978 int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
1979 if (ret2 && !bch2_err_matches(ret, ENOENT))
1980 return ret2;
1981 }
1982
1983 if (ret &&
1984 !new_parent_subvol &&
1985 (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
1986 /*
1987 * Couldn't find a subvol for dirent's snapshot - but we lost
1988 * subvols, so we need to reconstruct:
1989 */
1990 ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0);
1991 if (ret)
1992 return ret;
1993
1994 parent_snapshot = d.k->p.snapshot;
1995 }
1996
1997 if (fsck_err_on(ret,
1998 trans, dirent_to_missing_parent_subvol,
1999 "dirent parent_subvol points to missing subvolume\n%s",
2000 (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
2001 fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
2002 trans, dirent_not_visible_in_parent_subvol,
2003 "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
2004 parent_snapshot,
2005 (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
2006 if (!new_parent_subvol) {
2007 bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
2008 return -BCH_ERR_fsck_repair_unimplemented;
2009 }
2010
2011 struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
2012 ret = PTR_ERR_OR_ZERO(new_dirent);
2013 if (ret)
2014 goto err;
2015
2016 new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol);
2017 }
2018
2019 struct bkey_s_c_subvolume s =
2020 bch2_bkey_get_iter_typed(trans, &subvol_iter,
2021 BTREE_ID_subvolumes, POS(0, target_subvol),
2022 0, subvolume);
2023 ret = bkey_err(s.s_c);
2024 if (ret && !bch2_err_matches(ret, ENOENT))
2025 return ret;
2026
2027 if (ret) {
2028 if (fsck_err(trans, dirent_to_missing_subvol,
2029 "dirent points to missing subvolume\n%s",
2030 (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
2031 return bch2_fsck_remove_dirent(trans, d.k->p);
2032 ret = 0;
2033 goto out;
2034 }
2035
2036 if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol,
2037 trans, subvol_fs_path_parent_wrong,
2038 "subvol with wrong fs_path_parent, should be be %u\n%s",
2039 parent_subvol,
2040 (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
2041 struct bkey_i_subvolume *n =
2042 bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
2043 ret = PTR_ERR_OR_ZERO(n);
2044 if (ret)
2045 goto err;
2046
2047 n->v.fs_path_parent = cpu_to_le32(parent_subvol);
2048 }
2049
2050 u64 target_inum = le64_to_cpu(s.v->inode);
2051 u32 target_snapshot = le32_to_cpu(s.v->snapshot);
2052
2053 ret = lookup_inode(trans, target_inum, target_snapshot, &subvol_root);
2054 if (ret && !bch2_err_matches(ret, ENOENT))
2055 goto err;
2056
2057 if (ret) {
2058 bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
2059 ret = -BCH_ERR_fsck_repair_unimplemented;
2060 goto err;
2061 }
2062
2063 if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol,
2064 trans, inode_bi_parent_wrong,
2065 "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
2066 target_inum,
2067 subvol_root.bi_parent_subvol, parent_subvol)) {
2068 subvol_root.bi_parent_subvol = parent_subvol;
2069 subvol_root.bi_snapshot = le32_to_cpu(s.v->snapshot);
2070 ret = __bch2_fsck_write_inode(trans, &subvol_root);
2071 if (ret)
2072 goto err;
2073 }
2074
2075 ret = bch2_check_dirent_target(trans, iter, d, &subvol_root, true);
2076 if (ret)
2077 goto err;
2078 out:
2079 err:
2080 fsck_err:
2081 bch2_trans_iter_exit(trans, &subvol_iter);
2082 printbuf_exit(&buf);
2083 return ret;
2084 }
2085
/*
 * Check a single key in the dirents btree: verify it has a valid snapshot,
 * that its parent directory inode exists, that it hashes correctly, and that
 * its target (regular inode or subvolume) exists.
 *
 * Returns 0 on success or a negative bch2 error code; transaction restarts
 * are returned to the caller.
 */
static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
			struct bkey_s_c k,
			struct bch_hash_info *hash_info,
			struct inode_walker *dir,
			struct inode_walker *target,
			struct snapshots_seen *s)
{
	struct bch_fs *c = trans->c;
	struct inode_walker_entry *i;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret) {
		/* positive return means the key was handled/deleted, not an error */
		ret = ret < 0 ? ret : 0;
		goto out;
	}

	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
	if (ret)
		goto err;

	if (k.k->type == KEY_TYPE_whiteout)
		goto out;

	/* crossed into a new directory: verify the previous one's subdir count */
	if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) {
		ret = check_subdir_dirents_count(trans, dir);
		if (ret)
			goto err;
	}

	i = walk_inode(trans, dir, k);
	ret = PTR_ERR_OR_ZERO(i);
	if (ret < 0)
		goto err;

	ret = check_key_has_inode(trans, iter, dir, i, k);
	if (ret)
		goto err;

	/* no parent inode visible in this snapshot: nothing more to check */
	if (!i)
		goto out;

	/* hash info (seed, type) comes from the directory inode */
	if (dir->first_this_inode)
		*hash_info = bch2_hash_info_init(c, &i->inode);
	dir->first_this_inode = false;

	ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, iter, k);
	if (ret < 0)
		goto err;
	if (ret) {
		/* dirent has been deleted */
		ret = 0;
		goto out;
	}

	if (k.k->type != KEY_TYPE_dirent)
		goto out;

	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);

	if (d.v->d_type == DT_SUBVOL) {
		ret = check_dirent_to_subvol(trans, iter, d);
		if (ret)
			goto err;
	} else {
		/* collect the target inode in all snapshots where it's visible */
		ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
		if (ret)
			goto err;

		if (fsck_err_on(!target->inodes.nr,
				trans, dirent_to_missing_inode,
				"dirent points to missing inode:\n%s",
				(printbuf_reset(&buf),
				 bch2_bkey_val_to_text(&buf, c, k),
				 buf.buf))) {
			ret = bch2_fsck_remove_dirent(trans, d.k->p);
			if (ret)
				goto err;
		}

		darray_for_each(target->inodes, i) {
			ret = bch2_check_dirent_target(trans, iter, d, &i->inode, true);
			if (ret)
				goto err;
		}

		/*
		 * The target was overwritten in some snapshots: delete this
		 * dirent in those snapshots, unless we already saw a dirent
		 * key there.
		 */
		darray_for_each(target->deletes, i)
			if (fsck_err_on(!snapshot_list_has_id(&s->ids, *i),
					trans, dirent_to_overwritten_inode,
					"dirent points to inode overwritten in snapshot %u:\n%s",
					*i,
					(printbuf_reset(&buf),
					 bch2_bkey_val_to_text(&buf, c, k),
					 buf.buf))) {
				struct btree_iter delete_iter;
				bch2_trans_iter_init(trans, &delete_iter,
						     BTREE_ID_dirents,
						     SPOS(k.k->p.inode, k.k->p.offset, *i),
						     BTREE_ITER_intent);
				ret = bch2_btree_iter_traverse(&delete_iter) ?:
				      bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
							  hash_info,
							  &delete_iter,
							  BTREE_UPDATE_internal_snapshot_node);
				bch2_trans_iter_exit(trans, &delete_iter);
				if (ret)
					goto err;

			}
	}

	ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
	if (ret)
		goto err;

	/* accumulate subdir count and i_size for the parent directory(s) */
	for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) {
		if (d.v->d_type == DT_DIR)
			i->count++;
		i->i_size += bkey_bytes(d.k);
	}
out:
err:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}
2214
2215 /*
2216 * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
2217 * validate d_type
2218 */
int bch2_check_dirents(struct bch_fs *c)
{
	struct inode_walker dir = inode_walker_init();
	struct inode_walker target = inode_walker_init();
	struct snapshots_seen s;
	struct bch_hash_info hash_info;

	snapshots_seen_init(&s);

	/*
	 * Walk every dirent key (all snapshots); after the walk, verify the
	 * subdir count of the last directory visited, which the per-key
	 * callback won't have flushed.
	 */
	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_dirents,
				POS(BCACHEFS_ROOT_INO, 0),
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
			check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
		check_subdir_count_notnested(trans, &dir));

	snapshots_seen_exit(&s);
	inode_walker_exit(&dir);
	inode_walker_exit(&target);
	bch_err_fn(c, ret);
	return ret;
}
2241
/*
 * Check a single xattr key: verify its snapshot is valid, its owning inode
 * exists, and that it's hashed correctly within the inode's hash table.
 */
static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
		       struct bkey_s_c k,
		       struct bch_hash_info *hash_info,
		       struct inode_walker *inode)
{
	struct bch_fs *c = trans->c;
	struct inode_walker_entry *i;
	int ret;

	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret < 0)
		return ret;
	if (ret)
		/* positive return: key was handled/deleted */
		return 0;

	i = walk_inode(trans, inode, k);
	ret = PTR_ERR_OR_ZERO(i);
	if (ret)
		return ret;

	ret = check_key_has_inode(trans, iter, inode, i, k);
	if (ret)
		return ret;

	/* no owning inode visible in this snapshot */
	if (!i)
		return 0;

	/* hash seed/type come from the owning inode */
	if (inode->first_this_inode)
		*hash_info = bch2_hash_info_init(c, &i->inode);
	inode->first_this_inode = false;

	ret = bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, iter, k);
	bch_err_fn(c, ret);
	return ret;
}
2277
2278 /*
2279 * Walk xattrs: verify that they all have a corresponding inode
2280 */
int bch2_check_xattrs(struct bch_fs *c)
{
	struct inode_walker inode = inode_walker_init();
	struct bch_hash_info hash_info;
	int ret = 0;

	/* walk every xattr key in every snapshot, committing per key */
	ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
			POS(BCACHEFS_ROOT_INO, 0),
			BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
			k,
			NULL, NULL,
			BCH_TRANS_COMMIT_no_enospc,
		check_xattr(trans, &iter, k, &hash_info, &inode)));

	inode_walker_exit(&inode);
	bch_err_fn(c, ret);
	return ret;
}
2300
/*
 * Verify the root subvolume and root directory inode exist, recreating
 * either if missing.
 */
static int check_root_trans(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct bch_inode_unpacked root_inode;
	u32 snapshot;
	u64 inum;
	int ret;

	ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (mustfix_fsck_err_on(ret, trans, root_subvol_missing,
				"root subvol missing")) {
		struct bkey_i_subvolume *root_subvol =
			bch2_trans_kmalloc(trans, sizeof(*root_subvol));
		ret = PTR_ERR_OR_ZERO(root_subvol);
		if (ret)
			goto err;

		/* recreate with the well-known root snapshot/inode numbers */
		snapshot = U32_MAX;
		inum = BCACHEFS_ROOT_INO;

		bkey_subvolume_init(&root_subvol->k_i);
		root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL;
		root_subvol->v.flags	= 0;
		root_subvol->v.snapshot	= cpu_to_le32(snapshot);
		root_subvol->v.inode	= cpu_to_le64(inum);
		ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0);
		bch_err_msg(c, ret, "writing root subvol");
		if (ret)
			goto err;
	}

	ret = lookup_inode(trans, BCACHEFS_ROOT_INO, snapshot, &root_inode);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (mustfix_fsck_err_on(ret,
				trans, root_dir_missing,
				"root directory missing") ||
	    mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
				trans, root_inode_not_dir,
				"root inode not a directory")) {
		/* reinitialize the root directory inode as a fresh 0755 dir */
		bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
				0, NULL);
		root_inode.bi_inum = inum;
		root_inode.bi_snapshot = snapshot;

		ret = __bch2_fsck_write_inode(trans, &root_inode);
		bch_err_msg(c, ret, "writing root inode");
	}
err:
fsck_err:
	return ret;
}
2357
2358 /* Get root directory, create if it doesn't exist: */
/* Get root directory, create if it doesn't exist (runs in one transaction) */
int bch2_check_root(struct bch_fs *c)
{
	int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		check_root_trans(trans));
	bch_err_fn(c, ret);
	return ret;
}
2366
2367 typedef DARRAY(u32) darray_u32;
2368
darray_u32_has(darray_u32 * d,u32 v)2369 static bool darray_u32_has(darray_u32 *d, u32 v)
2370 {
2371 darray_for_each(*d, i)
2372 if (*i == v)
2373 return true;
2374 return false;
2375 }
2376
check_subvol_path(struct btree_trans * trans,struct btree_iter * iter,struct bkey_s_c k)2377 static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
2378 {
2379 struct bch_fs *c = trans->c;
2380 struct btree_iter parent_iter = {};
2381 darray_u32 subvol_path = {};
2382 struct printbuf buf = PRINTBUF;
2383 int ret = 0;
2384
2385 if (k.k->type != KEY_TYPE_subvolume)
2386 return 0;
2387
2388 while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) {
2389 ret = darray_push(&subvol_path, k.k->p.offset);
2390 if (ret)
2391 goto err;
2392
2393 struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
2394
2395 struct bch_inode_unpacked subvol_root;
2396 ret = bch2_inode_find_by_inum_trans(trans,
2397 (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
2398 &subvol_root);
2399 if (ret)
2400 break;
2401
2402 u32 parent = le32_to_cpu(s.v->fs_path_parent);
2403
2404 if (darray_u32_has(&subvol_path, parent)) {
2405 if (fsck_err(c, subvol_loop, "subvolume loop"))
2406 ret = reattach_subvol(trans, s);
2407 break;
2408 }
2409
2410 bch2_trans_iter_exit(trans, &parent_iter);
2411 bch2_trans_iter_init(trans, &parent_iter,
2412 BTREE_ID_subvolumes, POS(0, parent), 0);
2413 k = bch2_btree_iter_peek_slot(&parent_iter);
2414 ret = bkey_err(k);
2415 if (ret)
2416 goto err;
2417
2418 if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
2419 trans, subvol_unreachable,
2420 "unreachable subvolume %s",
2421 (bch2_bkey_val_to_text(&buf, c, s.s_c),
2422 buf.buf))) {
2423 ret = reattach_subvol(trans, s);
2424 break;
2425 }
2426 }
2427 fsck_err:
2428 err:
2429 printbuf_exit(&buf);
2430 darray_exit(&subvol_path);
2431 bch2_trans_iter_exit(trans, &parent_iter);
2432 return ret;
2433 }
2434
/* Check that every subvolume's fs_path_parent chain reaches the root subvol */
int bch2_check_subvolume_structure(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
				BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_subvol_path(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}
2445
/* One step of a directory-structure walk: an inode and the snapshot it was seen in */
struct pathbuf_entry {
	u64	inum;
	u32	snapshot;
};

typedef DARRAY(struct pathbuf_entry) pathbuf;
2452
/*
 * Rewrite a single inode's bi_depth to @new_depth (no-op if already correct),
 * committing the change immediately.
 */
static int bch2_bi_depth_renumber_one(struct btree_trans *trans, struct pathbuf_entry *p,
				      u32 new_depth)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
					       SPOS(0, p->inum, p->snapshot), 0);

	struct bch_inode_unpacked inode;
	int ret = bkey_err(k) ?:
		!bkey_is_inode(k.k) ? -BCH_ERR_ENOENT_inode
		: bch2_inode_unpack(k, &inode);
	if (ret)
		goto err;

	if (inode.bi_depth != new_depth) {
		inode.bi_depth = new_depth;
		ret = __bch2_fsck_write_inode(trans, &inode) ?:
			bch2_trans_commit(trans, NULL, NULL, 0);
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
2476
/*
 * Renumber bi_depth along @path: walking from the deepest entry back toward
 * the start, assign increasing depths starting at @new_bi_depth.
 */
static int bch2_bi_depth_renumber(struct btree_trans *trans, pathbuf *path, u32 new_bi_depth)
{
	u32 restart_count = trans->restart_count;
	int ret = 0;

	darray_for_each_reverse(*path, i) {
		ret = nested_lockrestart_do(trans,
				bch2_bi_depth_renumber_one(trans, i, new_bi_depth));
		bch_err_fn(trans->c, ret);
		if (ret)
			break;

		new_bi_depth++;
	}

	/* report a restart to the caller if one occurred inside the nested loop */
	return ret ?: trans_was_restarted(trans, restart_count);
}
2494
path_is_dup(pathbuf * p,u64 inum,u32 snapshot)2495 static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
2496 {
2497 darray_for_each(*p, i)
2498 if (i->inum == inum &&
2499 i->snapshot == snapshot)
2500 return true;
2501 return false;
2502 }
2503
check_path_loop(struct btree_trans * trans,struct bkey_s_c inode_k)2504 static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k)
2505 {
2506 struct bch_fs *c = trans->c;
2507 struct btree_iter inode_iter = {};
2508 pathbuf path = {};
2509 struct printbuf buf = PRINTBUF;
2510 u32 snapshot = inode_k.k->p.snapshot;
2511 bool redo_bi_depth = false;
2512 u32 min_bi_depth = U32_MAX;
2513 int ret = 0;
2514
2515 struct bch_inode_unpacked inode;
2516 ret = bch2_inode_unpack(inode_k, &inode);
2517 if (ret)
2518 return ret;
2519
2520 while (!inode.bi_subvol) {
2521 struct btree_iter dirent_iter;
2522 struct bkey_s_c_dirent d;
2523 u32 parent_snapshot = snapshot;
2524
2525 d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot);
2526 ret = bkey_err(d.s_c);
2527 if (ret && !bch2_err_matches(ret, ENOENT))
2528 goto out;
2529
2530 if (!ret && (ret = dirent_points_to_inode(c, d, &inode)))
2531 bch2_trans_iter_exit(trans, &dirent_iter);
2532
2533 if (bch2_err_matches(ret, ENOENT)) {
2534 printbuf_reset(&buf);
2535 bch2_bkey_val_to_text(&buf, c, inode_k);
2536 bch_err(c, "unreachable inode in check_directory_structure: %s\n%s",
2537 bch2_err_str(ret), buf.buf);
2538 goto out;
2539 }
2540
2541 bch2_trans_iter_exit(trans, &dirent_iter);
2542
2543 ret = darray_push(&path, ((struct pathbuf_entry) {
2544 .inum = inode.bi_inum,
2545 .snapshot = snapshot,
2546 }));
2547 if (ret)
2548 return ret;
2549
2550 snapshot = parent_snapshot;
2551
2552 bch2_trans_iter_exit(trans, &inode_iter);
2553 inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
2554 SPOS(0, inode.bi_dir, snapshot), 0);
2555
2556 struct bch_inode_unpacked parent_inode;
2557 ret = bkey_err(inode_k) ?:
2558 !bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
2559 : bch2_inode_unpack(inode_k, &parent_inode);
2560 if (ret) {
2561 /* Should have been caught in dirents pass */
2562 bch_err_msg(c, ret, "error looking up parent directory");
2563 goto out;
2564 }
2565
2566 min_bi_depth = parent_inode.bi_depth;
2567
2568 if (parent_inode.bi_depth < inode.bi_depth &&
2569 min_bi_depth < U16_MAX)
2570 break;
2571
2572 inode = parent_inode;
2573 snapshot = inode_k.k->p.snapshot;
2574 redo_bi_depth = true;
2575
2576 if (path_is_dup(&path, inode.bi_inum, snapshot)) {
2577 /* XXX print path */
2578 bch_err(c, "directory structure loop");
2579
2580 darray_for_each(path, i)
2581 pr_err("%llu:%u", i->inum, i->snapshot);
2582 pr_err("%llu:%u", inode.bi_inum, snapshot);
2583
2584 if (fsck_err(trans, dir_loop, "directory structure loop")) {
2585 ret = remove_backpointer(trans, &inode);
2586 bch_err_msg(c, ret, "removing dirent");
2587 if (ret)
2588 break;
2589
2590 ret = reattach_inode(trans, &inode);
2591 bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
2592 }
2593
2594 goto out;
2595 }
2596 }
2597
2598 if (inode.bi_subvol)
2599 min_bi_depth = 0;
2600
2601 if (redo_bi_depth)
2602 ret = bch2_bi_depth_renumber(trans, &path, min_bi_depth);
2603 out:
2604 fsck_err:
2605 bch2_trans_iter_exit(trans, &inode_iter);
2606 darray_exit(&path);
2607 printbuf_exit(&buf);
2608 bch_err_fn(c, ret);
2609 return ret;
2610 }
2611
2612 /*
2613 * Check for loops in the directory structure: all other connectivity issues
2614 * have been fixed by prior passes
2615 */
int bch2_check_directory_structure(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
					  BTREE_ITER_intent|
					  BTREE_ITER_prefetch|
					  BTREE_ITER_all_snapshots, k,
					  NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
			/* only directories can participate in path loops */
			if (!S_ISDIR(bkey_inode_mode(k)))
				continue;

			/* unlinked inodes are no longer part of the tree */
			if (bch2_inode_flags(k) & BCH_INODE_unlinked)
				continue;

			check_path_loop(trans, k);
		})));

	bch_err_fn(c, ret);
	return ret;
}
2636
/*
 * Table of candidate hardlinked inodes, sorted by inum: built from the inodes
 * btree, then counts are accumulated from the dirents btree and compared
 * against each inode's recorded nlink.
 */
struct nlink_table {
	size_t nr;	/* entries in use */
	size_t size;	/* allocated capacity */

	struct nlink {
		u64	inum;
		u32	snapshot;
		u32	count;	/* links observed while walking dirents */
	} *d;
};
2647
/*
 * Append an (inum, snapshot) entry to @t, growing the backing array as
 * needed; the link count starts at zero.
 */
static int add_nlink(struct bch_fs *c, struct nlink_table *t,
		     u64 inum, u32 snapshot)
{
	if (t->nr == t->size) {
		size_t new_size = max_t(size_t, 128UL, t->size * 2);
		void *new_d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);

		if (!new_d) {
			bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
				new_size);
			return -BCH_ERR_ENOMEM_fsck_add_nlink;
		}

		if (t->d)
			memcpy(new_d, t->d, t->size * sizeof(t->d[0]));
		kvfree(t->d);

		t->d	= new_d;
		t->size	= new_size;
	}

	t->d[t->nr++] = (struct nlink) {
		.inum		= inum,
		.snapshot	= snapshot,
	};

	return 0;
}
2677
nlink_cmp(const void * _l,const void * _r)2678 static int nlink_cmp(const void *_l, const void *_r)
2679 {
2680 const struct nlink *l = _l;
2681 const struct nlink *r = _r;
2682
2683 return cmp_int(l->inum, r->inum);
2684 }
2685
/*
 * Found a dirent referring to @inum in @snapshot: bump the matching link
 * count(s) in @links. Only inodes in [@range_start, @range_end) are present
 * in the table for this pass; anything outside is ignored.
 */
static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
		     struct nlink_table *links,
		     u64 range_start, u64 range_end, u64 inum, u32 snapshot)
{
	struct nlink *link, key = {
		.inum = inum, .snapshot = U32_MAX,
	};

	if (inum < range_start || inum >= range_end)
		return;

	link = __inline_bsearch(&key, links->d, links->nr,
				sizeof(links->d[0]), nlink_cmp);
	if (!link)
		return;

	/* bsearch may land on any duplicate with this inum; rewind to the first */
	while (link > links->d && link[0].inum == link[-1].inum)
		--link;

	/* count the reference against each snapshot from which it's visible */
	for (; link < links->d + links->nr && link->inum == inum; link++)
		if (ref_visible(c, s, snapshot, link->snapshot)) {
			link->count++;
			if (link->snapshot >= snapshot)
				break;
		}
}
2712
2713 noinline_for_stack
/*
 * Pass 1 of nlink checking: fill @t with every non-directory inode (from
 * inum @start onward) whose bi_nlink is nonzero. If the table fills up,
 * *end is set to where this pass stopped so the caller can run another pass.
 */
noinline_for_stack
static int check_nlinks_find_hardlinks(struct bch_fs *c,
				       struct nlink_table *t,
				       u64 start, u64 *end)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_inodes,
				   POS(0, start),
				   BTREE_ITER_intent|
				   BTREE_ITER_prefetch|
				   BTREE_ITER_all_snapshots, k, ({
			if (!bkey_is_inode(k.k))
				continue;

			/* Should never fail, checked by bch2_inode_invalid: */
			struct bch_inode_unpacked u;
			_ret3 = bch2_inode_unpack(k, &u);
			if (_ret3)
				break;

			/*
			 * Backpointer and directory structure checks are sufficient for
			 * directories, since they can't have hardlinks:
			 */
			if (S_ISDIR(u.bi_mode))
				continue;

			/*
			 * Previous passes ensured that bi_nlink is nonzero if
			 * it had multiple hardlinks:
			 */
			if (!u.bi_nlink)
				continue;

			ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
			if (ret) {
				/* table full: record resume point and finish this pass */
				*end = k.k->p.offset;
				ret = 0;
				break;
			}
			0;
		})));

	bch_err_fn(c, ret);
	return ret;
}
2759
2760 noinline_for_stack
/*
 * Pass 2 of nlink checking: walk every dirent and increment the link count
 * of each referenced non-directory, non-subvolume inode that falls within
 * [range_start, range_end).
 */
noinline_for_stack
static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
				     u64 range_start, u64 range_end)
{
	struct snapshots_seen s;

	snapshots_seen_init(&s);

	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
				   BTREE_ITER_intent|
				   BTREE_ITER_prefetch|
				   BTREE_ITER_all_snapshots, k, ({
			ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
			if (ret)
				break;

			if (k.k->type == KEY_TYPE_dirent) {
				struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);

				/* directories and subvols can't be hardlinked */
				if (d.v->d_type != DT_DIR &&
				    d.v->d_type != DT_SUBVOL)
					inc_link(c, &s, links, range_start, range_end,
						 le64_to_cpu(d.v->d_inum), d.k->p.snapshot);
			}
			0;
		})));

	snapshots_seen_exit(&s);

	bch_err_fn(c, ret);
	return ret;
}
2793
/*
 * Pass 3 of nlink checking: compare an inode's recorded nlink against the
 * count accumulated in @links, fixing the inode if they disagree.
 *
 * Returns 1 once iteration passes @range_end (signals the caller to stop).
 */
static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
				     struct bkey_s_c k,
				     struct nlink_table *links,
				     size_t *idx, u64 range_end)
{
	struct bch_inode_unpacked u;
	struct nlink *link = &links->d[*idx];
	int ret = 0;

	if (k.k->p.offset >= range_end)
		return 1;

	if (!bkey_is_inode(k.k))
		return 0;

	ret = bch2_inode_unpack(k, &u);
	if (ret)
		return ret;

	/* same filters as check_nlinks_find_hardlinks() used to build the table */
	if (S_ISDIR(u.bi_mode))
		return 0;

	if (!u.bi_nlink)
		return 0;

	/* advance through the sorted table to this inode's entry */
	while ((cmp_int(link->inum, k.k->p.offset) ?:
		cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
		BUG_ON(*idx == links->nr);
		link = &links->d[++*idx];
	}

	if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
			trans, inode_wrong_nlink,
			"inode %llu type %s has wrong i_nlink (%u, should be %u)",
			u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
			bch2_inode_nlink_get(&u), link->count)) {
		bch2_inode_nlink_set(&u, link->count);
		ret = __bch2_fsck_write_inode(trans, &u);
	}
fsck_err:
	return ret;
}
2836
2837 noinline_for_stack
/*
 * Drive pass 3 over the inodes btree for [range_start, range_end).
 * A positive return from check_nlinks_update_inode() ends iteration
 * early and is not an error.
 */
noinline_for_stack
static int check_nlinks_update_hardlinks(struct bch_fs *c,
			       struct nlink_table *links,
			       u64 range_start, u64 range_end)
{
	size_t idx = 0;

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
				POS(0, range_start),
				BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
	if (ret < 0) {
		bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
		return ret;
	}

	return 0;
}
2857
bch2_check_nlinks(struct bch_fs * c)2858 int bch2_check_nlinks(struct bch_fs *c)
2859 {
2860 struct nlink_table links = { 0 };
2861 u64 this_iter_range_start, next_iter_range_start = 0;
2862 int ret = 0;
2863
2864 do {
2865 this_iter_range_start = next_iter_range_start;
2866 next_iter_range_start = U64_MAX;
2867
2868 ret = check_nlinks_find_hardlinks(c, &links,
2869 this_iter_range_start,
2870 &next_iter_range_start);
2871
2872 ret = check_nlinks_walk_dirents(c, &links,
2873 this_iter_range_start,
2874 next_iter_range_start);
2875 if (ret)
2876 break;
2877
2878 ret = check_nlinks_update_hardlinks(c, &links,
2879 this_iter_range_start,
2880 next_iter_range_start);
2881 if (ret)
2882 break;
2883
2884 links.nr = 0;
2885 } while (next_iter_range_start != U64_MAX);
2886
2887 kvfree(links.d);
2888 bch_err_fn(c, ret);
2889 return ret;
2890 }
2891
fix_reflink_p_key(struct btree_trans * trans,struct btree_iter * iter,struct bkey_s_c k)2892 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
2893 struct bkey_s_c k)
2894 {
2895 struct bkey_s_c_reflink_p p;
2896 struct bkey_i_reflink_p *u;
2897
2898 if (k.k->type != KEY_TYPE_reflink_p)
2899 return 0;
2900
2901 p = bkey_s_c_to_reflink_p(k);
2902
2903 if (!p.v->front_pad && !p.v->back_pad)
2904 return 0;
2905
2906 u = bch2_trans_kmalloc(trans, sizeof(*u));
2907 int ret = PTR_ERR_OR_ZERO(u);
2908 if (ret)
2909 return ret;
2910
2911 bkey_reassemble(&u->k_i, k);
2912 u->v.front_pad = 0;
2913 u->v.back_pad = 0;
2914
2915 return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun);
2916 }
2917
int bch2_fix_reflink_p(struct bch_fs *c)
{
	/* padding fields are only bogus on filesystems predating the fix version */
	if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
		return 0;

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
				BTREE_ID_extents, POS_MIN,
				BTREE_ITER_intent|BTREE_ITER_prefetch|
				BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			fix_reflink_p_key(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}
2933
2934 #ifndef NO_BCACHEFS_CHARDEV
2935
/* Per-invocation state for a userspace-driven fsck (offline or online) */
struct fsck_thread {
	struct thread_with_stdio thr;	/* must be first: container_of() in callbacks */
	struct bch_fs		*c;
	struct bch_opts		opts;
};
2941
bch2_fsck_thread_exit(struct thread_with_stdio * _thr)2942 static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
2943 {
2944 struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr);
2945 kfree(thr);
2946 }
2947
/*
 * Kthread body for offline fsck: start the (already-opened) filesystem so
 * recovery/fsck runs, report results over the stdio channel, then stop it.
 * Return value uses fsck-style result bits (1 = errors fixed, 4 = errors
 * remain) on top of any negative error from startup.
 */
static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio)
{
	struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
	struct bch_fs *c = thr->c;

	/* bch2_fs_open() may have returned an ERR_PTR */
	int ret = PTR_ERR_OR_ZERO(c);
	if (ret)
		return ret;

	ret = bch2_fs_start(thr->c);
	if (ret)
		goto err;

	if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
		bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name);
		ret |= 1;
	}
	if (test_bit(BCH_FS_error, &c->flags)) {
		bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name);
		ret |= 4;
	}
err:
	bch2_fs_stop(c);
	return ret;
}
2973
/* callbacks for the offline-fsck worker thread */
static const struct thread_with_stdio_ops bch2_offline_fsck_ops = {
	.exit		= bch2_fsck_thread_exit,
	.fn		= bch2_fsck_offline_thread_fn,
};
2978
/*
 * BCH_IOCTL_FSCK_OFFLINE: open the given devices read-only with fsck options
 * and run recovery/fsck in a kernel thread, with stdio redirected to the
 * calling process.
 */
long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
{
	struct bch_ioctl_fsck_offline arg;
	struct fsck_thread *thr = NULL;
	darray_str(devs) = {};
	long ret = 0;

	if (copy_from_user(&arg, user_arg, sizeof(arg)))
		return -EFAULT;

	if (arg.flags)
		return -EINVAL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* copy in the userspace array of device-path pointers */
	for (size_t i = 0; i < arg.nr_devs; i++) {
		u64 dev_u64;
		ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64));
		if (ret)
			goto err;

		char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX);
		ret = PTR_ERR_OR_ZERO(dev_str);
		if (ret)
			goto err;

		ret = darray_push(&devs, dev_str);
		if (ret) {
			kfree(dev_str);
			goto err;
		}
	}

	thr = kzalloc(sizeof(*thr), GFP_KERNEL);
	if (!thr) {
		ret = -ENOMEM;
		goto err;
	}

	thr->opts = bch2_opts_empty();

	if (arg.opts) {
		char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
		ret =   PTR_ERR_OR_ZERO(optstr) ?:
			bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr);
		if (!IS_ERR(optstr))
			kfree(optstr);

		if (ret)
			goto err;
	}

	opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
	opt_set(thr->opts, read_only, 1);
	opt_set(thr->opts, ratelimit_errors, 0);

	/* We need request_key() to be called before we punt to kthread: */
	opt_set(thr->opts, nostart, true);

	bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops);

	/* open errors are propagated via thr->c by the thread fn */
	thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts);

	if (!IS_ERR(thr->c) &&
	    thr->c->opts.errors == BCH_ON_ERROR_panic)
		thr->c->opts.errors = BCH_ON_ERROR_ro;

	ret = __bch2_run_thread_with_stdio(&thr->thr);
out:
	darray_for_each(devs, i)
		kfree(*i);
	darray_exit(&devs);
	return ret;
err:
	/* thread was never started: tear down manually */
	if (thr)
		bch2_fsck_thread_exit(&thr->thr);
	pr_err("ret %s", bch2_err_str(ret));
	goto out;
}
3059
/*
 * Kthread body for online fsck: temporarily switch the running filesystem
 * into fsck mode, run the online recovery passes, then restore the original
 * options and drop the references taken by the ioctl.
 */
static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio)
{
	struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
	struct bch_fs *c = thr->c;

	c->stdio_filter = current;
	c->stdio = &thr->thr.stdio;

	/*
	 * XXX: can we figure out a way to do this without mucking with c->opts?
	 */
	unsigned old_fix_errors = c->opts.fix_errors;
	if (opt_defined(thr->opts, fix_errors))
		c->opts.fix_errors = thr->opts.fix_errors;
	else
		c->opts.fix_errors = FSCK_FIX_ask;

	c->opts.fsck = true;
	set_bit(BCH_FS_fsck_running, &c->flags);

	c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
	int ret = bch2_run_online_recovery_passes(c);

	clear_bit(BCH_FS_fsck_running, &c->flags);
	bch_err_fn(c, ret);

	c->stdio = NULL;
	c->stdio_filter = NULL;
	c->opts.fix_errors = old_fix_errors;

	/* release the mutex/ref taken in bch2_ioctl_fsck_online() */
	up(&c->online_fsck_mutex);
	bch2_ro_ref_put(c);
	return ret;
}
3094
/* callbacks for the online-fsck worker thread */
static const struct thread_with_stdio_ops bch2_online_fsck_ops = {
	.exit		= bch2_fsck_thread_exit,
	.fn		= bch2_fsck_online_thread_fn,
};
3099
/*
 * BCH_IOCTL_FSCK_ONLINE: run fsck on a mounted filesystem in a kernel
 * thread. Takes an ro ref and the online_fsck_mutex, both released by the
 * thread fn on completion (or here on failure to start).
 */
long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg)
{
	struct fsck_thread *thr = NULL;
	long ret = 0;

	if (arg.flags)
		return -EINVAL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (!bch2_ro_ref_tryget(c))
		return -EROFS;

	/* only one online fsck at a time */
	if (down_trylock(&c->online_fsck_mutex)) {
		bch2_ro_ref_put(c);
		return -EAGAIN;
	}

	thr = kzalloc(sizeof(*thr), GFP_KERNEL);
	if (!thr) {
		ret = -ENOMEM;
		goto err;
	}

	thr->c = c;
	thr->opts = bch2_opts_empty();

	if (arg.opts) {
		char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);

		ret =   PTR_ERR_OR_ZERO(optstr) ?:
			bch2_parse_mount_opts(c, &thr->opts, NULL, optstr);
		if (!IS_ERR(optstr))
			kfree(optstr);

		if (ret)
			goto err;
	}

	ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops);
err:
	if (ret < 0) {
		/* thread never ran: undo the ref/mutex ourselves */
		bch_err_fn(c, ret);
		if (thr)
			bch2_fsck_thread_exit(&thr->thr);
		up(&c->online_fsck_mutex);
		bch2_ro_ref_put(c);
	}
	return ret;
}
3151
3152 #endif /* NO_BCACHEFS_CHARDEV */
3153