xref: /linux/fs/xfs/scrub/nlinks.c (revision d85fe250f2eb61e19029e9e0d30095c5f646e2f2)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (c) 2021-2024 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_log_format.h"
13 #include "xfs_trans.h"
14 #include "xfs_inode.h"
15 #include "xfs_icache.h"
16 #include "xfs_iwalk.h"
17 #include "xfs_ialloc.h"
18 #include "xfs_dir2.h"
19 #include "xfs_dir2_priv.h"
20 #include "xfs_ag.h"
21 #include "scrub/scrub.h"
22 #include "scrub/common.h"
23 #include "scrub/repair.h"
24 #include "scrub/xfile.h"
25 #include "scrub/xfarray.h"
26 #include "scrub/iscan.h"
27 #include "scrub/orphanage.h"
28 #include "scrub/nlinks.h"
29 #include "scrub/trace.h"
30 #include "scrub/readdir.h"
31 #include "scrub/tempfile.h"
32 
33 /*
34  * Live Inode Link Count Checking
35  * ==============================
36  *
37  * Inode link counts are "summary" metadata, in the sense that they are
38  * computed as the number of directory entries referencing each file on the
39  * filesystem.  Therefore, we compute the correct link counts by creating a
40  * shadow link count structure and walking every inode.
41  */
42 
43 /* Set us up to scrub inode link counts. */
44 int
45 xchk_setup_nlinks(
46 	struct xfs_scrub	*sc)
47 {
48 	struct xchk_nlink_ctrs	*xnc;
49 	int			error;
50 
51 	xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
52 
53 	if (xchk_could_repair(sc)) {
54 		error = xrep_setup_nlinks(sc);
55 		if (error)
56 			return error;
57 	}
58 
59 	xnc = kvzalloc(sizeof(struct xchk_nlink_ctrs), XCHK_GFP_FLAGS);
60 	if (!xnc)
61 		return -ENOMEM;
62 	xnc->xname.name = xnc->namebuf;
63 	xnc->sc = sc;
64 	sc->buf = xnc;
65 
66 	return xchk_setup_fs(sc);
67 }
68 
69 /*
70  * Part 1: Collecting file link counts.  For each file, we create a shadow link
71  * counting structure, then walk the entire directory tree, incrementing parent
72  * and child link counts for each directory entry seen.
73  *
74  * To avoid false corruption reports in part 2, any failure in this part must
75  * set the INCOMPLETE flag even when a negative errno is returned.  This care
76  * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED,
77  * ECANCELED) that are absorbed into a scrub state flag update by
78  * xchk_*_process_error.  Scrub and repair share the same incore data
79  * structures, so the INCOMPLETE flag is critical to prevent a repair based on
80  * insufficient information.
81  *
82  * Because we are scanning a live filesystem, it's possible that another thread
83  * will try to update the link counts for an inode that we've already scanned.
84  * This will cause our counts to be incorrect.  Therefore, we hook all
85  * directory entry updates because that is when link count updates occur.  By
86  * shadowing transaction updates in this manner, live nlink check can ensure by
87  * locking the inode and the shadow structure that its own copies are not out
88  * of date.  Because the hook code runs in a different process context from the
89  * scrub code and the scrub state flags are not accessed atomically, failures
90  * in the hook code must abort the iscan and the scrubber must notice the
91  * aborted scan and set the incomplete flag.
92  *
93  * Note that we use jump labels and srcu notifier hooks to minimize the
94  * overhead when live nlinks is /not/ running.  Locking order for nlink
95  * observations is inode ILOCK -> iscan_lock/xchk_nlink_ctrs lock.
96  */
97 
98 /*
99  * Add a delta to an nlink counter, clamping the value to U32_MAX.  Because
100  * XFS_MAXLINK < U32_MAX, the checking code will produce the correct results
101  * even if we lose some precision.
102  */
103 static inline void
104 careful_add(
105 	xfs_nlink_t	*nlinkp,
106 	int		delta)
107 {
108 	uint64_t	new_value = (uint64_t)(*nlinkp) + delta;
109 
110 	BUILD_BUG_ON(XFS_MAXLINK > U32_MAX);
111 	*nlinkp = min_t(uint64_t, new_value, U32_MAX);
112 }
113 
114 /* Update incore link count information.  Caller must hold the nlinks lock. */
115 STATIC int
116 xchk_nlinks_update_incore(
117 	struct xchk_nlink_ctrs	*xnc,
118 	xfs_ino_t		ino,
119 	int			parents_delta,
120 	int			backrefs_delta,
121 	int			children_delta)
122 {
123 	struct xchk_nlink	nl;
124 	int			error;
125 
126 	if (!xnc->nlinks)
127 		return 0;
128 
129 	error = xfarray_load_sparse(xnc->nlinks, ino, &nl);
130 	if (error)
131 		return error;
132 
133 	trace_xchk_nlinks_update_incore(xnc->sc->mp, ino, &nl, parents_delta,
134 			backrefs_delta, children_delta);
135 
136 	careful_add(&nl.parents, parents_delta);
137 	careful_add(&nl.backrefs, backrefs_delta);
138 	careful_add(&nl.children, children_delta);
139 
140 	nl.flags |= XCHK_NLINK_WRITTEN;
141 	error = xfarray_store(xnc->nlinks, ino, &nl);
142 	if (error == -EFBIG) {
143 		/*
144 		 * EFBIG means we tried to store data at too high a byte offset
145 		 * in the sparse array.  IOWs, we cannot complete the check and
146 		 * must notify userspace that the check was incomplete.
147 		 */
148 		error = -ECANCELED;
149 	}
150 	return error;
151 }
152 
153 /*
154  * Apply a link count change from the regular filesystem into our shadow link
155  * count structure based on a directory update in progress.
156  */
157 STATIC int
158 xchk_nlinks_live_update(
159 	struct notifier_block		*nb,
160 	unsigned long			action,
161 	void				*data)
162 {
163 	struct xfs_dir_update_params	*p = data;
164 	struct xchk_nlink_ctrs		*xnc;
165 	int				error;
166 
167 	xnc = container_of(nb, struct xchk_nlink_ctrs, dhook.dirent_hook.nb);
168 
169 	/*
170 	 * Ignore temporary directories being used to stage dir repairs, since
171 	 * we don't bump the link counts of the children.
172 	 */
173 	if (xrep_is_tempfile(p->dp))
174 		return NOTIFY_DONE;
175 
176 	trace_xchk_nlinks_live_update(xnc->sc->mp, p->dp, action, p->ip->i_ino,
177 			p->delta, p->name->name, p->name->len);
178 
179 	/*
180 	 * If we've already scanned @dp, update the number of parents that link
181 	 * to @ip.  If @ip is a subdirectory, update the number of child links
182 	 * going out of @dp.
183 	 */
184 	if (xchk_iscan_want_live_update(&xnc->collect_iscan, p->dp->i_ino)) {
185 		mutex_lock(&xnc->lock);
186 		error = xchk_nlinks_update_incore(xnc, p->ip->i_ino, p->delta,
187 				0, 0);
188 		if (!error && S_ISDIR(VFS_IC(p->ip)->i_mode))
189 			error = xchk_nlinks_update_incore(xnc, p->dp->i_ino, 0,
190 					0, p->delta);
191 		mutex_unlock(&xnc->lock);
192 		if (error)
193 			goto out_abort;
194 	}
195 
196 	/*
197 	 * If @ip is a subdirectory and we've already scanned it, update the
198 	 * number of backrefs pointing to @dp.
199 	 */
200 	if (S_ISDIR(VFS_IC(p->ip)->i_mode) &&
201 	    xchk_iscan_want_live_update(&xnc->collect_iscan, p->ip->i_ino)) {
202 		mutex_lock(&xnc->lock);
203 		error = xchk_nlinks_update_incore(xnc, p->dp->i_ino, 0,
204 				p->delta, 0);
205 		mutex_unlock(&xnc->lock);
206 		if (error)
207 			goto out_abort;
208 	}
209 
210 	return NOTIFY_DONE;
211 
212 out_abort:
213 	xchk_iscan_abort(&xnc->collect_iscan);
214 	return NOTIFY_DONE;
215 }
216 
217 /* Bump the observed link count for the inode referenced by this entry. */
218 STATIC int
219 xchk_nlinks_collect_dirent(
220 	struct xfs_scrub	*sc,
221 	struct xfs_inode	*dp,
222 	xfs_dir2_dataptr_t	dapos,
223 	const struct xfs_name	*name,
224 	xfs_ino_t		ino,
225 	void			*priv)
226 {
227 	struct xchk_nlink_ctrs	*xnc = priv;
228 	bool			dot = false, dotdot = false;
229 	int			error;
230 
231 	/* Does this name make sense? */
232 	if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len)) {
233 		error = -ECANCELED;
234 		goto out_abort;
235 	}
236 
237 	if (name->len == 1 && name->name[0] == '.')
238 		dot = true;
239 	else if (name->len == 2 && name->name[0] == '.' &&
240 				   name->name[1] == '.')
241 		dotdot = true;
242 
243 	/* Don't accept a '.' entry that points somewhere else. */
244 	if (dot && ino != dp->i_ino) {
245 		error = -ECANCELED;
246 		goto out_abort;
247 	}
248 
249 	/* Don't accept an invalid inode number. */
250 	if (!xfs_verify_dir_ino(sc->mp, ino)) {
251 		error = -ECANCELED;
252 		goto out_abort;
253 	}
254 
255 	/* Update the shadow link counts if we haven't already failed. */
256 
257 	if (xchk_iscan_aborted(&xnc->collect_iscan)) {
258 		error = -ECANCELED;
259 		goto out_incomplete;
260 	}
261 
262 	trace_xchk_nlinks_collect_dirent(sc->mp, dp, ino, name);
263 
264 	mutex_lock(&xnc->lock);
265 
266 	/*
267 	 * If this is a dotdot entry, it is a back link from dp to ino.  How
268 	 * we handle this depends on whether or not dp is the root directory.
269 	 *
270 	 * The root directory is its own parent, so we pretend the dotdot entry
271 	 * establishes the "parent" of the root directory.  Increment the
272 	 * number of parents of the root directory.
273 	 *
274 	 * Otherwise, increment the number of backrefs pointing back to ino.
275 	 */
276 	if (dotdot) {
277 		if (dp == sc->mp->m_rootip)
278 			error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0);
279 		else
280 			error = xchk_nlinks_update_incore(xnc, ino, 0, 1, 0);
281 		if (error)
282 			goto out_unlock;
283 	}
284 
285 	/*
286 	 * If this dirent is a forward link from dp to ino, increment the
287 	 * number of parents linking into ino.
288 	 */
289 	if (!dot && !dotdot) {
290 		error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0);
291 		if (error)
292 			goto out_unlock;
293 	}
294 
295 	/*
296 	 * If this dirent is a forward link to a subdirectory, increment the
297 	 * number of child links of dp.
298 	 */
299 	if (!dot && !dotdot && name->type == XFS_DIR3_FT_DIR) {
300 		error = xchk_nlinks_update_incore(xnc, dp->i_ino, 0, 0, 1);
301 		if (error)
302 			goto out_unlock;
303 	}
304 
305 	mutex_unlock(&xnc->lock);
306 	return 0;
307 
308 out_unlock:
309 	mutex_unlock(&xnc->lock);
310 out_abort:
311 	xchk_iscan_abort(&xnc->collect_iscan);
312 out_incomplete:
313 	xchk_set_incomplete(sc);
314 	return error;
315 }
316 
317 /* Walk a directory to bump the observed link counts of the children. */
318 STATIC int
319 xchk_nlinks_collect_dir(
320 	struct xchk_nlink_ctrs	*xnc,
321 	struct xfs_inode	*dp)
322 {
323 	struct xfs_scrub	*sc = xnc->sc;
324 	unsigned int		lock_mode;
325 	int			error = 0;
326 
327 	/*
328 	 * Ignore temporary directories being used to stage dir repairs, since
329 	 * we don't bump the link counts of the children.
330 	 */
331 	if (xrep_is_tempfile(dp))
332 		return 0;
333 
334 	/* Prevent anyone from changing this directory while we walk it. */
335 	xfs_ilock(dp, XFS_IOLOCK_SHARED);
336 	lock_mode = xfs_ilock_data_map_shared(dp);
337 
338 	/*
339 	 * The dotdot entry of an unlinked directory still points to the last
340 	 * parent, but the parent no longer links to this directory.  Skip the
341 	 * directory to avoid overcounting.
342 	 */
343 	if (VFS_I(dp)->i_nlink == 0)
344 		goto out_unlock;
345 
346 	/*
347 	 * We cannot count file links if the directory looks as though it has
348 	 * been zapped by the inode record repair code.
349 	 */
350 	if (xchk_dir_looks_zapped(dp)) {
351 		error = -EBUSY;
352 		goto out_abort;
353 	}
354 
355 	error = xchk_dir_walk(sc, dp, xchk_nlinks_collect_dirent, xnc);
356 	if (error == -ECANCELED) {
357 		error = 0;
358 		goto out_unlock;
359 	}
360 	if (error)
361 		goto out_abort;
362 
363 	xchk_iscan_mark_visited(&xnc->collect_iscan, dp);
364 	goto out_unlock;
365 
366 out_abort:
367 	xchk_set_incomplete(sc);
368 	xchk_iscan_abort(&xnc->collect_iscan);
369 out_unlock:
370 	xfs_iunlock(dp, lock_mode);
371 	xfs_iunlock(dp, XFS_IOLOCK_SHARED);
372 	return error;
373 }
374 
375 /* If this looks like a valid pointer, count it. */
376 static inline int
377 xchk_nlinks_collect_metafile(
378 	struct xchk_nlink_ctrs	*xnc,
379 	xfs_ino_t		ino)
380 {
381 	if (!xfs_verify_ino(xnc->sc->mp, ino))
382 		return 0;
383 
384 	trace_xchk_nlinks_collect_metafile(xnc->sc->mp, ino);
385 	return xchk_nlinks_update_incore(xnc, ino, 1, 0, 0);
386 }
387 
388 /* Bump the link counts of metadata files rooted in the superblock. */
389 STATIC int
390 xchk_nlinks_collect_metafiles(
391 	struct xchk_nlink_ctrs	*xnc)
392 {
393 	struct xfs_mount	*mp = xnc->sc->mp;
394 	int			error = -ECANCELED;
395 
396 
397 	if (xchk_iscan_aborted(&xnc->collect_iscan))
398 		goto out_incomplete;
399 
400 	mutex_lock(&xnc->lock);
401 	error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_rbmino);
402 	if (error)
403 		goto out_abort;
404 
405 	error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_rsumino);
406 	if (error)
407 		goto out_abort;
408 
409 	error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_uquotino);
410 	if (error)
411 		goto out_abort;
412 
413 	error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_gquotino);
414 	if (error)
415 		goto out_abort;
416 
417 	error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_pquotino);
418 	if (error)
419 		goto out_abort;
420 	mutex_unlock(&xnc->lock);
421 
422 	return 0;
423 
424 out_abort:
425 	mutex_unlock(&xnc->lock);
426 	xchk_iscan_abort(&xnc->collect_iscan);
427 out_incomplete:
428 	xchk_set_incomplete(xnc->sc);
429 	return error;
430 }
431 
432 /* Advance the collection scan cursor for this non-directory file. */
433 static inline int
434 xchk_nlinks_collect_file(
435 	struct xchk_nlink_ctrs	*xnc,
436 	struct xfs_inode	*ip)
437 {
438 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
439 	xchk_iscan_mark_visited(&xnc->collect_iscan, ip);
440 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
441 	return 0;
442 }
443 
444 /* Walk all directories and count inode links. */
445 STATIC int
446 xchk_nlinks_collect(
447 	struct xchk_nlink_ctrs	*xnc)
448 {
449 	struct xfs_scrub	*sc = xnc->sc;
450 	struct xfs_inode	*ip;
451 	int			error;
452 
453 	/* Count the rt and quota files that are rooted in the superblock. */
454 	error = xchk_nlinks_collect_metafiles(xnc);
455 	if (error)
456 		return error;
457 
458 	/*
459 	 * Set up for a potentially lengthy filesystem scan by reducing our
460 	 * transaction resource usage for the duration.  Specifically:
461 	 *
462 	 * Cancel the transaction to release the log grant space while we scan
463 	 * the filesystem.
464 	 *
465 	 * Create a new empty transaction to eliminate the possibility of the
466 	 * inode scan deadlocking on cyclical metadata.
467 	 *
468 	 * We pass the empty transaction to the file scanning function to avoid
469 	 * repeatedly cycling empty transactions.  This can be done even though
470 	 * we take the IOLOCK to quiesce the file because empty transactions
471 	 * do not take sb_internal.
472 	 */
473 	xchk_trans_cancel(sc);
474 	error = xchk_trans_alloc_empty(sc);
475 	if (error)
476 		return error;
477 
478 	while ((error = xchk_iscan_iter(&xnc->collect_iscan, &ip)) == 1) {
479 		if (S_ISDIR(VFS_I(ip)->i_mode))
480 			error = xchk_nlinks_collect_dir(xnc, ip);
481 		else
482 			error = xchk_nlinks_collect_file(xnc, ip);
483 		xchk_irele(sc, ip);
484 		if (error)
485 			break;
486 
487 		if (xchk_should_terminate(sc, &error))
488 			break;
489 	}
490 	xchk_iscan_iter_finish(&xnc->collect_iscan);
491 	if (error) {
492 		xchk_set_incomplete(sc);
493 		/*
494 		 * If we couldn't grab an inode that was busy with a state
495 		 * change, change the error code so that we exit to userspace
496 		 * as quickly as possible.
497 		 */
498 		if (error == -EBUSY)
499 			return -ECANCELED;
500 		return error;
501 	}
502 
503 	/*
504 	 * Switch out for a real transaction in preparation for building a new
505 	 * tree.
506 	 */
507 	xchk_trans_cancel(sc);
508 	return xchk_setup_fs(sc);
509 }
510 
511 /*
512  * Part 2: Comparing file link counters.  Walk each inode and compare the link
513  * counts against our shadow information; and then walk each shadow link count
514  * structure (that wasn't covered in the first part), comparing it against the
515  * file.
516  */
517 
518 /* Read the observed link count for comparison with the actual inode. */
519 STATIC int
520 xchk_nlinks_comparison_read(
521 	struct xchk_nlink_ctrs	*xnc,
522 	xfs_ino_t		ino,
523 	struct xchk_nlink	*obs)
524 {
525 	struct xchk_nlink	nl;
526 	int			error;
527 
528 	error = xfarray_load_sparse(xnc->nlinks, ino, &nl);
529 	if (error)
530 		return error;
531 
532 	nl.flags |= (XCHK_NLINK_COMPARE_SCANNED | XCHK_NLINK_WRITTEN);
533 
534 	error = xfarray_store(xnc->nlinks, ino, &nl);
535 	if (error == -EFBIG) {
536 		/*
537 		 * EFBIG means we tried to store data at too high a byte offset
538 		 * in the sparse array.  IOWs, we cannot complete the check and
539 		 * must notify userspace that the check was incomplete.  This
540 		 * shouldn't really happen outside of the collection phase.
541 		 */
542 		xchk_set_incomplete(xnc->sc);
543 		return -ECANCELED;
544 	}
545 	if (error)
546 		return error;
547 
548 	/* Copy the counters, but do not expose the internal state. */
549 	obs->parents = nl.parents;
550 	obs->backrefs = nl.backrefs;
551 	obs->children = nl.children;
552 	obs->flags = 0;
553 	return 0;
554 }
555 
556 /* Check our link count against an inode. */
557 STATIC int
558 xchk_nlinks_compare_inode(
559 	struct xchk_nlink_ctrs	*xnc,
560 	struct xfs_inode	*ip)
561 {
562 	struct xchk_nlink	obs;
563 	struct xfs_scrub	*sc = xnc->sc;
564 	uint64_t		total_links;
565 	unsigned int		actual_nlink;
566 	int			error;
567 
568 	/*
569 	 * Ignore temporary files being used to stage repairs, since we assume
570 	 * they're correct for non-directories, and the directory repair code
571 	 * doesn't bump the link counts for the children.
572 	 */
573 	if (xrep_is_tempfile(ip))
574 		return 0;
575 
576 	xfs_ilock(ip, XFS_ILOCK_SHARED);
577 	mutex_lock(&xnc->lock);
578 
579 	if (xchk_iscan_aborted(&xnc->collect_iscan)) {
580 		xchk_set_incomplete(xnc->sc);
581 		error = -ECANCELED;
582 		goto out_scanlock;
583 	}
584 
585 	error = xchk_nlinks_comparison_read(xnc, ip->i_ino, &obs);
586 	if (error)
587 		goto out_scanlock;
588 
589 	/*
590 	 * If we don't have ftype to get an accurate count of the subdirectory
591 	 * entries in this directory, take advantage of the fact that on a
592 	 * consistent ftype=0 filesystem, the number of subdirectory
593 	 * backreferences (dotdot entries) pointing towards this directory
594 	 * should be equal to the number of subdirectory entries in the
595 	 * directory.
596 	 */
597 	if (!xfs_has_ftype(sc->mp) && S_ISDIR(VFS_I(ip)->i_mode))
598 		obs.children = obs.backrefs;
599 
600 	total_links = xchk_nlink_total(ip, &obs);
601 	actual_nlink = VFS_I(ip)->i_nlink;
602 
603 	trace_xchk_nlinks_compare_inode(sc->mp, ip, &obs);
604 
605 	/*
606 	 * If we found so many parents that we'd overflow i_nlink, we must flag
607 	 * this as a corruption.  The VFS won't let users increase the link
608 	 * count, but it will let them decrease it.
609 	 */
610 	if (total_links > XFS_NLINK_PINNED) {
611 		xchk_ino_set_corrupt(sc, ip->i_ino);
612 		goto out_corrupt;
613 	} else if (total_links > XFS_MAXLINK) {
614 		xchk_ino_set_warning(sc, ip->i_ino);
615 	}
616 
617 	/* Link counts should match. */
618 	if (total_links != actual_nlink) {
619 		xchk_ino_set_corrupt(sc, ip->i_ino);
620 		goto out_corrupt;
621 	}
622 
623 	if (S_ISDIR(VFS_I(ip)->i_mode) && actual_nlink > 0) {
624 		/*
625 		 * The collection phase ignores directories with zero link
626 		 * count, so we ignore them here too.
627 		 *
628 		 * The number of subdirectory backreferences (dotdot entries)
629 		 * pointing towards this directory should be equal to the
630 		 * number of subdirectory entries in the directory.
631 		 */
632 		if (obs.children != obs.backrefs)
633 			xchk_ino_xref_set_corrupt(sc, ip->i_ino);
634 	} else {
635 		/*
636 		 * Non-directories and unlinked directories should not have
637 		 * back references.
638 		 */
639 		if (obs.backrefs != 0) {
640 			xchk_ino_set_corrupt(sc, ip->i_ino);
641 			goto out_corrupt;
642 		}
643 
644 		/*
645 		 * Non-directories and unlinked directories should not have
646 		 * children.
647 		 */
648 		if (obs.children != 0) {
649 			xchk_ino_set_corrupt(sc, ip->i_ino);
650 			goto out_corrupt;
651 		}
652 	}
653 
654 	if (ip == sc->mp->m_rootip) {
655 		/*
656 		 * For the root of a directory tree, both the '.' and '..'
657 		 * entries should point to the root directory.  The dotdot
658 		 * entry is counted as a parent of the root /and/ a backref of
659 		 * the root directory.
660 		 */
661 		if (obs.parents != 1) {
662 			xchk_ino_set_corrupt(sc, ip->i_ino);
663 			goto out_corrupt;
664 		}
665 	} else if (actual_nlink > 0) {
666 		/*
667 		 * Linked files that are not the root directory should have at
668 		 * least one parent.
669 		 */
670 		if (obs.parents == 0) {
671 			xchk_ino_set_corrupt(sc, ip->i_ino);
672 			goto out_corrupt;
673 		}
674 	}
675 
676 out_corrupt:
677 	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
678 		error = -ECANCELED;
679 out_scanlock:
680 	mutex_unlock(&xnc->lock);
681 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
682 	return error;
683 }
684 
685 /*
686  * Check our link count against an inode that wasn't checked previously.  This
687  * is intended to catch directories with dangling links, though we could be
688  * racing with inode allocation in other threads.
689  */
690 STATIC int
691 xchk_nlinks_compare_inum(
692 	struct xchk_nlink_ctrs	*xnc,
693 	xfs_ino_t		ino)
694 {
695 	struct xchk_nlink	obs;
696 	struct xfs_mount	*mp = xnc->sc->mp;
697 	struct xfs_trans	*tp = xnc->sc->tp;
698 	struct xfs_buf		*agi_bp;
699 	struct xfs_inode	*ip;
700 	int			error;
701 
702 	/*
703 	 * The first iget failed, so try again with the variant that returns
704 	 * either an incore inode or the AGI buffer.  If the function returns
705 	 * EINVAL/ENOENT, it should have passed us the AGI buffer so that we
706 	 * can guarantee that the inode won't be allocated while we check for
707 	 * a zero link count in the observed link count data.
708 	 */
709 	error = xchk_iget_agi(xnc->sc, ino, &agi_bp, &ip);
710 	if (!error) {
711 		/* Actually got an inode, so use the inode compare. */
712 		error = xchk_nlinks_compare_inode(xnc, ip);
713 		xchk_irele(xnc->sc, ip);
714 		return error;
715 	}
716 	if (error == -ENOENT || error == -EINVAL) {
717 		/* No inode was found.  Check for zero link count below. */
718 		error = 0;
719 	}
720 	if (error)
721 		goto out_agi;
722 
723 	/* Ensure that we have protected against inode allocation/freeing. */
724 	if (agi_bp == NULL) {
725 		ASSERT(agi_bp != NULL);
726 		xchk_set_incomplete(xnc->sc);
727 		return -ECANCELED;
728 	}
729 
730 	if (xchk_iscan_aborted(&xnc->collect_iscan)) {
731 		xchk_set_incomplete(xnc->sc);
732 		error = -ECANCELED;
733 		goto out_agi;
734 	}
735 
736 	mutex_lock(&xnc->lock);
737 	error = xchk_nlinks_comparison_read(xnc, ino, &obs);
738 	if (error)
739 		goto out_scanlock;
740 
741 	trace_xchk_nlinks_check_zero(mp, ino, &obs);
742 
743 	/*
744 	 * If we can't grab the inode, the link count had better be zero.  We
745 	 * still hold the AGI to prevent inode allocation/freeing.
746 	 */
747 	if (xchk_nlink_total(NULL, &obs) != 0) {
748 		xchk_ino_set_corrupt(xnc->sc, ino);
749 		error = -ECANCELED;
750 	}
751 
752 out_scanlock:
753 	mutex_unlock(&xnc->lock);
754 out_agi:
755 	if (agi_bp)
756 		xfs_trans_brelse(tp, agi_bp);
757 	return error;
758 }
759 
760 /*
761  * Try to visit every inode in the filesystem to compare the link count.  Move
762  * on if we can't grab an inode, since we'll revisit unchecked nlink records in
763  * the second part.
764  */
765 static int
766 xchk_nlinks_compare_iter(
767 	struct xchk_nlink_ctrs	*xnc,
768 	struct xfs_inode	**ipp)
769 {
770 	int			error;
771 
772 	do {
773 		error = xchk_iscan_iter(&xnc->compare_iscan, ipp);
774 	} while (error == -EBUSY);
775 
776 	return error;
777 }
778 
779 /* Compare the link counts we observed against the live information. */
780 STATIC int
781 xchk_nlinks_compare(
782 	struct xchk_nlink_ctrs	*xnc)
783 {
784 	struct xchk_nlink	nl;
785 	struct xfs_scrub	*sc = xnc->sc;
786 	struct xfs_inode	*ip;
787 	xfarray_idx_t		cur = XFARRAY_CURSOR_INIT;
788 	int			error;
789 
790 	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
791 		return 0;
792 
793 	/*
794 	 * Create a new empty transaction so that we can advance the iscan
795 	 * cursor without deadlocking if the inobt has a cycle and push on the
796 	 * inactivation workqueue.
797 	 */
798 	xchk_trans_cancel(sc);
799 	error = xchk_trans_alloc_empty(sc);
800 	if (error)
801 		return error;
802 
803 	/*
804 	 * Use the inobt to walk all allocated inodes to compare the link
805 	 * counts.  Inodes skipped by _compare_iter will be tried again in the
806 	 * next phase of the scan.
807 	 */
808 	xchk_iscan_start(sc, 0, 0, &xnc->compare_iscan);
809 	while ((error = xchk_nlinks_compare_iter(xnc, &ip)) == 1) {
810 		error = xchk_nlinks_compare_inode(xnc, ip);
811 		xchk_iscan_mark_visited(&xnc->compare_iscan, ip);
812 		xchk_irele(sc, ip);
813 		if (error)
814 			break;
815 
816 		if (xchk_should_terminate(sc, &error))
817 			break;
818 	}
819 	xchk_iscan_iter_finish(&xnc->compare_iscan);
820 	xchk_iscan_teardown(&xnc->compare_iscan);
821 	if (error)
822 		return error;
823 
824 	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
825 		return 0;
826 
827 	/*
828 	 * Walk all the non-null nlink observations that weren't checked in the
829 	 * previous step.
830 	 */
831 	mutex_lock(&xnc->lock);
832 	while ((error = xfarray_iter(xnc->nlinks, &cur, &nl)) == 1) {
833 		xfs_ino_t	ino = cur - 1;
834 
835 		if (nl.flags & XCHK_NLINK_COMPARE_SCANNED)
836 			continue;
837 
838 		mutex_unlock(&xnc->lock);
839 
840 		error = xchk_nlinks_compare_inum(xnc, ino);
841 		if (error)
842 			return error;
843 
844 		if (xchk_should_terminate(xnc->sc, &error))
845 			return error;
846 
847 		mutex_lock(&xnc->lock);
848 	}
849 	mutex_unlock(&xnc->lock);
850 
851 	return error;
852 }
853 
854 /* Tear down everything associated with a nlinks check. */
855 static void
856 xchk_nlinks_teardown_scan(
857 	void			*priv)
858 {
859 	struct xchk_nlink_ctrs	*xnc = priv;
860 
861 	/* Discourage any hook functions that might be running. */
862 	xchk_iscan_abort(&xnc->collect_iscan);
863 
864 	xfs_dir_hook_del(xnc->sc->mp, &xnc->dhook);
865 
866 	xfarray_destroy(xnc->nlinks);
867 	xnc->nlinks = NULL;
868 
869 	xchk_iscan_teardown(&xnc->collect_iscan);
870 	mutex_destroy(&xnc->lock);
871 	xnc->sc = NULL;
872 }
873 
874 /*
875  * Scan all inodes in the entire filesystem to generate link count data.  If
876  * the scan is successful, the counts will be left alive for a repair.  If any
877  * error occurs, we'll tear everything down.
878  */
879 STATIC int
880 xchk_nlinks_setup_scan(
881 	struct xfs_scrub	*sc,
882 	struct xchk_nlink_ctrs	*xnc)
883 {
884 	struct xfs_mount	*mp = sc->mp;
885 	char			*descr;
886 	unsigned long long	max_inos;
887 	xfs_agnumber_t		last_agno = mp->m_sb.sb_agcount - 1;
888 	xfs_agino_t		first_agino, last_agino;
889 	int			error;
890 
891 	mutex_init(&xnc->lock);
892 
893 	/* Retry iget every tenth of a second for up to 30 seconds. */
894 	xchk_iscan_start(sc, 30000, 100, &xnc->collect_iscan);
895 
896 	/*
897 	 * Set up enough space to store an nlink record for the highest
898 	 * possible inode number in this system.
899 	 */
900 	xfs_agino_range(mp, last_agno, &first_agino, &last_agino);
901 	max_inos = XFS_AGINO_TO_INO(mp, last_agno, last_agino) + 1;
902 	descr = xchk_xfile_descr(sc, "file link counts");
903 	error = xfarray_create(descr, min(XFS_MAXINUMBER + 1, max_inos),
904 			sizeof(struct xchk_nlink), &xnc->nlinks);
905 	kfree(descr);
906 	if (error)
907 		goto out_teardown;
908 
909 	/*
910 	 * Hook into the directory entry code so that we can capture updates to
911 	 * file link counts.  The hook only triggers for inodes that were
912 	 * already scanned, and the scanner thread takes each inode's ILOCK,
913 	 * which means that any in-progress inode updates will finish before we
914 	 * can scan the inode.
915 	 */
916 	ASSERT(sc->flags & XCHK_FSGATES_DIRENTS);
917 	xfs_dir_hook_setup(&xnc->dhook, xchk_nlinks_live_update);
918 	error = xfs_dir_hook_add(mp, &xnc->dhook);
919 	if (error)
920 		goto out_teardown;
921 
922 	/* Use deferred cleanup to pass the inode link count data to repair. */
923 	sc->buf_cleanup = xchk_nlinks_teardown_scan;
924 	return 0;
925 
926 out_teardown:
927 	xchk_nlinks_teardown_scan(xnc);
928 	return error;
929 }
930 
931 /* Scrub the link count of all inodes on the filesystem. */
932 int
933 xchk_nlinks(
934 	struct xfs_scrub	*sc)
935 {
936 	struct xchk_nlink_ctrs	*xnc = sc->buf;
937 	int			error = 0;
938 
939 	/* Set ourselves up to check link counts on the live filesystem. */
940 	error = xchk_nlinks_setup_scan(sc, xnc);
941 	if (error)
942 		return error;
943 
944 	/* Walk all inodes, picking up link count information. */
945 	error = xchk_nlinks_collect(xnc);
946 	if (!xchk_xref_process_error(sc, 0, 0, &error))
947 		return error;
948 
949 	/* Fail fast if we're not playing with a full dataset. */
950 	if (xchk_iscan_aborted(&xnc->collect_iscan))
951 		xchk_set_incomplete(sc);
952 	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
953 		return 0;
954 
955 	/* Compare link counts. */
956 	error = xchk_nlinks_compare(xnc);
957 	if (!xchk_xref_process_error(sc, 0, 0, &error))
958 		return error;
959 
960 	/* Check one last time for an incomplete dataset. */
961 	if (xchk_iscan_aborted(&xnc->collect_iscan))
962 		xchk_set_incomplete(sc);
963 
964 	return 0;
965 }
966