xref: /linux/fs/xfs/scrub/nlinks.c (revision 1553a1c48281243359a9529a10ddb551f3b967ab)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (c) 2021-2024 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_log_format.h"
13 #include "xfs_trans.h"
14 #include "xfs_inode.h"
15 #include "xfs_icache.h"
16 #include "xfs_iwalk.h"
17 #include "xfs_ialloc.h"
18 #include "xfs_dir2.h"
19 #include "xfs_dir2_priv.h"
20 #include "xfs_ag.h"
21 #include "scrub/scrub.h"
22 #include "scrub/common.h"
23 #include "scrub/repair.h"
24 #include "scrub/xfile.h"
25 #include "scrub/xfarray.h"
26 #include "scrub/iscan.h"
27 #include "scrub/nlinks.h"
28 #include "scrub/trace.h"
29 #include "scrub/readdir.h"
30 
31 /*
32  * Live Inode Link Count Checking
33  * ==============================
34  *
35  * Inode link counts are "summary" metadata, in the sense that they are
36  * computed as the number of directory entries referencing each file on the
37  * filesystem.  Therefore, we compute the correct link counts by creating a
38  * shadow link count structure and walking every inode.
39  */
40 
41 /* Set us up to scrub inode link counts. */
42 int
43 xchk_setup_nlinks(
44 	struct xfs_scrub	*sc)
45 {
46 	xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
47 
48 	sc->buf = kzalloc(sizeof(struct xchk_nlink_ctrs), XCHK_GFP_FLAGS);
49 	if (!sc->buf)
50 		return -ENOMEM;
51 
52 	return xchk_setup_fs(sc);
53 }
54 
55 /*
56  * Part 1: Collecting file link counts.  For each file, we create a shadow link
57  * counting structure, then walk the entire directory tree, incrementing parent
58  * and child link counts for each directory entry seen.
59  *
60  * To avoid false corruption reports in part 2, any failure in this part must
61  * set the INCOMPLETE flag even when a negative errno is returned.  This care
62  * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED,
63  * ECANCELED) that are absorbed into a scrub state flag update by
64  * xchk_*_process_error.  Scrub and repair share the same incore data
65  * structures, so the INCOMPLETE flag is critical to prevent a repair based on
66  * insufficient information.
67  *
68  * Because we are scanning a live filesystem, it's possible that another thread
69  * will try to update the link counts for an inode that we've already scanned.
70  * This will cause our counts to be incorrect.  Therefore, we hook all
71  * directory entry updates because that is when link count updates occur.  By
72  * shadowing transaction updates in this manner, live nlink check can ensure by
73  * locking the inode and the shadow structure that its own copies are not out
74  * of date.  Because the hook code runs in a different process context from the
75  * scrub code and the scrub state flags are not accessed atomically, failures
76  * in the hook code must abort the iscan and the scrubber must notice the
77  * aborted scan and set the incomplete flag.
78  *
79  * Note that we use jump labels and srcu notifier hooks to minimize the
80  * overhead when live nlinks is /not/ running.  Locking order for nlink
81  * observations is inode ILOCK -> iscan_lock/xchk_nlink_ctrs lock.
82  */
83 
84 /*
85  * Add a delta to an nlink counter, clamping the value to U32_MAX.  Because
86  * XFS_MAXLINK < U32_MAX, the checking code will produce the correct results
87  * even if we lose some precision.
88  */
89 static inline void
90 careful_add(
91 	xfs_nlink_t	*nlinkp,
92 	int		delta)
93 {
94 	uint64_t	new_value = (uint64_t)(*nlinkp) + delta;
95 
96 	BUILD_BUG_ON(XFS_MAXLINK > U32_MAX);
97 	*nlinkp = min_t(uint64_t, new_value, U32_MAX);
98 }
99 
100 /* Update incore link count information.  Caller must hold the nlinks lock. */
101 STATIC int
102 xchk_nlinks_update_incore(
103 	struct xchk_nlink_ctrs	*xnc,
104 	xfs_ino_t		ino,
105 	int			parents_delta,
106 	int			backrefs_delta,
107 	int			children_delta)
108 {
109 	struct xchk_nlink	nl;
110 	int			error;
111 
112 	if (!xnc->nlinks)
113 		return 0;
114 
115 	error = xfarray_load_sparse(xnc->nlinks, ino, &nl);
116 	if (error)
117 		return error;
118 
119 	trace_xchk_nlinks_update_incore(xnc->sc->mp, ino, &nl, parents_delta,
120 			backrefs_delta, children_delta);
121 
122 	careful_add(&nl.parents, parents_delta);
123 	careful_add(&nl.backrefs, backrefs_delta);
124 	careful_add(&nl.children, children_delta);
125 
126 	nl.flags |= XCHK_NLINK_WRITTEN;
127 	error = xfarray_store(xnc->nlinks, ino, &nl);
128 	if (error == -EFBIG) {
129 		/*
130 		 * EFBIG means we tried to store data at too high a byte offset
131 		 * in the sparse array.  IOWs, we cannot complete the check and
132 		 * must notify userspace that the check was incomplete.
133 		 */
134 		error = -ECANCELED;
135 	}
136 	return error;
137 }
138 
139 /*
140  * Apply a link count change from the regular filesystem into our shadow link
141  * count structure based on a directory update in progress.
142  */
143 STATIC int
144 xchk_nlinks_live_update(
145 	struct notifier_block		*nb,
146 	unsigned long			action,
147 	void				*data)
148 {
149 	struct xfs_dir_update_params	*p = data;
150 	struct xchk_nlink_ctrs		*xnc;
151 	int				error;
152 
153 	xnc = container_of(nb, struct xchk_nlink_ctrs, dhook.dirent_hook.nb);
154 
155 	trace_xchk_nlinks_live_update(xnc->sc->mp, p->dp, action, p->ip->i_ino,
156 			p->delta, p->name->name, p->name->len);
157 
158 	/*
159 	 * If we've already scanned @dp, update the number of parents that link
160 	 * to @ip.  If @ip is a subdirectory, update the number of child links
161 	 * going out of @dp.
162 	 */
163 	if (xchk_iscan_want_live_update(&xnc->collect_iscan, p->dp->i_ino)) {
164 		mutex_lock(&xnc->lock);
165 		error = xchk_nlinks_update_incore(xnc, p->ip->i_ino, p->delta,
166 				0, 0);
167 		if (!error && S_ISDIR(VFS_IC(p->ip)->i_mode))
168 			error = xchk_nlinks_update_incore(xnc, p->dp->i_ino, 0,
169 					0, p->delta);
170 		mutex_unlock(&xnc->lock);
171 		if (error)
172 			goto out_abort;
173 	}
174 
175 	/*
176 	 * If @ip is a subdirectory and we've already scanned it, update the
177 	 * number of backrefs pointing to @dp.
178 	 */
179 	if (S_ISDIR(VFS_IC(p->ip)->i_mode) &&
180 	    xchk_iscan_want_live_update(&xnc->collect_iscan, p->ip->i_ino)) {
181 		mutex_lock(&xnc->lock);
182 		error = xchk_nlinks_update_incore(xnc, p->dp->i_ino, 0,
183 				p->delta, 0);
184 		mutex_unlock(&xnc->lock);
185 		if (error)
186 			goto out_abort;
187 	}
188 
189 	return NOTIFY_DONE;
190 
191 out_abort:
192 	xchk_iscan_abort(&xnc->collect_iscan);
193 	return NOTIFY_DONE;
194 }
195 
196 /* Bump the observed link count for the inode referenced by this entry. */
197 STATIC int
198 xchk_nlinks_collect_dirent(
199 	struct xfs_scrub	*sc,
200 	struct xfs_inode	*dp,
201 	xfs_dir2_dataptr_t	dapos,
202 	const struct xfs_name	*name,
203 	xfs_ino_t		ino,
204 	void			*priv)
205 {
206 	struct xchk_nlink_ctrs	*xnc = priv;
207 	bool			dot = false, dotdot = false;
208 	int			error;
209 
210 	/* Does this name make sense? */
211 	if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len)) {
212 		error = -ECANCELED;
213 		goto out_abort;
214 	}
215 
216 	if (name->len == 1 && name->name[0] == '.')
217 		dot = true;
218 	else if (name->len == 2 && name->name[0] == '.' &&
219 				   name->name[1] == '.')
220 		dotdot = true;
221 
222 	/* Don't accept a '.' entry that points somewhere else. */
223 	if (dot && ino != dp->i_ino) {
224 		error = -ECANCELED;
225 		goto out_abort;
226 	}
227 
228 	/* Don't accept an invalid inode number. */
229 	if (!xfs_verify_dir_ino(sc->mp, ino)) {
230 		error = -ECANCELED;
231 		goto out_abort;
232 	}
233 
234 	/* Update the shadow link counts if we haven't already failed. */
235 
236 	if (xchk_iscan_aborted(&xnc->collect_iscan)) {
237 		error = -ECANCELED;
238 		goto out_incomplete;
239 	}
240 
241 	trace_xchk_nlinks_collect_dirent(sc->mp, dp, ino, name);
242 
243 	mutex_lock(&xnc->lock);
244 
245 	/*
246 	 * If this is a dotdot entry, it is a back link from dp to ino.  How
247 	 * we handle this depends on whether or not dp is the root directory.
248 	 *
249 	 * The root directory is its own parent, so we pretend the dotdot entry
250 	 * establishes the "parent" of the root directory.  Increment the
251 	 * number of parents of the root directory.
252 	 *
253 	 * Otherwise, increment the number of backrefs pointing back to ino.
254 	 */
255 	if (dotdot) {
256 		if (dp == sc->mp->m_rootip)
257 			error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0);
258 		else
259 			error = xchk_nlinks_update_incore(xnc, ino, 0, 1, 0);
260 		if (error)
261 			goto out_unlock;
262 	}
263 
264 	/*
265 	 * If this dirent is a forward link from dp to ino, increment the
266 	 * number of parents linking into ino.
267 	 */
268 	if (!dot && !dotdot) {
269 		error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0);
270 		if (error)
271 			goto out_unlock;
272 	}
273 
274 	/*
275 	 * If this dirent is a forward link to a subdirectory, increment the
276 	 * number of child links of dp.
277 	 */
278 	if (!dot && !dotdot && name->type == XFS_DIR3_FT_DIR) {
279 		error = xchk_nlinks_update_incore(xnc, dp->i_ino, 0, 0, 1);
280 		if (error)
281 			goto out_unlock;
282 	}
283 
284 	mutex_unlock(&xnc->lock);
285 	return 0;
286 
287 out_unlock:
288 	mutex_unlock(&xnc->lock);
289 out_abort:
290 	xchk_iscan_abort(&xnc->collect_iscan);
291 out_incomplete:
292 	xchk_set_incomplete(sc);
293 	return error;
294 }
295 
296 /* Walk a directory to bump the observed link counts of the children. */
297 STATIC int
298 xchk_nlinks_collect_dir(
299 	struct xchk_nlink_ctrs	*xnc,
300 	struct xfs_inode	*dp)
301 {
302 	struct xfs_scrub	*sc = xnc->sc;
303 	unsigned int		lock_mode;
304 	int			error = 0;
305 
306 	/* Prevent anyone from changing this directory while we walk it. */
307 	xfs_ilock(dp, XFS_IOLOCK_SHARED);
308 	lock_mode = xfs_ilock_data_map_shared(dp);
309 
310 	/*
311 	 * The dotdot entry of an unlinked directory still points to the last
312 	 * parent, but the parent no longer links to this directory.  Skip the
313 	 * directory to avoid overcounting.
314 	 */
315 	if (VFS_I(dp)->i_nlink == 0)
316 		goto out_unlock;
317 
318 	/*
319 	 * We cannot count file links if the directory looks as though it has
320 	 * been zapped by the inode record repair code.
321 	 */
322 	if (xchk_dir_looks_zapped(dp)) {
323 		error = -EBUSY;
324 		goto out_abort;
325 	}
326 
327 	error = xchk_dir_walk(sc, dp, xchk_nlinks_collect_dirent, xnc);
328 	if (error == -ECANCELED) {
329 		error = 0;
330 		goto out_unlock;
331 	}
332 	if (error)
333 		goto out_abort;
334 
335 	xchk_iscan_mark_visited(&xnc->collect_iscan, dp);
336 	goto out_unlock;
337 
338 out_abort:
339 	xchk_set_incomplete(sc);
340 	xchk_iscan_abort(&xnc->collect_iscan);
341 out_unlock:
342 	xfs_iunlock(dp, lock_mode);
343 	xfs_iunlock(dp, XFS_IOLOCK_SHARED);
344 	return error;
345 }
346 
347 /* If this looks like a valid pointer, count it. */
348 static inline int
349 xchk_nlinks_collect_metafile(
350 	struct xchk_nlink_ctrs	*xnc,
351 	xfs_ino_t		ino)
352 {
353 	if (!xfs_verify_ino(xnc->sc->mp, ino))
354 		return 0;
355 
356 	trace_xchk_nlinks_collect_metafile(xnc->sc->mp, ino);
357 	return xchk_nlinks_update_incore(xnc, ino, 1, 0, 0);
358 }
359 
360 /* Bump the link counts of metadata files rooted in the superblock. */
361 STATIC int
362 xchk_nlinks_collect_metafiles(
363 	struct xchk_nlink_ctrs	*xnc)
364 {
365 	struct xfs_mount	*mp = xnc->sc->mp;
366 	int			error = -ECANCELED;
367 
368 
369 	if (xchk_iscan_aborted(&xnc->collect_iscan))
370 		goto out_incomplete;
371 
372 	mutex_lock(&xnc->lock);
373 	error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_rbmino);
374 	if (error)
375 		goto out_abort;
376 
377 	error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_rsumino);
378 	if (error)
379 		goto out_abort;
380 
381 	error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_uquotino);
382 	if (error)
383 		goto out_abort;
384 
385 	error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_gquotino);
386 	if (error)
387 		goto out_abort;
388 
389 	error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_pquotino);
390 	if (error)
391 		goto out_abort;
392 	mutex_unlock(&xnc->lock);
393 
394 	return 0;
395 
396 out_abort:
397 	mutex_unlock(&xnc->lock);
398 	xchk_iscan_abort(&xnc->collect_iscan);
399 out_incomplete:
400 	xchk_set_incomplete(xnc->sc);
401 	return error;
402 }
403 
404 /* Advance the collection scan cursor for this non-directory file. */
405 static inline int
406 xchk_nlinks_collect_file(
407 	struct xchk_nlink_ctrs	*xnc,
408 	struct xfs_inode	*ip)
409 {
410 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
411 	xchk_iscan_mark_visited(&xnc->collect_iscan, ip);
412 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
413 	return 0;
414 }
415 
416 /* Walk all directories and count inode links. */
417 STATIC int
418 xchk_nlinks_collect(
419 	struct xchk_nlink_ctrs	*xnc)
420 {
421 	struct xfs_scrub	*sc = xnc->sc;
422 	struct xfs_inode	*ip;
423 	int			error;
424 
425 	/* Count the rt and quota files that are rooted in the superblock. */
426 	error = xchk_nlinks_collect_metafiles(xnc);
427 	if (error)
428 		return error;
429 
430 	/*
431 	 * Set up for a potentially lengthy filesystem scan by reducing our
432 	 * transaction resource usage for the duration.  Specifically:
433 	 *
434 	 * Cancel the transaction to release the log grant space while we scan
435 	 * the filesystem.
436 	 *
437 	 * Create a new empty transaction to eliminate the possibility of the
438 	 * inode scan deadlocking on cyclical metadata.
439 	 *
440 	 * We pass the empty transaction to the file scanning function to avoid
441 	 * repeatedly cycling empty transactions.  This can be done even though
442 	 * we take the IOLOCK to quiesce the file because empty transactions
443 	 * do not take sb_internal.
444 	 */
445 	xchk_trans_cancel(sc);
446 	error = xchk_trans_alloc_empty(sc);
447 	if (error)
448 		return error;
449 
450 	while ((error = xchk_iscan_iter(&xnc->collect_iscan, &ip)) == 1) {
451 		if (S_ISDIR(VFS_I(ip)->i_mode))
452 			error = xchk_nlinks_collect_dir(xnc, ip);
453 		else
454 			error = xchk_nlinks_collect_file(xnc, ip);
455 		xchk_irele(sc, ip);
456 		if (error)
457 			break;
458 
459 		if (xchk_should_terminate(sc, &error))
460 			break;
461 	}
462 	xchk_iscan_iter_finish(&xnc->collect_iscan);
463 	if (error) {
464 		xchk_set_incomplete(sc);
465 		/*
466 		 * If we couldn't grab an inode that was busy with a state
467 		 * change, change the error code so that we exit to userspace
468 		 * as quickly as possible.
469 		 */
470 		if (error == -EBUSY)
471 			return -ECANCELED;
472 		return error;
473 	}
474 
475 	/*
476 	 * Switch out for a real transaction in preparation for building a new
477 	 * tree.
478 	 */
479 	xchk_trans_cancel(sc);
480 	return xchk_setup_fs(sc);
481 }
482 
483 /*
484  * Part 2: Comparing file link counters.  Walk each inode and compare the link
485  * counts against our shadow information; and then walk each shadow link count
486  * structure (that wasn't covered in the first part), comparing it against the
487  * file.
488  */
489 
490 /* Read the observed link count for comparison with the actual inode. */
491 STATIC int
492 xchk_nlinks_comparison_read(
493 	struct xchk_nlink_ctrs	*xnc,
494 	xfs_ino_t		ino,
495 	struct xchk_nlink	*obs)
496 {
497 	struct xchk_nlink	nl;
498 	int			error;
499 
500 	error = xfarray_load_sparse(xnc->nlinks, ino, &nl);
501 	if (error)
502 		return error;
503 
504 	nl.flags |= (XCHK_NLINK_COMPARE_SCANNED | XCHK_NLINK_WRITTEN);
505 
506 	error = xfarray_store(xnc->nlinks, ino, &nl);
507 	if (error == -EFBIG) {
508 		/*
509 		 * EFBIG means we tried to store data at too high a byte offset
510 		 * in the sparse array.  IOWs, we cannot complete the check and
511 		 * must notify userspace that the check was incomplete.  This
512 		 * shouldn't really happen outside of the collection phase.
513 		 */
514 		xchk_set_incomplete(xnc->sc);
515 		return -ECANCELED;
516 	}
517 	if (error)
518 		return error;
519 
520 	/* Copy the counters, but do not expose the internal state. */
521 	obs->parents = nl.parents;
522 	obs->backrefs = nl.backrefs;
523 	obs->children = nl.children;
524 	obs->flags = 0;
525 	return 0;
526 }
527 
528 /* Check our link count against an inode. */
529 STATIC int
530 xchk_nlinks_compare_inode(
531 	struct xchk_nlink_ctrs	*xnc,
532 	struct xfs_inode	*ip)
533 {
534 	struct xchk_nlink	obs;
535 	struct xfs_scrub	*sc = xnc->sc;
536 	uint64_t		total_links;
537 	unsigned int		actual_nlink;
538 	int			error;
539 
540 	xfs_ilock(ip, XFS_ILOCK_SHARED);
541 	mutex_lock(&xnc->lock);
542 
543 	if (xchk_iscan_aborted(&xnc->collect_iscan)) {
544 		xchk_set_incomplete(xnc->sc);
545 		error = -ECANCELED;
546 		goto out_scanlock;
547 	}
548 
549 	error = xchk_nlinks_comparison_read(xnc, ip->i_ino, &obs);
550 	if (error)
551 		goto out_scanlock;
552 
553 	/*
554 	 * If we don't have ftype to get an accurate count of the subdirectory
555 	 * entries in this directory, take advantage of the fact that on a
556 	 * consistent ftype=0 filesystem, the number of subdirectory
557 	 * backreferences (dotdot entries) pointing towards this directory
558 	 * should be equal to the number of subdirectory entries in the
559 	 * directory.
560 	 */
561 	if (!xfs_has_ftype(sc->mp) && S_ISDIR(VFS_I(ip)->i_mode))
562 		obs.children = obs.backrefs;
563 
564 	total_links = xchk_nlink_total(ip, &obs);
565 	actual_nlink = VFS_I(ip)->i_nlink;
566 
567 	trace_xchk_nlinks_compare_inode(sc->mp, ip, &obs);
568 
569 	/*
570 	 * If we found so many parents that we'd overflow i_nlink, we must flag
571 	 * this as a corruption.  The VFS won't let users increase the link
572 	 * count, but it will let them decrease it.
573 	 */
574 	if (total_links > XFS_MAXLINK) {
575 		xchk_ino_set_corrupt(sc, ip->i_ino);
576 		goto out_corrupt;
577 	}
578 
579 	/* Link counts should match. */
580 	if (total_links != actual_nlink) {
581 		xchk_ino_set_corrupt(sc, ip->i_ino);
582 		goto out_corrupt;
583 	}
584 
585 	if (S_ISDIR(VFS_I(ip)->i_mode) && actual_nlink > 0) {
586 		/*
587 		 * The collection phase ignores directories with zero link
588 		 * count, so we ignore them here too.
589 		 *
590 		 * The number of subdirectory backreferences (dotdot entries)
591 		 * pointing towards this directory should be equal to the
592 		 * number of subdirectory entries in the directory.
593 		 */
594 		if (obs.children != obs.backrefs)
595 			xchk_ino_xref_set_corrupt(sc, ip->i_ino);
596 	} else {
597 		/*
598 		 * Non-directories and unlinked directories should not have
599 		 * back references.
600 		 */
601 		if (obs.backrefs != 0) {
602 			xchk_ino_set_corrupt(sc, ip->i_ino);
603 			goto out_corrupt;
604 		}
605 
606 		/*
607 		 * Non-directories and unlinked directories should not have
608 		 * children.
609 		 */
610 		if (obs.children != 0) {
611 			xchk_ino_set_corrupt(sc, ip->i_ino);
612 			goto out_corrupt;
613 		}
614 	}
615 
616 	if (ip == sc->mp->m_rootip) {
617 		/*
618 		 * For the root of a directory tree, both the '.' and '..'
619 		 * entries should point to the root directory.  The dotdot
620 		 * entry is counted as a parent of the root /and/ a backref of
621 		 * the root directory.
622 		 */
623 		if (obs.parents != 1) {
624 			xchk_ino_set_corrupt(sc, ip->i_ino);
625 			goto out_corrupt;
626 		}
627 	} else if (actual_nlink > 0) {
628 		/*
629 		 * Linked files that are not the root directory should have at
630 		 * least one parent.
631 		 */
632 		if (obs.parents == 0) {
633 			xchk_ino_set_corrupt(sc, ip->i_ino);
634 			goto out_corrupt;
635 		}
636 	}
637 
638 out_corrupt:
639 	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
640 		error = -ECANCELED;
641 out_scanlock:
642 	mutex_unlock(&xnc->lock);
643 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
644 	return error;
645 }
646 
647 /*
648  * Check our link count against an inode that wasn't checked previously.  This
649  * is intended to catch directories with dangling links, though we could be
650  * racing with inode allocation in other threads.
651  */
652 STATIC int
653 xchk_nlinks_compare_inum(
654 	struct xchk_nlink_ctrs	*xnc,
655 	xfs_ino_t		ino)
656 {
657 	struct xchk_nlink	obs;
658 	struct xfs_mount	*mp = xnc->sc->mp;
659 	struct xfs_trans	*tp = xnc->sc->tp;
660 	struct xfs_buf		*agi_bp;
661 	struct xfs_inode	*ip;
662 	int			error;
663 
664 	/*
665 	 * The first iget failed, so try again with the variant that returns
666 	 * either an incore inode or the AGI buffer.  If the function returns
667 	 * EINVAL/ENOENT, it should have passed us the AGI buffer so that we
668 	 * can guarantee that the inode won't be allocated while we check for
669 	 * a zero link count in the observed link count data.
670 	 */
671 	error = xchk_iget_agi(xnc->sc, ino, &agi_bp, &ip);
672 	if (!error) {
673 		/* Actually got an inode, so use the inode compare. */
674 		error = xchk_nlinks_compare_inode(xnc, ip);
675 		xchk_irele(xnc->sc, ip);
676 		return error;
677 	}
678 	if (error == -ENOENT || error == -EINVAL) {
679 		/* No inode was found.  Check for zero link count below. */
680 		error = 0;
681 	}
682 	if (error)
683 		goto out_agi;
684 
685 	/* Ensure that we have protected against inode allocation/freeing. */
686 	if (agi_bp == NULL) {
687 		ASSERT(agi_bp != NULL);
688 		xchk_set_incomplete(xnc->sc);
689 		return -ECANCELED;
690 	}
691 
692 	if (xchk_iscan_aborted(&xnc->collect_iscan)) {
693 		xchk_set_incomplete(xnc->sc);
694 		error = -ECANCELED;
695 		goto out_agi;
696 	}
697 
698 	mutex_lock(&xnc->lock);
699 	error = xchk_nlinks_comparison_read(xnc, ino, &obs);
700 	if (error)
701 		goto out_scanlock;
702 
703 	trace_xchk_nlinks_check_zero(mp, ino, &obs);
704 
705 	/*
706 	 * If we can't grab the inode, the link count had better be zero.  We
707 	 * still hold the AGI to prevent inode allocation/freeing.
708 	 */
709 	if (xchk_nlink_total(NULL, &obs) != 0) {
710 		xchk_ino_set_corrupt(xnc->sc, ino);
711 		error = -ECANCELED;
712 	}
713 
714 out_scanlock:
715 	mutex_unlock(&xnc->lock);
716 out_agi:
717 	if (agi_bp)
718 		xfs_trans_brelse(tp, agi_bp);
719 	return error;
720 }
721 
722 /*
723  * Try to visit every inode in the filesystem to compare the link count.  Move
724  * on if we can't grab an inode, since we'll revisit unchecked nlink records in
725  * the second part.
726  */
727 static int
728 xchk_nlinks_compare_iter(
729 	struct xchk_nlink_ctrs	*xnc,
730 	struct xfs_inode	**ipp)
731 {
732 	int			error;
733 
734 	do {
735 		error = xchk_iscan_iter(&xnc->compare_iscan, ipp);
736 	} while (error == -EBUSY);
737 
738 	return error;
739 }
740 
741 /* Compare the link counts we observed against the live information. */
742 STATIC int
743 xchk_nlinks_compare(
744 	struct xchk_nlink_ctrs	*xnc)
745 {
746 	struct xchk_nlink	nl;
747 	struct xfs_scrub	*sc = xnc->sc;
748 	struct xfs_inode	*ip;
749 	xfarray_idx_t		cur = XFARRAY_CURSOR_INIT;
750 	int			error;
751 
752 	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
753 		return 0;
754 
755 	/*
756 	 * Create a new empty transaction so that we can advance the iscan
757 	 * cursor without deadlocking if the inobt has a cycle and push on the
758 	 * inactivation workqueue.
759 	 */
760 	xchk_trans_cancel(sc);
761 	error = xchk_trans_alloc_empty(sc);
762 	if (error)
763 		return error;
764 
765 	/*
766 	 * Use the inobt to walk all allocated inodes to compare the link
767 	 * counts.  Inodes skipped by _compare_iter will be tried again in the
768 	 * next phase of the scan.
769 	 */
770 	xchk_iscan_start(sc, 0, 0, &xnc->compare_iscan);
771 	while ((error = xchk_nlinks_compare_iter(xnc, &ip)) == 1) {
772 		error = xchk_nlinks_compare_inode(xnc, ip);
773 		xchk_iscan_mark_visited(&xnc->compare_iscan, ip);
774 		xchk_irele(sc, ip);
775 		if (error)
776 			break;
777 
778 		if (xchk_should_terminate(sc, &error))
779 			break;
780 	}
781 	xchk_iscan_iter_finish(&xnc->compare_iscan);
782 	xchk_iscan_teardown(&xnc->compare_iscan);
783 	if (error)
784 		return error;
785 
786 	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
787 		return 0;
788 
789 	/*
790 	 * Walk all the non-null nlink observations that weren't checked in the
791 	 * previous step.
792 	 */
793 	mutex_lock(&xnc->lock);
794 	while ((error = xfarray_iter(xnc->nlinks, &cur, &nl)) == 1) {
795 		xfs_ino_t	ino = cur - 1;
796 
797 		if (nl.flags & XCHK_NLINK_COMPARE_SCANNED)
798 			continue;
799 
800 		mutex_unlock(&xnc->lock);
801 
802 		error = xchk_nlinks_compare_inum(xnc, ino);
803 		if (error)
804 			return error;
805 
806 		if (xchk_should_terminate(xnc->sc, &error))
807 			return error;
808 
809 		mutex_lock(&xnc->lock);
810 	}
811 	mutex_unlock(&xnc->lock);
812 
813 	return error;
814 }
815 
816 /* Tear down everything associated with a nlinks check. */
817 static void
818 xchk_nlinks_teardown_scan(
819 	void			*priv)
820 {
821 	struct xchk_nlink_ctrs	*xnc = priv;
822 
823 	/* Discourage any hook functions that might be running. */
824 	xchk_iscan_abort(&xnc->collect_iscan);
825 
826 	xfs_dir_hook_del(xnc->sc->mp, &xnc->dhook);
827 
828 	xfarray_destroy(xnc->nlinks);
829 	xnc->nlinks = NULL;
830 
831 	xchk_iscan_teardown(&xnc->collect_iscan);
832 	mutex_destroy(&xnc->lock);
833 	xnc->sc = NULL;
834 }
835 
836 /*
837  * Scan all inodes in the entire filesystem to generate link count data.  If
838  * the scan is successful, the counts will be left alive for a repair.  If any
839  * error occurs, we'll tear everything down.
840  */
841 STATIC int
842 xchk_nlinks_setup_scan(
843 	struct xfs_scrub	*sc,
844 	struct xchk_nlink_ctrs	*xnc)
845 {
846 	struct xfs_mount	*mp = sc->mp;
847 	char			*descr;
848 	unsigned long long	max_inos;
849 	xfs_agnumber_t		last_agno = mp->m_sb.sb_agcount - 1;
850 	xfs_agino_t		first_agino, last_agino;
851 	int			error;
852 
853 	ASSERT(xnc->sc == NULL);
854 	xnc->sc = sc;
855 
856 	mutex_init(&xnc->lock);
857 
858 	/* Retry iget every tenth of a second for up to 30 seconds. */
859 	xchk_iscan_start(sc, 30000, 100, &xnc->collect_iscan);
860 
861 	/*
862 	 * Set up enough space to store an nlink record for the highest
863 	 * possible inode number in this system.
864 	 */
865 	xfs_agino_range(mp, last_agno, &first_agino, &last_agino);
866 	max_inos = XFS_AGINO_TO_INO(mp, last_agno, last_agino) + 1;
867 	descr = xchk_xfile_descr(sc, "file link counts");
868 	error = xfarray_create(descr, min(XFS_MAXINUMBER + 1, max_inos),
869 			sizeof(struct xchk_nlink), &xnc->nlinks);
870 	kfree(descr);
871 	if (error)
872 		goto out_teardown;
873 
874 	/*
875 	 * Hook into the directory entry code so that we can capture updates to
876 	 * file link counts.  The hook only triggers for inodes that were
877 	 * already scanned, and the scanner thread takes each inode's ILOCK,
878 	 * which means that any in-progress inode updates will finish before we
879 	 * can scan the inode.
880 	 */
881 	ASSERT(sc->flags & XCHK_FSGATES_DIRENTS);
882 	xfs_dir_hook_setup(&xnc->dhook, xchk_nlinks_live_update);
883 	error = xfs_dir_hook_add(mp, &xnc->dhook);
884 	if (error)
885 		goto out_teardown;
886 
887 	/* Use deferred cleanup to pass the inode link count data to repair. */
888 	sc->buf_cleanup = xchk_nlinks_teardown_scan;
889 	return 0;
890 
891 out_teardown:
892 	xchk_nlinks_teardown_scan(xnc);
893 	return error;
894 }
895 
896 /* Scrub the link count of all inodes on the filesystem. */
897 int
898 xchk_nlinks(
899 	struct xfs_scrub	*sc)
900 {
901 	struct xchk_nlink_ctrs	*xnc = sc->buf;
902 	int			error = 0;
903 
904 	/* Set ourselves up to check link counts on the live filesystem. */
905 	error = xchk_nlinks_setup_scan(sc, xnc);
906 	if (error)
907 		return error;
908 
909 	/* Walk all inodes, picking up link count information. */
910 	error = xchk_nlinks_collect(xnc);
911 	if (!xchk_xref_process_error(sc, 0, 0, &error))
912 		return error;
913 
914 	/* Fail fast if we're not playing with a full dataset. */
915 	if (xchk_iscan_aborted(&xnc->collect_iscan))
916 		xchk_set_incomplete(sc);
917 	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
918 		return 0;
919 
920 	/* Compare link counts. */
921 	error = xchk_nlinks_compare(xnc);
922 	if (!xchk_xref_process_error(sc, 0, 0, &error))
923 		return error;
924 
925 	/* Check one last time for an incomplete dataset. */
926 	if (xchk_iscan_aborted(&xnc->collect_iscan))
927 		xchk_set_incomplete(sc);
928 
929 	return 0;
930 }
931