xref: /linux/fs/xfs/scrub/nlinks.c (revision 1e58a8ccf2597c9259a8e71a2bffac5e11e12ea0)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (c) 2021-2024 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_log_format.h"
13 #include "xfs_trans.h"
14 #include "xfs_inode.h"
15 #include "xfs_icache.h"
16 #include "xfs_iwalk.h"
17 #include "xfs_ialloc.h"
18 #include "xfs_dir2.h"
19 #include "xfs_dir2_priv.h"
20 #include "xfs_ag.h"
21 #include "scrub/scrub.h"
22 #include "scrub/common.h"
23 #include "scrub/repair.h"
24 #include "scrub/xfile.h"
25 #include "scrub/xfarray.h"
26 #include "scrub/iscan.h"
27 #include "scrub/nlinks.h"
28 #include "scrub/trace.h"
29 #include "scrub/readdir.h"
30 #include "scrub/tempfile.h"
31 
32 /*
33  * Live Inode Link Count Checking
34  * ==============================
35  *
36  * Inode link counts are "summary" metadata, in the sense that they are
37  * computed as the number of directory entries referencing each file on the
38  * filesystem.  Therefore, we compute the correct link counts by creating a
39  * shadow link count structure and walking every inode.
40  */
41 
42 /* Set us up to scrub inode link counts. */
43 int
44 xchk_setup_nlinks(
45 	struct xfs_scrub	*sc)
46 {
47 	xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
48 
49 	sc->buf = kzalloc(sizeof(struct xchk_nlink_ctrs), XCHK_GFP_FLAGS);
50 	if (!sc->buf)
51 		return -ENOMEM;
52 
53 	return xchk_setup_fs(sc);
54 }
55 
56 /*
57  * Part 1: Collecting file link counts.  For each file, we create a shadow link
58  * counting structure, then walk the entire directory tree, incrementing parent
59  * and child link counts for each directory entry seen.
60  *
61  * To avoid false corruption reports in part 2, any failure in this part must
62  * set the INCOMPLETE flag even when a negative errno is returned.  This care
63  * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED,
64  * ECANCELED) that are absorbed into a scrub state flag update by
65  * xchk_*_process_error.  Scrub and repair share the same incore data
66  * structures, so the INCOMPLETE flag is critical to prevent a repair based on
67  * insufficient information.
68  *
69  * Because we are scanning a live filesystem, it's possible that another thread
70  * will try to update the link counts for an inode that we've already scanned.
71  * This will cause our counts to be incorrect.  Therefore, we hook all
72  * directory entry updates because that is when link count updates occur.  By
73  * shadowing transaction updates in this manner, live nlink check can ensure by
74  * locking the inode and the shadow structure that its own copies are not out
75  * of date.  Because the hook code runs in a different process context from the
76  * scrub code and the scrub state flags are not accessed atomically, failures
77  * in the hook code must abort the iscan and the scrubber must notice the
78  * aborted scan and set the incomplete flag.
79  *
80  * Note that we use jump labels and srcu notifier hooks to minimize the
81  * overhead when live nlinks is /not/ running.  Locking order for nlink
82  * observations is inode ILOCK -> iscan_lock/xchk_nlink_ctrs lock.
83  */
84 
85 /*
86  * Add a delta to an nlink counter, clamping the value to U32_MAX.  Because
87  * XFS_MAXLINK < U32_MAX, the checking code will produce the correct results
88  * even if we lose some precision.
89  */
90 static inline void
91 careful_add(
92 	xfs_nlink_t	*nlinkp,
93 	int		delta)
94 {
95 	uint64_t	new_value = (uint64_t)(*nlinkp) + delta;
96 
97 	BUILD_BUG_ON(XFS_MAXLINK > U32_MAX);
98 	*nlinkp = min_t(uint64_t, new_value, U32_MAX);
99 }
100 
101 /* Update incore link count information.  Caller must hold the nlinks lock. */
102 STATIC int
103 xchk_nlinks_update_incore(
104 	struct xchk_nlink_ctrs	*xnc,
105 	xfs_ino_t		ino,
106 	int			parents_delta,
107 	int			backrefs_delta,
108 	int			children_delta)
109 {
110 	struct xchk_nlink	nl;
111 	int			error;
112 
113 	if (!xnc->nlinks)
114 		return 0;
115 
116 	error = xfarray_load_sparse(xnc->nlinks, ino, &nl);
117 	if (error)
118 		return error;
119 
120 	trace_xchk_nlinks_update_incore(xnc->sc->mp, ino, &nl, parents_delta,
121 			backrefs_delta, children_delta);
122 
123 	careful_add(&nl.parents, parents_delta);
124 	careful_add(&nl.backrefs, backrefs_delta);
125 	careful_add(&nl.children, children_delta);
126 
127 	nl.flags |= XCHK_NLINK_WRITTEN;
128 	error = xfarray_store(xnc->nlinks, ino, &nl);
129 	if (error == -EFBIG) {
130 		/*
131 		 * EFBIG means we tried to store data at too high a byte offset
132 		 * in the sparse array.  IOWs, we cannot complete the check and
133 		 * must notify userspace that the check was incomplete.
134 		 */
135 		error = -ECANCELED;
136 	}
137 	return error;
138 }
139 
140 /*
141  * Apply a link count change from the regular filesystem into our shadow link
142  * count structure based on a directory update in progress.
143  */
144 STATIC int
145 xchk_nlinks_live_update(
146 	struct notifier_block		*nb,
147 	unsigned long			action,
148 	void				*data)
149 {
150 	struct xfs_dir_update_params	*p = data;
151 	struct xchk_nlink_ctrs		*xnc;
152 	int				error;
153 
154 	xnc = container_of(nb, struct xchk_nlink_ctrs, dhook.dirent_hook.nb);
155 
156 	/*
157 	 * Ignore temporary directories being used to stage dir repairs, since
158 	 * we don't bump the link counts of the children.
159 	 */
160 	if (xrep_is_tempfile(p->dp))
161 		return NOTIFY_DONE;
162 
163 	trace_xchk_nlinks_live_update(xnc->sc->mp, p->dp, action, p->ip->i_ino,
164 			p->delta, p->name->name, p->name->len);
165 
166 	/*
167 	 * If we've already scanned @dp, update the number of parents that link
168 	 * to @ip.  If @ip is a subdirectory, update the number of child links
169 	 * going out of @dp.
170 	 */
171 	if (xchk_iscan_want_live_update(&xnc->collect_iscan, p->dp->i_ino)) {
172 		mutex_lock(&xnc->lock);
173 		error = xchk_nlinks_update_incore(xnc, p->ip->i_ino, p->delta,
174 				0, 0);
175 		if (!error && S_ISDIR(VFS_IC(p->ip)->i_mode))
176 			error = xchk_nlinks_update_incore(xnc, p->dp->i_ino, 0,
177 					0, p->delta);
178 		mutex_unlock(&xnc->lock);
179 		if (error)
180 			goto out_abort;
181 	}
182 
183 	/*
184 	 * If @ip is a subdirectory and we've already scanned it, update the
185 	 * number of backrefs pointing to @dp.
186 	 */
187 	if (S_ISDIR(VFS_IC(p->ip)->i_mode) &&
188 	    xchk_iscan_want_live_update(&xnc->collect_iscan, p->ip->i_ino)) {
189 		mutex_lock(&xnc->lock);
190 		error = xchk_nlinks_update_incore(xnc, p->dp->i_ino, 0,
191 				p->delta, 0);
192 		mutex_unlock(&xnc->lock);
193 		if (error)
194 			goto out_abort;
195 	}
196 
197 	return NOTIFY_DONE;
198 
199 out_abort:
200 	xchk_iscan_abort(&xnc->collect_iscan);
201 	return NOTIFY_DONE;
202 }
203 
204 /* Bump the observed link count for the inode referenced by this entry. */
205 STATIC int
206 xchk_nlinks_collect_dirent(
207 	struct xfs_scrub	*sc,
208 	struct xfs_inode	*dp,
209 	xfs_dir2_dataptr_t	dapos,
210 	const struct xfs_name	*name,
211 	xfs_ino_t		ino,
212 	void			*priv)
213 {
214 	struct xchk_nlink_ctrs	*xnc = priv;
215 	bool			dot = false, dotdot = false;
216 	int			error;
217 
218 	/* Does this name make sense? */
219 	if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len)) {
220 		error = -ECANCELED;
221 		goto out_abort;
222 	}
223 
224 	if (name->len == 1 && name->name[0] == '.')
225 		dot = true;
226 	else if (name->len == 2 && name->name[0] == '.' &&
227 				   name->name[1] == '.')
228 		dotdot = true;
229 
230 	/* Don't accept a '.' entry that points somewhere else. */
231 	if (dot && ino != dp->i_ino) {
232 		error = -ECANCELED;
233 		goto out_abort;
234 	}
235 
236 	/* Don't accept an invalid inode number. */
237 	if (!xfs_verify_dir_ino(sc->mp, ino)) {
238 		error = -ECANCELED;
239 		goto out_abort;
240 	}
241 
242 	/* Update the shadow link counts if we haven't already failed. */
243 
244 	if (xchk_iscan_aborted(&xnc->collect_iscan)) {
245 		error = -ECANCELED;
246 		goto out_incomplete;
247 	}
248 
249 	trace_xchk_nlinks_collect_dirent(sc->mp, dp, ino, name);
250 
251 	mutex_lock(&xnc->lock);
252 
253 	/*
254 	 * If this is a dotdot entry, it is a back link from dp to ino.  How
255 	 * we handle this depends on whether or not dp is the root directory.
256 	 *
257 	 * The root directory is its own parent, so we pretend the dotdot entry
258 	 * establishes the "parent" of the root directory.  Increment the
259 	 * number of parents of the root directory.
260 	 *
261 	 * Otherwise, increment the number of backrefs pointing back to ino.
262 	 */
263 	if (dotdot) {
264 		if (dp == sc->mp->m_rootip)
265 			error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0);
266 		else
267 			error = xchk_nlinks_update_incore(xnc, ino, 0, 1, 0);
268 		if (error)
269 			goto out_unlock;
270 	}
271 
272 	/*
273 	 * If this dirent is a forward link from dp to ino, increment the
274 	 * number of parents linking into ino.
275 	 */
276 	if (!dot && !dotdot) {
277 		error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0);
278 		if (error)
279 			goto out_unlock;
280 	}
281 
282 	/*
283 	 * If this dirent is a forward link to a subdirectory, increment the
284 	 * number of child links of dp.
285 	 */
286 	if (!dot && !dotdot && name->type == XFS_DIR3_FT_DIR) {
287 		error = xchk_nlinks_update_incore(xnc, dp->i_ino, 0, 0, 1);
288 		if (error)
289 			goto out_unlock;
290 	}
291 
292 	mutex_unlock(&xnc->lock);
293 	return 0;
294 
295 out_unlock:
296 	mutex_unlock(&xnc->lock);
297 out_abort:
298 	xchk_iscan_abort(&xnc->collect_iscan);
299 out_incomplete:
300 	xchk_set_incomplete(sc);
301 	return error;
302 }
303 
304 /* Walk a directory to bump the observed link counts of the children. */
305 STATIC int
306 xchk_nlinks_collect_dir(
307 	struct xchk_nlink_ctrs	*xnc,
308 	struct xfs_inode	*dp)
309 {
310 	struct xfs_scrub	*sc = xnc->sc;
311 	unsigned int		lock_mode;
312 	int			error = 0;
313 
314 	/*
315 	 * Ignore temporary directories being used to stage dir repairs, since
316 	 * we don't bump the link counts of the children.
317 	 */
318 	if (xrep_is_tempfile(dp))
319 		return 0;
320 
321 	/* Prevent anyone from changing this directory while we walk it. */
322 	xfs_ilock(dp, XFS_IOLOCK_SHARED);
323 	lock_mode = xfs_ilock_data_map_shared(dp);
324 
325 	/*
326 	 * The dotdot entry of an unlinked directory still points to the last
327 	 * parent, but the parent no longer links to this directory.  Skip the
328 	 * directory to avoid overcounting.
329 	 */
330 	if (VFS_I(dp)->i_nlink == 0)
331 		goto out_unlock;
332 
333 	/*
334 	 * We cannot count file links if the directory looks as though it has
335 	 * been zapped by the inode record repair code.
336 	 */
337 	if (xchk_dir_looks_zapped(dp)) {
338 		error = -EBUSY;
339 		goto out_abort;
340 	}
341 
342 	error = xchk_dir_walk(sc, dp, xchk_nlinks_collect_dirent, xnc);
343 	if (error == -ECANCELED) {
344 		error = 0;
345 		goto out_unlock;
346 	}
347 	if (error)
348 		goto out_abort;
349 
350 	xchk_iscan_mark_visited(&xnc->collect_iscan, dp);
351 	goto out_unlock;
352 
353 out_abort:
354 	xchk_set_incomplete(sc);
355 	xchk_iscan_abort(&xnc->collect_iscan);
356 out_unlock:
357 	xfs_iunlock(dp, lock_mode);
358 	xfs_iunlock(dp, XFS_IOLOCK_SHARED);
359 	return error;
360 }
361 
362 /* If this looks like a valid pointer, count it. */
363 static inline int
364 xchk_nlinks_collect_metafile(
365 	struct xchk_nlink_ctrs	*xnc,
366 	xfs_ino_t		ino)
367 {
368 	if (!xfs_verify_ino(xnc->sc->mp, ino))
369 		return 0;
370 
371 	trace_xchk_nlinks_collect_metafile(xnc->sc->mp, ino);
372 	return xchk_nlinks_update_incore(xnc, ino, 1, 0, 0);
373 }
374 
375 /* Bump the link counts of metadata files rooted in the superblock. */
376 STATIC int
377 xchk_nlinks_collect_metafiles(
378 	struct xchk_nlink_ctrs	*xnc)
379 {
380 	struct xfs_mount	*mp = xnc->sc->mp;
381 	int			error = -ECANCELED;
382 
383 
384 	if (xchk_iscan_aborted(&xnc->collect_iscan))
385 		goto out_incomplete;
386 
387 	mutex_lock(&xnc->lock);
388 	error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_rbmino);
389 	if (error)
390 		goto out_abort;
391 
392 	error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_rsumino);
393 	if (error)
394 		goto out_abort;
395 
396 	error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_uquotino);
397 	if (error)
398 		goto out_abort;
399 
400 	error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_gquotino);
401 	if (error)
402 		goto out_abort;
403 
404 	error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_pquotino);
405 	if (error)
406 		goto out_abort;
407 	mutex_unlock(&xnc->lock);
408 
409 	return 0;
410 
411 out_abort:
412 	mutex_unlock(&xnc->lock);
413 	xchk_iscan_abort(&xnc->collect_iscan);
414 out_incomplete:
415 	xchk_set_incomplete(xnc->sc);
416 	return error;
417 }
418 
419 /* Advance the collection scan cursor for this non-directory file. */
420 static inline int
421 xchk_nlinks_collect_file(
422 	struct xchk_nlink_ctrs	*xnc,
423 	struct xfs_inode	*ip)
424 {
425 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
426 	xchk_iscan_mark_visited(&xnc->collect_iscan, ip);
427 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
428 	return 0;
429 }
430 
431 /* Walk all directories and count inode links. */
432 STATIC int
433 xchk_nlinks_collect(
434 	struct xchk_nlink_ctrs	*xnc)
435 {
436 	struct xfs_scrub	*sc = xnc->sc;
437 	struct xfs_inode	*ip;
438 	int			error;
439 
440 	/* Count the rt and quota files that are rooted in the superblock. */
441 	error = xchk_nlinks_collect_metafiles(xnc);
442 	if (error)
443 		return error;
444 
445 	/*
446 	 * Set up for a potentially lengthy filesystem scan by reducing our
447 	 * transaction resource usage for the duration.  Specifically:
448 	 *
449 	 * Cancel the transaction to release the log grant space while we scan
450 	 * the filesystem.
451 	 *
452 	 * Create a new empty transaction to eliminate the possibility of the
453 	 * inode scan deadlocking on cyclical metadata.
454 	 *
455 	 * We pass the empty transaction to the file scanning function to avoid
456 	 * repeatedly cycling empty transactions.  This can be done even though
457 	 * we take the IOLOCK to quiesce the file because empty transactions
458 	 * do not take sb_internal.
459 	 */
460 	xchk_trans_cancel(sc);
461 	error = xchk_trans_alloc_empty(sc);
462 	if (error)
463 		return error;
464 
465 	while ((error = xchk_iscan_iter(&xnc->collect_iscan, &ip)) == 1) {
466 		if (S_ISDIR(VFS_I(ip)->i_mode))
467 			error = xchk_nlinks_collect_dir(xnc, ip);
468 		else
469 			error = xchk_nlinks_collect_file(xnc, ip);
470 		xchk_irele(sc, ip);
471 		if (error)
472 			break;
473 
474 		if (xchk_should_terminate(sc, &error))
475 			break;
476 	}
477 	xchk_iscan_iter_finish(&xnc->collect_iscan);
478 	if (error) {
479 		xchk_set_incomplete(sc);
480 		/*
481 		 * If we couldn't grab an inode that was busy with a state
482 		 * change, change the error code so that we exit to userspace
483 		 * as quickly as possible.
484 		 */
485 		if (error == -EBUSY)
486 			return -ECANCELED;
487 		return error;
488 	}
489 
490 	/*
491 	 * Switch out for a real transaction in preparation for building a new
492 	 * tree.
493 	 */
494 	xchk_trans_cancel(sc);
495 	return xchk_setup_fs(sc);
496 }
497 
498 /*
499  * Part 2: Comparing file link counters.  Walk each inode and compare the link
500  * counts against our shadow information; and then walk each shadow link count
501  * structure (that wasn't covered in the first part), comparing it against the
502  * file.
503  */
504 
505 /* Read the observed link count for comparison with the actual inode. */
506 STATIC int
507 xchk_nlinks_comparison_read(
508 	struct xchk_nlink_ctrs	*xnc,
509 	xfs_ino_t		ino,
510 	struct xchk_nlink	*obs)
511 {
512 	struct xchk_nlink	nl;
513 	int			error;
514 
515 	error = xfarray_load_sparse(xnc->nlinks, ino, &nl);
516 	if (error)
517 		return error;
518 
519 	nl.flags |= (XCHK_NLINK_COMPARE_SCANNED | XCHK_NLINK_WRITTEN);
520 
521 	error = xfarray_store(xnc->nlinks, ino, &nl);
522 	if (error == -EFBIG) {
523 		/*
524 		 * EFBIG means we tried to store data at too high a byte offset
525 		 * in the sparse array.  IOWs, we cannot complete the check and
526 		 * must notify userspace that the check was incomplete.  This
527 		 * shouldn't really happen outside of the collection phase.
528 		 */
529 		xchk_set_incomplete(xnc->sc);
530 		return -ECANCELED;
531 	}
532 	if (error)
533 		return error;
534 
535 	/* Copy the counters, but do not expose the internal state. */
536 	obs->parents = nl.parents;
537 	obs->backrefs = nl.backrefs;
538 	obs->children = nl.children;
539 	obs->flags = 0;
540 	return 0;
541 }
542 
543 /* Check our link count against an inode. */
544 STATIC int
545 xchk_nlinks_compare_inode(
546 	struct xchk_nlink_ctrs	*xnc,
547 	struct xfs_inode	*ip)
548 {
549 	struct xchk_nlink	obs;
550 	struct xfs_scrub	*sc = xnc->sc;
551 	uint64_t		total_links;
552 	unsigned int		actual_nlink;
553 	int			error;
554 
555 	/*
556 	 * Ignore temporary files being used to stage repairs, since we assume
557 	 * they're correct for non-directories, and the directory repair code
558 	 * doesn't bump the link counts for the children.
559 	 */
560 	if (xrep_is_tempfile(ip))
561 		return 0;
562 
563 	xfs_ilock(ip, XFS_ILOCK_SHARED);
564 	mutex_lock(&xnc->lock);
565 
566 	if (xchk_iscan_aborted(&xnc->collect_iscan)) {
567 		xchk_set_incomplete(xnc->sc);
568 		error = -ECANCELED;
569 		goto out_scanlock;
570 	}
571 
572 	error = xchk_nlinks_comparison_read(xnc, ip->i_ino, &obs);
573 	if (error)
574 		goto out_scanlock;
575 
576 	/*
577 	 * If we don't have ftype to get an accurate count of the subdirectory
578 	 * entries in this directory, take advantage of the fact that on a
579 	 * consistent ftype=0 filesystem, the number of subdirectory
580 	 * backreferences (dotdot entries) pointing towards this directory
581 	 * should be equal to the number of subdirectory entries in the
582 	 * directory.
583 	 */
584 	if (!xfs_has_ftype(sc->mp) && S_ISDIR(VFS_I(ip)->i_mode))
585 		obs.children = obs.backrefs;
586 
587 	total_links = xchk_nlink_total(ip, &obs);
588 	actual_nlink = VFS_I(ip)->i_nlink;
589 
590 	trace_xchk_nlinks_compare_inode(sc->mp, ip, &obs);
591 
592 	/*
593 	 * If we found so many parents that we'd overflow i_nlink, we must flag
594 	 * this as a corruption.  The VFS won't let users increase the link
595 	 * count, but it will let them decrease it.
596 	 */
597 	if (total_links > XFS_MAXLINK) {
598 		xchk_ino_set_corrupt(sc, ip->i_ino);
599 		goto out_corrupt;
600 	}
601 
602 	/* Link counts should match. */
603 	if (total_links != actual_nlink) {
604 		xchk_ino_set_corrupt(sc, ip->i_ino);
605 		goto out_corrupt;
606 	}
607 
608 	if (S_ISDIR(VFS_I(ip)->i_mode) && actual_nlink > 0) {
609 		/*
610 		 * The collection phase ignores directories with zero link
611 		 * count, so we ignore them here too.
612 		 *
613 		 * The number of subdirectory backreferences (dotdot entries)
614 		 * pointing towards this directory should be equal to the
615 		 * number of subdirectory entries in the directory.
616 		 */
617 		if (obs.children != obs.backrefs)
618 			xchk_ino_xref_set_corrupt(sc, ip->i_ino);
619 	} else {
620 		/*
621 		 * Non-directories and unlinked directories should not have
622 		 * back references.
623 		 */
624 		if (obs.backrefs != 0) {
625 			xchk_ino_set_corrupt(sc, ip->i_ino);
626 			goto out_corrupt;
627 		}
628 
629 		/*
630 		 * Non-directories and unlinked directories should not have
631 		 * children.
632 		 */
633 		if (obs.children != 0) {
634 			xchk_ino_set_corrupt(sc, ip->i_ino);
635 			goto out_corrupt;
636 		}
637 	}
638 
639 	if (ip == sc->mp->m_rootip) {
640 		/*
641 		 * For the root of a directory tree, both the '.' and '..'
642 		 * entries should point to the root directory.  The dotdot
643 		 * entry is counted as a parent of the root /and/ a backref of
644 		 * the root directory.
645 		 */
646 		if (obs.parents != 1) {
647 			xchk_ino_set_corrupt(sc, ip->i_ino);
648 			goto out_corrupt;
649 		}
650 	} else if (actual_nlink > 0) {
651 		/*
652 		 * Linked files that are not the root directory should have at
653 		 * least one parent.
654 		 */
655 		if (obs.parents == 0) {
656 			xchk_ino_set_corrupt(sc, ip->i_ino);
657 			goto out_corrupt;
658 		}
659 	}
660 
661 out_corrupt:
662 	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
663 		error = -ECANCELED;
664 out_scanlock:
665 	mutex_unlock(&xnc->lock);
666 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
667 	return error;
668 }
669 
670 /*
671  * Check our link count against an inode that wasn't checked previously.  This
672  * is intended to catch directories with dangling links, though we could be
673  * racing with inode allocation in other threads.
674  */
675 STATIC int
676 xchk_nlinks_compare_inum(
677 	struct xchk_nlink_ctrs	*xnc,
678 	xfs_ino_t		ino)
679 {
680 	struct xchk_nlink	obs;
681 	struct xfs_mount	*mp = xnc->sc->mp;
682 	struct xfs_trans	*tp = xnc->sc->tp;
683 	struct xfs_buf		*agi_bp;
684 	struct xfs_inode	*ip;
685 	int			error;
686 
687 	/*
688 	 * The first iget failed, so try again with the variant that returns
689 	 * either an incore inode or the AGI buffer.  If the function returns
690 	 * EINVAL/ENOENT, it should have passed us the AGI buffer so that we
691 	 * can guarantee that the inode won't be allocated while we check for
692 	 * a zero link count in the observed link count data.
693 	 */
694 	error = xchk_iget_agi(xnc->sc, ino, &agi_bp, &ip);
695 	if (!error) {
696 		/* Actually got an inode, so use the inode compare. */
697 		error = xchk_nlinks_compare_inode(xnc, ip);
698 		xchk_irele(xnc->sc, ip);
699 		return error;
700 	}
701 	if (error == -ENOENT || error == -EINVAL) {
702 		/* No inode was found.  Check for zero link count below. */
703 		error = 0;
704 	}
705 	if (error)
706 		goto out_agi;
707 
708 	/* Ensure that we have protected against inode allocation/freeing. */
709 	if (agi_bp == NULL) {
710 		ASSERT(agi_bp != NULL);
711 		xchk_set_incomplete(xnc->sc);
712 		return -ECANCELED;
713 	}
714 
715 	if (xchk_iscan_aborted(&xnc->collect_iscan)) {
716 		xchk_set_incomplete(xnc->sc);
717 		error = -ECANCELED;
718 		goto out_agi;
719 	}
720 
721 	mutex_lock(&xnc->lock);
722 	error = xchk_nlinks_comparison_read(xnc, ino, &obs);
723 	if (error)
724 		goto out_scanlock;
725 
726 	trace_xchk_nlinks_check_zero(mp, ino, &obs);
727 
728 	/*
729 	 * If we can't grab the inode, the link count had better be zero.  We
730 	 * still hold the AGI to prevent inode allocation/freeing.
731 	 */
732 	if (xchk_nlink_total(NULL, &obs) != 0) {
733 		xchk_ino_set_corrupt(xnc->sc, ino);
734 		error = -ECANCELED;
735 	}
736 
737 out_scanlock:
738 	mutex_unlock(&xnc->lock);
739 out_agi:
740 	if (agi_bp)
741 		xfs_trans_brelse(tp, agi_bp);
742 	return error;
743 }
744 
745 /*
746  * Try to visit every inode in the filesystem to compare the link count.  Move
747  * on if we can't grab an inode, since we'll revisit unchecked nlink records in
748  * the second part.
749  */
750 static int
751 xchk_nlinks_compare_iter(
752 	struct xchk_nlink_ctrs	*xnc,
753 	struct xfs_inode	**ipp)
754 {
755 	int			error;
756 
757 	do {
758 		error = xchk_iscan_iter(&xnc->compare_iscan, ipp);
759 	} while (error == -EBUSY);
760 
761 	return error;
762 }
763 
764 /* Compare the link counts we observed against the live information. */
765 STATIC int
766 xchk_nlinks_compare(
767 	struct xchk_nlink_ctrs	*xnc)
768 {
769 	struct xchk_nlink	nl;
770 	struct xfs_scrub	*sc = xnc->sc;
771 	struct xfs_inode	*ip;
772 	xfarray_idx_t		cur = XFARRAY_CURSOR_INIT;
773 	int			error;
774 
775 	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
776 		return 0;
777 
778 	/*
779 	 * Create a new empty transaction so that we can advance the iscan
780 	 * cursor without deadlocking if the inobt has a cycle and push on the
781 	 * inactivation workqueue.
782 	 */
783 	xchk_trans_cancel(sc);
784 	error = xchk_trans_alloc_empty(sc);
785 	if (error)
786 		return error;
787 
788 	/*
789 	 * Use the inobt to walk all allocated inodes to compare the link
790 	 * counts.  Inodes skipped by _compare_iter will be tried again in the
791 	 * next phase of the scan.
792 	 */
793 	xchk_iscan_start(sc, 0, 0, &xnc->compare_iscan);
794 	while ((error = xchk_nlinks_compare_iter(xnc, &ip)) == 1) {
795 		error = xchk_nlinks_compare_inode(xnc, ip);
796 		xchk_iscan_mark_visited(&xnc->compare_iscan, ip);
797 		xchk_irele(sc, ip);
798 		if (error)
799 			break;
800 
801 		if (xchk_should_terminate(sc, &error))
802 			break;
803 	}
804 	xchk_iscan_iter_finish(&xnc->compare_iscan);
805 	xchk_iscan_teardown(&xnc->compare_iscan);
806 	if (error)
807 		return error;
808 
809 	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
810 		return 0;
811 
812 	/*
813 	 * Walk all the non-null nlink observations that weren't checked in the
814 	 * previous step.
815 	 */
816 	mutex_lock(&xnc->lock);
817 	while ((error = xfarray_iter(xnc->nlinks, &cur, &nl)) == 1) {
818 		xfs_ino_t	ino = cur - 1;
819 
820 		if (nl.flags & XCHK_NLINK_COMPARE_SCANNED)
821 			continue;
822 
823 		mutex_unlock(&xnc->lock);
824 
825 		error = xchk_nlinks_compare_inum(xnc, ino);
826 		if (error)
827 			return error;
828 
829 		if (xchk_should_terminate(xnc->sc, &error))
830 			return error;
831 
832 		mutex_lock(&xnc->lock);
833 	}
834 	mutex_unlock(&xnc->lock);
835 
836 	return error;
837 }
838 
839 /* Tear down everything associated with a nlinks check. */
840 static void
841 xchk_nlinks_teardown_scan(
842 	void			*priv)
843 {
844 	struct xchk_nlink_ctrs	*xnc = priv;
845 
846 	/* Discourage any hook functions that might be running. */
847 	xchk_iscan_abort(&xnc->collect_iscan);
848 
849 	xfs_dir_hook_del(xnc->sc->mp, &xnc->dhook);
850 
851 	xfarray_destroy(xnc->nlinks);
852 	xnc->nlinks = NULL;
853 
854 	xchk_iscan_teardown(&xnc->collect_iscan);
855 	mutex_destroy(&xnc->lock);
856 	xnc->sc = NULL;
857 }
858 
859 /*
860  * Scan all inodes in the entire filesystem to generate link count data.  If
861  * the scan is successful, the counts will be left alive for a repair.  If any
862  * error occurs, we'll tear everything down.
863  */
864 STATIC int
865 xchk_nlinks_setup_scan(
866 	struct xfs_scrub	*sc,
867 	struct xchk_nlink_ctrs	*xnc)
868 {
869 	struct xfs_mount	*mp = sc->mp;
870 	char			*descr;
871 	unsigned long long	max_inos;
872 	xfs_agnumber_t		last_agno = mp->m_sb.sb_agcount - 1;
873 	xfs_agino_t		first_agino, last_agino;
874 	int			error;
875 
876 	ASSERT(xnc->sc == NULL);
877 	xnc->sc = sc;
878 
879 	mutex_init(&xnc->lock);
880 
881 	/* Retry iget every tenth of a second for up to 30 seconds. */
882 	xchk_iscan_start(sc, 30000, 100, &xnc->collect_iscan);
883 
884 	/*
885 	 * Set up enough space to store an nlink record for the highest
886 	 * possible inode number in this system.
887 	 */
888 	xfs_agino_range(mp, last_agno, &first_agino, &last_agino);
889 	max_inos = XFS_AGINO_TO_INO(mp, last_agno, last_agino) + 1;
890 	descr = xchk_xfile_descr(sc, "file link counts");
891 	error = xfarray_create(descr, min(XFS_MAXINUMBER + 1, max_inos),
892 			sizeof(struct xchk_nlink), &xnc->nlinks);
893 	kfree(descr);
894 	if (error)
895 		goto out_teardown;
896 
897 	/*
898 	 * Hook into the directory entry code so that we can capture updates to
899 	 * file link counts.  The hook only triggers for inodes that were
900 	 * already scanned, and the scanner thread takes each inode's ILOCK,
901 	 * which means that any in-progress inode updates will finish before we
902 	 * can scan the inode.
903 	 */
904 	ASSERT(sc->flags & XCHK_FSGATES_DIRENTS);
905 	xfs_dir_hook_setup(&xnc->dhook, xchk_nlinks_live_update);
906 	error = xfs_dir_hook_add(mp, &xnc->dhook);
907 	if (error)
908 		goto out_teardown;
909 
910 	/* Use deferred cleanup to pass the inode link count data to repair. */
911 	sc->buf_cleanup = xchk_nlinks_teardown_scan;
912 	return 0;
913 
914 out_teardown:
915 	xchk_nlinks_teardown_scan(xnc);
916 	return error;
917 }
918 
919 /* Scrub the link count of all inodes on the filesystem. */
920 int
921 xchk_nlinks(
922 	struct xfs_scrub	*sc)
923 {
924 	struct xchk_nlink_ctrs	*xnc = sc->buf;
925 	int			error = 0;
926 
927 	/* Set ourselves up to check link counts on the live filesystem. */
928 	error = xchk_nlinks_setup_scan(sc, xnc);
929 	if (error)
930 		return error;
931 
932 	/* Walk all inodes, picking up link count information. */
933 	error = xchk_nlinks_collect(xnc);
934 	if (!xchk_xref_process_error(sc, 0, 0, &error))
935 		return error;
936 
937 	/* Fail fast if we're not playing with a full dataset. */
938 	if (xchk_iscan_aborted(&xnc->collect_iscan))
939 		xchk_set_incomplete(sc);
940 	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
941 		return 0;
942 
943 	/* Compare link counts. */
944 	error = xchk_nlinks_compare(xnc);
945 	if (!xchk_xref_process_error(sc, 0, 0, &error))
946 		return error;
947 
948 	/* Check one last time for an incomplete dataset. */
949 	if (xchk_iscan_aborted(&xnc->collect_iscan))
950 		xchk_set_incomplete(sc);
951 
952 	return 0;
953 }
954