xref: /linux/fs/xfs/scrub/common.c (revision 7f4f3b14e8079ecde096bd734af10e30d40c27b7)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_btree.h"
13 #include "xfs_log_format.h"
14 #include "xfs_trans.h"
15 #include "xfs_inode.h"
16 #include "xfs_icache.h"
17 #include "xfs_alloc.h"
18 #include "xfs_alloc_btree.h"
19 #include "xfs_ialloc.h"
20 #include "xfs_ialloc_btree.h"
21 #include "xfs_refcount_btree.h"
22 #include "xfs_rmap.h"
23 #include "xfs_rmap_btree.h"
24 #include "xfs_log.h"
25 #include "xfs_trans_priv.h"
26 #include "xfs_da_format.h"
27 #include "xfs_da_btree.h"
28 #include "xfs_dir2_priv.h"
29 #include "xfs_dir2.h"
30 #include "xfs_attr.h"
31 #include "xfs_reflink.h"
32 #include "xfs_ag.h"
33 #include "xfs_error.h"
34 #include "xfs_quota.h"
35 #include "xfs_exchmaps.h"
36 #include "xfs_rtbitmap.h"
37 #include "xfs_rtgroup.h"
38 #include "scrub/scrub.h"
39 #include "scrub/common.h"
40 #include "scrub/trace.h"
41 #include "scrub/repair.h"
42 #include "scrub/health.h"
43 #include "scrub/tempfile.h"
44 
45 /* Common code for the metadata scrubbers. */
46 
47 /*
48  * Handling operational errors.
49  *
50  * The *_process_error() family of functions are used to process error return
51  * codes from functions called as part of a scrub operation.
52  *
53  * If there's no error, we return true to tell the caller that it's ok
54  * to move on to the next check in its list.
55  *
56  * For non-verifier errors (e.g. ENOMEM) we return false to tell the
57  * caller that something bad happened, and we preserve *error so that
58  * the caller can return the *error up the stack to userspace.
59  *
60  * Verifier errors (EFSBADCRC/EFSCORRUPTED) are recorded by setting
61  * OFLAG_CORRUPT in sm_flags and the *error is cleared.  In other words,
62  * we track verifier errors (and failed scrub checks) via OFLAG_CORRUPT,
63  * not via return codes.  We return false to tell the caller that
64  * something bad happened.  Since the error has been cleared, the caller
65  * will (presumably) return that zero and scrubbing will move on to
66  * whatever's next.
67  *
68  * ftrace can be used to record the precise metadata location and the
69  * approximate code location of the failed operation.
70  */
71 
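/*
 * Illustrative caller sketch (not part of the kernel sources): a scrubber
 * typically funnels every return code through one of these helpers.  Here
 * "xchk_example_walk" stands in for any helper that can fail, and "agno"
 * and "agbno" are whatever location the caller is currently examining:
 *
 *	error = xchk_example_walk(sc);
 *	if (!xchk_process_error(sc, agno, agbno, &error))
 *		return error;
 *
 * A verifier failure (-EFSBADCRC/-EFSCORRUPTED) sets OFLAG_CORRUPT and
 * clears *error, so the scrubber returns zero and scrub moves on; an
 * operational error such as -ENOMEM is passed back up to userspace.
 */
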
72 /* Check for operational errors. */
73 static bool
74 __xchk_process_error(
75 	struct xfs_scrub	*sc,
76 	xfs_agnumber_t		agno,
77 	xfs_agblock_t		bno,
78 	int			*error,
79 	__u32			errflag,
80 	void			*ret_ip)
81 {
82 	switch (*error) {
83 	case 0:
84 		return true;
85 	case -EDEADLOCK:
86 	case -ECHRNG:
87 		/* Used to restart an op with deadlock avoidance. */
88 		trace_xchk_deadlock_retry(
89 				sc->ip ? sc->ip : XFS_I(file_inode(sc->file)),
90 				sc->sm, *error);
91 		break;
92 	case -ECANCELED:
93 		/*
94 		 * ECANCELED here means that the caller set one of the scrub
95 		 * outcome flags (corrupt, xfail, xcorrupt) and wants to exit
96 		 * quickly.  Set error to zero and do not continue.
97 		 */
98 		trace_xchk_op_error(sc, agno, bno, *error, ret_ip);
99 		*error = 0;
100 		break;
101 	case -EFSBADCRC:
102 	case -EFSCORRUPTED:
103 		/* Note the badness but don't abort. */
104 		sc->sm->sm_flags |= errflag;
105 		*error = 0;
106 		fallthrough;
107 	default:
108 		trace_xchk_op_error(sc, agno, bno, *error, ret_ip);
109 		break;
110 	}
111 	return false;
112 }
113 
114 bool
115 xchk_process_error(
116 	struct xfs_scrub	*sc,
117 	xfs_agnumber_t		agno,
118 	xfs_agblock_t		bno,
119 	int			*error)
120 {
121 	return __xchk_process_error(sc, agno, bno, error,
122 			XFS_SCRUB_OFLAG_CORRUPT, __return_address);
123 }
124 
125 bool
126 xchk_process_rt_error(
127 	struct xfs_scrub	*sc,
128 	xfs_rgnumber_t		rgno,
129 	xfs_rgblock_t		rgbno,
130 	int			*error)
131 {
132 	return __xchk_process_error(sc, rgno, rgbno, error,
133 			XFS_SCRUB_OFLAG_CORRUPT, __return_address);
134 }
135 
136 bool
137 xchk_xref_process_error(
138 	struct xfs_scrub	*sc,
139 	xfs_agnumber_t		agno,
140 	xfs_agblock_t		bno,
141 	int			*error)
142 {
143 	return __xchk_process_error(sc, agno, bno, error,
144 			XFS_SCRUB_OFLAG_XFAIL, __return_address);
145 }
146 
147 /* Check for operational errors for a file offset. */
148 static bool
149 __xchk_fblock_process_error(
150 	struct xfs_scrub	*sc,
151 	int			whichfork,
152 	xfs_fileoff_t		offset,
153 	int			*error,
154 	__u32			errflag,
155 	void			*ret_ip)
156 {
157 	switch (*error) {
158 	case 0:
159 		return true;
160 	case -EDEADLOCK:
161 	case -ECHRNG:
162 		/* Used to restart an op with deadlock avoidance. */
163 		trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
164 		break;
165 	case -ECANCELED:
166 		/*
167 		 * ECANCELED here means that the caller set one of the scrub
168 		 * outcome flags (corrupt, xfail, xcorrupt) and wants to exit
169 		 * quickly.  Set error to zero and do not continue.
170 		 */
171 		trace_xchk_file_op_error(sc, whichfork, offset, *error,
172 				ret_ip);
173 		*error = 0;
174 		break;
175 	case -EFSBADCRC:
176 	case -EFSCORRUPTED:
177 		/* Note the badness but don't abort. */
178 		sc->sm->sm_flags |= errflag;
179 		*error = 0;
180 		fallthrough;
181 	default:
182 		trace_xchk_file_op_error(sc, whichfork, offset, *error,
183 				ret_ip);
184 		break;
185 	}
186 	return false;
187 }
188 
189 bool
190 xchk_fblock_process_error(
191 	struct xfs_scrub	*sc,
192 	int			whichfork,
193 	xfs_fileoff_t		offset,
194 	int			*error)
195 {
196 	return __xchk_fblock_process_error(sc, whichfork, offset, error,
197 			XFS_SCRUB_OFLAG_CORRUPT, __return_address);
198 }
199 
200 bool
201 xchk_fblock_xref_process_error(
202 	struct xfs_scrub	*sc,
203 	int			whichfork,
204 	xfs_fileoff_t		offset,
205 	int			*error)
206 {
207 	return __xchk_fblock_process_error(sc, whichfork, offset, error,
208 			XFS_SCRUB_OFLAG_XFAIL, __return_address);
209 }
210 
211 /*
212  * Handling scrub corruption/optimization/warning checks.
213  *
214  * The *_set_{corrupt,preen,warning}() family of functions are used to
215  * record the presence of metadata that is incorrect (corrupt), could be
216  * optimized somehow (preen), or should be flagged for administrative
217  * review but is not incorrect (warn).
218  *
219  * ftrace can be used to record the precise metadata location and
220  * approximate code location of the failed check.
221  */
222 
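/*
 * Hypothetical usage sketch: a check that finds a suboptimal but legal
 * value records it with the preen flag, while outright corruption gets
 * the corrupt flag.  "bp" is assumed to be a buffer the caller already
 * holds, and the two predicates are stand-ins for real checks:
 *
 *	if (value_could_be_tidier)
 *		xchk_block_set_preen(sc, bp);
 *	if (value_is_garbage)
 *		xchk_block_set_corrupt(sc, bp);
 *
 * These helpers only set bits in sc->sm->sm_flags and never return
 * errors, so a scrubber can keep going and report everything it finds.
 */
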
223 /* Record a block which could be optimized. */
224 void
225 xchk_block_set_preen(
226 	struct xfs_scrub	*sc,
227 	struct xfs_buf		*bp)
228 {
229 	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
230 	trace_xchk_block_preen(sc, xfs_buf_daddr(bp), __return_address);
231 }
232 
233 /*
234  * Record an inode which could be optimized.  The trace data will
235  * include the block location of the inode record itself.
237  */
238 void
239 xchk_ino_set_preen(
240 	struct xfs_scrub	*sc,
241 	xfs_ino_t		ino)
242 {
243 	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
244 	trace_xchk_ino_preen(sc, ino, __return_address);
245 }
246 
247 /* Record something being wrong with the filesystem primary superblock. */
248 void
249 xchk_set_corrupt(
250 	struct xfs_scrub	*sc)
251 {
252 	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
253 	trace_xchk_fs_error(sc, 0, __return_address);
254 }
255 
256 /* Record a corrupt block. */
257 void
258 xchk_block_set_corrupt(
259 	struct xfs_scrub	*sc,
260 	struct xfs_buf		*bp)
261 {
262 	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
263 	trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
264 }
265 
266 #ifdef CONFIG_XFS_QUOTA
267 /* Record a corrupt quota counter. */
268 void
269 xchk_qcheck_set_corrupt(
270 	struct xfs_scrub	*sc,
271 	unsigned int		dqtype,
272 	xfs_dqid_t		id)
273 {
274 	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
275 	trace_xchk_qcheck_error(sc, dqtype, id, __return_address);
276 }
277 #endif
278 
279 /* Record a corruption while cross-referencing. */
280 void
281 xchk_block_xref_set_corrupt(
282 	struct xfs_scrub	*sc,
283 	struct xfs_buf		*bp)
284 {
285 	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
286 	trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
287 }
288 
289 /*
290  * Record a corrupt inode.  The trace data will include the block
291  * location of the inode record itself.
293  */
294 void
295 xchk_ino_set_corrupt(
296 	struct xfs_scrub	*sc,
297 	xfs_ino_t		ino)
298 {
299 	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
300 	trace_xchk_ino_error(sc, ino, __return_address);
301 }
302 
303 /* Record a corruption while cross-referencing with an inode. */
304 void
305 xchk_ino_xref_set_corrupt(
306 	struct xfs_scrub	*sc,
307 	xfs_ino_t		ino)
308 {
309 	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
310 	trace_xchk_ino_error(sc, ino, __return_address);
311 }
312 
313 /* Record corruption in a block indexed by a file fork. */
314 void
315 xchk_fblock_set_corrupt(
316 	struct xfs_scrub	*sc,
317 	int			whichfork,
318 	xfs_fileoff_t		offset)
319 {
320 	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
321 	trace_xchk_fblock_error(sc, whichfork, offset, __return_address);
322 }
323 
324 /* Record a corruption while cross-referencing a fork block. */
325 void
326 xchk_fblock_xref_set_corrupt(
327 	struct xfs_scrub	*sc,
328 	int			whichfork,
329 	xfs_fileoff_t		offset)
330 {
331 	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
332 	trace_xchk_fblock_error(sc, whichfork, offset, __return_address);
333 }
334 
335 /*
336  * Warn about inodes that need administrative review but are not
337  * incorrect.
338  */
339 void
340 xchk_ino_set_warning(
341 	struct xfs_scrub	*sc,
342 	xfs_ino_t		ino)
343 {
344 	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
345 	trace_xchk_ino_warning(sc, ino, __return_address);
346 }
347 
348 /* Warn about a block indexed by a file fork that needs review. */
349 void
350 xchk_fblock_set_warning(
351 	struct xfs_scrub	*sc,
352 	int			whichfork,
353 	xfs_fileoff_t		offset)
354 {
355 	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
356 	trace_xchk_fblock_warning(sc, whichfork, offset, __return_address);
357 }
358 
359 /* Signal an incomplete scrub. */
360 void
361 xchk_set_incomplete(
362 	struct xfs_scrub	*sc)
363 {
364 	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_INCOMPLETE;
365 	trace_xchk_incomplete(sc, __return_address);
366 }
367 
368 /*
369  * rmap scrubbing -- compute the number of blocks with a given owner,
370  * at least according to the reverse mapping data.
371  */
372 
373 struct xchk_rmap_ownedby_info {
374 	const struct xfs_owner_info	*oinfo;
375 	xfs_filblks_t			*blocks;
376 };
377 
378 STATIC int
379 xchk_count_rmap_ownedby_irec(
380 	struct xfs_btree_cur		*cur,
381 	const struct xfs_rmap_irec	*rec,
382 	void				*priv)
383 {
384 	struct xchk_rmap_ownedby_info	*sroi = priv;
385 	bool				irec_attr;
386 	bool				oinfo_attr;
387 
388 	irec_attr = rec->rm_flags & XFS_RMAP_ATTR_FORK;
389 	oinfo_attr = sroi->oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK;
390 
391 	if (rec->rm_owner != sroi->oinfo->oi_owner)
392 		return 0;
393 
394 	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || irec_attr == oinfo_attr)
395 		(*sroi->blocks) += rec->rm_blockcount;
396 
397 	return 0;
398 }
399 
400 /*
401  * Calculate the number of blocks the rmap thinks are owned by something.
402  * The caller should pass us an rmapbt cursor.
403  */
404 int
405 xchk_count_rmap_ownedby_ag(
406 	struct xfs_scrub		*sc,
407 	struct xfs_btree_cur		*cur,
408 	const struct xfs_owner_info	*oinfo,
409 	xfs_filblks_t			*blocks)
410 {
411 	struct xchk_rmap_ownedby_info	sroi = {
412 		.oinfo			= oinfo,
413 		.blocks			= blocks,
414 	};
415 
416 	*blocks = 0;
417 	return xfs_rmap_query_all(cur, xchk_count_rmap_ownedby_irec,
418 			&sroi);
419 }
420 
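/*
 * Hypothetical caller sketch: with the AG btree cursors set up, a
 * scrubber can ask how many blocks the rmapbt attributes to one owner
 * and compare the answer against its own expectations.  The owner info
 * XFS_RMAP_OINFO_AG is assumed here purely as an example owner:
 *
 *	xfs_filblks_t	blocks;
 *
 *	error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur,
 *			&XFS_RMAP_OINFO_AG, &blocks);
 */
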
421 /*
422  * AG scrubbing
423  *
424  * These helpers facilitate locking an allocation group's header
425  * buffers, setting up cursors for all btrees that are present, and
426  * cleaning everything up once we're through.
427  */
428 
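/*
 * Hypothetical setup sketch: a per-AG scrubber normally pairs
 * xchk_ag_init with xchk_ag_free.  Teardown is usually handled by the
 * scrub core, but it is shown here for completeness:
 *
 *	error = xchk_ag_init(sc, sc->sm->sm_agno, &sc->sa);
 *	if (error)
 *		return error;
 *	... examine sc->sa.agf_bp, sc->sa.bno_cur, and friends ...
 *	xchk_ag_free(sc, &sc->sa);
 */
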
429 /* Decide if we want to return an AG header read failure. */
430 static inline bool
431 want_ag_read_header_failure(
432 	struct xfs_scrub	*sc,
433 	unsigned int		type)
434 {
435 	/* Return all AG header read failures when scanning btrees. */
436 	if (sc->sm->sm_type != XFS_SCRUB_TYPE_AGF &&
437 	    sc->sm->sm_type != XFS_SCRUB_TYPE_AGFL &&
438 	    sc->sm->sm_type != XFS_SCRUB_TYPE_AGI)
439 		return true;
440 	/*
441 	 * If we're scanning a given type of AG header, we only want to
442 	 * see read failures from that specific header.  We'd like the
443 	 * other headers to cross-check them, but this isn't required.
444 	 */
445 	if (sc->sm->sm_type == type)
446 		return true;
447 	return false;
448 }
449 
450 /*
451  * Grab the AG header buffers for the attached perag structure.
452  *
453  * The headers should be released by xchk_ag_free, but as a fail safe we attach
454  * all the buffers we grab to the scrub transaction so they'll all be freed
455  * when we cancel it.
456  */
457 static inline int
458 xchk_perag_read_headers(
459 	struct xfs_scrub	*sc,
460 	struct xchk_ag		*sa)
461 {
462 	int			error;
463 
464 	error = xfs_ialloc_read_agi(sa->pag, sc->tp, 0, &sa->agi_bp);
465 	if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
466 		return error;
467 
468 	error = xfs_alloc_read_agf(sa->pag, sc->tp, 0, &sa->agf_bp);
469 	if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF))
470 		return error;
471 
472 	return 0;
473 }
474 
475 /*
476  * Grab the AG headers for the attached perag structure and wait for pending
477  * intents to drain.
478  */
479 int
480 xchk_perag_drain_and_lock(
481 	struct xfs_scrub	*sc)
482 {
483 	struct xchk_ag		*sa = &sc->sa;
484 	int			error = 0;
485 
486 	ASSERT(sa->pag != NULL);
487 	ASSERT(sa->agi_bp == NULL);
488 	ASSERT(sa->agf_bp == NULL);
489 
490 	do {
491 		if (xchk_should_terminate(sc, &error))
492 			return error;
493 
494 		error = xchk_perag_read_headers(sc, sa);
495 		if (error)
496 			return error;
497 
498 		/*
499 		 * If we've grabbed an inode for scrubbing then we assume that
500 		 * holding its ILOCK will suffice to coordinate with any intent
501 		 * chains involving this inode.
502 		 */
503 		if (sc->ip)
504 			return 0;
505 
506 		/*
507 		 * Decide if this AG is quiet enough for all metadata to be
508 		 * consistent with each other.  XFS allows the AG header buffer
509 		 * locks to cycle across transaction rolls while processing
510 		 * chains of deferred ops, which means that there could be
511 		 * other threads in the middle of processing a chain of
512 		 * deferred ops.  For regular operations we are careful about
513 		 * ordering operations to prevent collisions between threads
514 		 * (which is why we don't need a per-AG lock), but scrub and
515 		 * repair have to serialize against chained operations.
516 		 *
517 		 * We just locked all the AG header buffers; now take a look
518 		 * to see if there are any intents in progress.  If there are,
519 		 * drop the AG headers and wait for the intents to drain.
520 		 * Since we hold all the AG header locks for the duration of
521 		 * the scrub, this is the only time we have to sample the
522 		 * intents counter; any threads increasing it after this point
523 		 * can't possibly be in the middle of a chain of AG metadata
524 		 * updates.
525 		 *
526 		 * Obviously, this should be slanted against scrub and in favor
527 		 * of runtime threads.
528 		 */
529 		if (!xfs_group_intent_busy(pag_group(sa->pag)))
530 			return 0;
531 
532 		if (sa->agf_bp) {
533 			xfs_trans_brelse(sc->tp, sa->agf_bp);
534 			sa->agf_bp = NULL;
535 		}
536 
537 		if (sa->agi_bp) {
538 			xfs_trans_brelse(sc->tp, sa->agi_bp);
539 			sa->agi_bp = NULL;
540 		}
541 
542 		if (!(sc->flags & XCHK_FSGATES_DRAIN))
543 			return -ECHRNG;
544 		error = xfs_group_intent_drain(pag_group(sa->pag));
545 		if (error == -ERESTARTSYS)
546 			error = -EINTR;
547 	} while (!error);
548 
549 	return error;
550 }
551 
552 /*
553  * Grab the per-AG structure, grab all AG header buffers, and wait until there
554  * aren't any pending intents.  Returns -ENOENT if we can't grab the perag
555  * structure.
556  */
557 int
558 xchk_ag_read_headers(
559 	struct xfs_scrub	*sc,
560 	xfs_agnumber_t		agno,
561 	struct xchk_ag		*sa)
562 {
563 	struct xfs_mount	*mp = sc->mp;
564 
565 	ASSERT(!sa->pag);
566 	sa->pag = xfs_perag_get(mp, agno);
567 	if (!sa->pag)
568 		return -ENOENT;
569 
570 	return xchk_perag_drain_and_lock(sc);
571 }
572 
573 /* Release all the AG btree cursors. */
574 void
575 xchk_ag_btcur_free(
576 	struct xchk_ag		*sa)
577 {
578 	if (sa->refc_cur)
579 		xfs_btree_del_cursor(sa->refc_cur, XFS_BTREE_ERROR);
580 	if (sa->rmap_cur)
581 		xfs_btree_del_cursor(sa->rmap_cur, XFS_BTREE_ERROR);
582 	if (sa->fino_cur)
583 		xfs_btree_del_cursor(sa->fino_cur, XFS_BTREE_ERROR);
584 	if (sa->ino_cur)
585 		xfs_btree_del_cursor(sa->ino_cur, XFS_BTREE_ERROR);
586 	if (sa->cnt_cur)
587 		xfs_btree_del_cursor(sa->cnt_cur, XFS_BTREE_ERROR);
588 	if (sa->bno_cur)
589 		xfs_btree_del_cursor(sa->bno_cur, XFS_BTREE_ERROR);
590 
591 	sa->refc_cur = NULL;
592 	sa->rmap_cur = NULL;
593 	sa->fino_cur = NULL;
594 	sa->ino_cur = NULL;
595 	sa->bno_cur = NULL;
596 	sa->cnt_cur = NULL;
597 }
598 
599 /* Initialize all the btree cursors for an AG. */
600 void
601 xchk_ag_btcur_init(
602 	struct xfs_scrub	*sc,
603 	struct xchk_ag		*sa)
604 {
605 	struct xfs_mount	*mp = sc->mp;
606 
607 	if (sa->agf_bp) {
608 		/* Set up a bnobt cursor for cross-referencing. */
609 		sa->bno_cur = xfs_bnobt_init_cursor(mp, sc->tp, sa->agf_bp,
610 				sa->pag);
611 		xchk_ag_btree_del_cursor_if_sick(sc, &sa->bno_cur,
612 				XFS_SCRUB_TYPE_BNOBT);
613 
614 		/* Set up a cntbt cursor for cross-referencing. */
615 		sa->cnt_cur = xfs_cntbt_init_cursor(mp, sc->tp, sa->agf_bp,
616 				sa->pag);
617 		xchk_ag_btree_del_cursor_if_sick(sc, &sa->cnt_cur,
618 				XFS_SCRUB_TYPE_CNTBT);
619 
620 		/* Set up a rmapbt cursor for cross-referencing. */
621 		if (xfs_has_rmapbt(mp)) {
622 			sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp,
623 					sa->agf_bp, sa->pag);
624 			xchk_ag_btree_del_cursor_if_sick(sc, &sa->rmap_cur,
625 					XFS_SCRUB_TYPE_RMAPBT);
626 		}
627 
628 		/* Set up a refcountbt cursor for cross-referencing. */
629 		if (xfs_has_reflink(mp)) {
630 			sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
631 					sa->agf_bp, sa->pag);
632 			xchk_ag_btree_del_cursor_if_sick(sc, &sa->refc_cur,
633 					XFS_SCRUB_TYPE_REFCNTBT);
634 		}
635 	}
636 
637 	if (sa->agi_bp) {
638 		/* Set up an inobt cursor for cross-referencing. */
639 		sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp,
640 				sa->agi_bp);
641 		xchk_ag_btree_del_cursor_if_sick(sc, &sa->ino_cur,
642 				XFS_SCRUB_TYPE_INOBT);
643 
644 		/* Set up a finobt cursor for cross-referencing. */
645 		if (xfs_has_finobt(mp)) {
646 			sa->fino_cur = xfs_finobt_init_cursor(sa->pag, sc->tp,
647 					sa->agi_bp);
648 			xchk_ag_btree_del_cursor_if_sick(sc, &sa->fino_cur,
649 					XFS_SCRUB_TYPE_FINOBT);
650 		}
651 	}
652 }
653 
654 /* Release the AG header context and btree cursors. */
655 void
656 xchk_ag_free(
657 	struct xfs_scrub	*sc,
658 	struct xchk_ag		*sa)
659 {
660 	xchk_ag_btcur_free(sa);
661 	xrep_reset_perag_resv(sc);
662 	if (sa->agf_bp) {
663 		xfs_trans_brelse(sc->tp, sa->agf_bp);
664 		sa->agf_bp = NULL;
665 	}
666 	if (sa->agi_bp) {
667 		xfs_trans_brelse(sc->tp, sa->agi_bp);
668 		sa->agi_bp = NULL;
669 	}
670 	if (sa->pag) {
671 		xfs_perag_put(sa->pag);
672 		sa->pag = NULL;
673 	}
674 }
675 
676 /*
677  * For scrub, grab the perag structure, the AGI, and the AGF headers, in that
678  * order.  Locking order requires us to get the AGI before the AGF.  We use the
679  * transaction to avoid deadlocking on crosslinked metadata buffers; either the
680  * caller passes one in (bmap scrub) or we have to create a transaction
681  * ourselves.  Returns ENOENT if the perag struct cannot be grabbed.
682  */
683 int
684 xchk_ag_init(
685 	struct xfs_scrub	*sc,
686 	xfs_agnumber_t		agno,
687 	struct xchk_ag		*sa)
688 {
689 	int			error;
690 
691 	error = xchk_ag_read_headers(sc, agno, sa);
692 	if (error)
693 		return error;
694 
695 	xchk_ag_btcur_init(sc, sa);
696 	return 0;
697 }
698 
699 #ifdef CONFIG_XFS_RT
700 /*
701  * For scrubbing a realtime group, grab all the in-core resources we'll need to
702  * check the metadata, which means taking the ILOCK of the realtime group's
703  * metadata inodes.  Callers must not join these inodes to the transaction with
704  * non-zero lockflags or concurrency problems will result.  The @rtglock_flags
705  * argument takes XFS_RTGLOCK_* flags.
706  */
707 int
708 xchk_rtgroup_init(
709 	struct xfs_scrub	*sc,
710 	xfs_rgnumber_t		rgno,
711 	struct xchk_rt		*sr)
712 {
713 	ASSERT(sr->rtg == NULL);
714 	ASSERT(sr->rtlock_flags == 0);
715 
716 	sr->rtg = xfs_rtgroup_get(sc->mp, rgno);
717 	if (!sr->rtg)
718 		return -ENOENT;
719 	return 0;
720 }
721 
722 void
723 xchk_rtgroup_lock(
724 	struct xchk_rt		*sr,
725 	unsigned int		rtglock_flags)
726 {
727 	xfs_rtgroup_lock(sr->rtg, rtglock_flags);
728 	sr->rtlock_flags = rtglock_flags;
729 }
730 
731 /*
732  * Unlock the realtime group.  This must be done /after/ committing (or
733  * cancelling) the scrub transaction.
734  */
735 static void
736 xchk_rtgroup_unlock(
737 	struct xchk_rt		*sr)
738 {
739 	ASSERT(sr->rtg != NULL);
740 
741 	if (sr->rtlock_flags) {
742 		xfs_rtgroup_unlock(sr->rtg, sr->rtlock_flags);
743 		sr->rtlock_flags = 0;
744 	}
745 }
746 
747 /*
748  * Unlock the realtime group and release its resources.  This must be done
749  * /after/ committing (or cancelling) the scrub transaction.
750  */
751 void
752 xchk_rtgroup_free(
753 	struct xfs_scrub	*sc,
754 	struct xchk_rt		*sr)
755 {
756 	ASSERT(sr->rtg != NULL);
757 
758 	xchk_rtgroup_unlock(sr);
759 
760 	xfs_rtgroup_put(sr->rtg);
761 	sr->rtg = NULL;
762 }
763 #endif /* CONFIG_XFS_RT */
764 
765 /* Per-scrubber setup functions */
766 
767 void
768 xchk_trans_cancel(
769 	struct xfs_scrub	*sc)
770 {
771 	xfs_trans_cancel(sc->tp);
772 	sc->tp = NULL;
773 }
774 
775 int
776 xchk_trans_alloc_empty(
777 	struct xfs_scrub	*sc)
778 {
779 	return xfs_trans_alloc_empty(sc->mp, &sc->tp);
780 }
781 
782 /*
783  * Grab an empty transaction so that we can re-grab locked buffers if
784  * one of our btrees turns out to be cyclic.
785  *
786  * If we're going to repair something, we need to ask for the largest possible
787  * log reservation so that we can handle the worst case scenario for metadata
788  * updates while rebuilding a metadata item.  We also need to reserve as many
789  * blocks in the head transaction as we think we're going to need to rebuild
790  * the metadata object.
791  */
792 int
793 xchk_trans_alloc(
794 	struct xfs_scrub	*sc,
795 	uint			resblks)
796 {
797 	if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
798 		return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
799 				resblks, 0, 0, &sc->tp);
800 
801 	return xchk_trans_alloc_empty(sc);
802 }
803 
804 /* Set us up with a transaction and an empty context. */
805 int
806 xchk_setup_fs(
807 	struct xfs_scrub	*sc)
808 {
809 	uint			resblks;
810 
811 	resblks = xrep_calc_ag_resblks(sc);
812 	return xchk_trans_alloc(sc, resblks);
813 }
814 
815 /* Set us up with AG headers and btree cursors. */
816 int
817 xchk_setup_ag_btree(
818 	struct xfs_scrub	*sc,
819 	bool			force_log)
820 {
821 	struct xfs_mount	*mp = sc->mp;
822 	int			error;
823 
824 	/*
825 	 * If the caller asks us to checkpoint the log, do so.  This
826 	 * expensive operation should be performed infrequently and only
827 	 * as a last resort.  Any caller that sets force_log should
828 	 * document why they need to do so.
829 	 */
830 	if (force_log) {
831 		error = xchk_checkpoint_log(mp);
832 		if (error)
833 			return error;
834 	}
835 
836 	error = xchk_setup_fs(sc);
837 	if (error)
838 		return error;
839 
840 	return xchk_ag_init(sc, sc->sm->sm_agno, &sc->sa);
841 }
842 
843 /* Push everything out of the log onto disk. */
844 int
845 xchk_checkpoint_log(
846 	struct xfs_mount	*mp)
847 {
848 	int			error;
849 
850 	error = xfs_log_force(mp, XFS_LOG_SYNC);
851 	if (error)
852 		return error;
853 	xfs_ail_push_all_sync(mp->m_ail);
854 	return 0;
855 }
856 
857 /* Verify that an inode is allocated ondisk, then return its cached inode. */
858 int
859 xchk_iget(
860 	struct xfs_scrub	*sc,
861 	xfs_ino_t		inum,
862 	struct xfs_inode	**ipp)
863 {
864 	ASSERT(sc->tp != NULL);
865 
866 	return xfs_iget(sc->mp, sc->tp, inum, XCHK_IGET_FLAGS, 0, ipp);
867 }
868 
869 /*
870  * Try to grab an inode in a manner that avoids races with physical inode
871  * allocation.  If we can't, return the locked AGI buffer so that the caller
872  * can single-step the loading process to see where things went wrong.
873  * Callers must have a valid scrub transaction.
874  *
875  * If the iget succeeds, return 0, a NULL AGI, and the inode.
876  *
877  * If the iget fails, return the error, the locked AGI, and a NULL inode.  This
878  * can include -EINVAL and -ENOENT for invalid inode numbers or inodes that are
879  * no longer allocated; or any other corruption or runtime error.
880  *
881  * If the AGI read fails, return the error, a NULL AGI, and NULL inode.
882  *
883  * If a fatal signal is pending, return -EINTR, a NULL AGI, and a NULL inode.
884  */
885 int
886 xchk_iget_agi(
887 	struct xfs_scrub	*sc,
888 	xfs_ino_t		inum,
889 	struct xfs_buf		**agi_bpp,
890 	struct xfs_inode	**ipp)
891 {
892 	struct xfs_mount	*mp = sc->mp;
893 	struct xfs_trans	*tp = sc->tp;
894 	struct xfs_perag	*pag;
895 	int			error;
896 
897 	ASSERT(sc->tp != NULL);
898 
899 again:
900 	*agi_bpp = NULL;
901 	*ipp = NULL;
902 	error = 0;
903 
904 	if (xchk_should_terminate(sc, &error))
905 		return error;
906 
907 	/*
908 	 * Attach the AGI buffer to the scrub transaction to avoid deadlocks
909 	 * in the iget cache miss path.
910 	 */
911 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
912 	error = xfs_ialloc_read_agi(pag, tp, 0, agi_bpp);
913 	xfs_perag_put(pag);
914 	if (error)
915 		return error;
916 
917 	error = xfs_iget(mp, tp, inum, XFS_IGET_NORETRY | XCHK_IGET_FLAGS, 0,
918 			ipp);
919 	if (error == -EAGAIN) {
920 		/*
921 		 * The inode may be in core but temporarily unavailable and may
922 		 * require the AGI buffer before it can be returned.  Drop the
923 		 * AGI buffer and retry the lookup.
924 		 *
925 		 * Incore lookup will fail with EAGAIN on a cache hit if the
926 		 * inode is queued to the inactivation list.  The inactivation
927 		 * worker may remove the inode from the unlinked list and hence
928 		 * needs the AGI.
929 		 *
930 		 * Hence xchk_iget_agi() needs to drop the AGI lock on EAGAIN
931 		 * to allow inodegc to make progress and move the inode to
932 		 * IRECLAIMABLE state where xfs_iget will be able to return it
933 		 * again if it can lock the inode.
934 		 */
935 		xfs_trans_brelse(tp, *agi_bpp);
936 		delay(1);
937 		goto again;
938 	}
939 	if (error)
940 		return error;
941 
942 	/* We got the inode, so we can release the AGI. */
943 	ASSERT(*ipp != NULL);
944 	xfs_trans_brelse(tp, *agi_bpp);
945 	*agi_bpp = NULL;
946 	return 0;
947 }
948 
949 #ifdef CONFIG_XFS_QUOTA
950 /*
951  * Try to attach dquots to this inode if we think we might want to repair it.
952  * Callers must not hold any ILOCKs.  If the dquots are broken and cannot be
953  * attached, a quotacheck will be scheduled.
954  */
955 int
956 xchk_ino_dqattach(
957 	struct xfs_scrub	*sc)
958 {
959 	ASSERT(sc->tp != NULL);
960 	ASSERT(sc->ip != NULL);
961 
962 	if (!xchk_could_repair(sc))
963 		return 0;
964 
965 	return xrep_ino_dqattach(sc);
966 }
967 #endif
968 
969 /* Install an inode that we opened by handle for scrubbing. */
970 int
971 xchk_install_handle_inode(
972 	struct xfs_scrub	*sc,
973 	struct xfs_inode	*ip)
974 {
975 	if (VFS_I(ip)->i_generation != sc->sm->sm_gen) {
976 		xchk_irele(sc, ip);
977 		return -ENOENT;
978 	}
979 
980 	sc->ip = ip;
981 	return 0;
982 }
983 
984 /*
985  * Install an already-referenced inode for scrubbing.  Get our own reference to
986  * the inode to make disposal simpler.  The inode must not be in I_FREEING or
987  * I_WILL_FREE state!
988  */
989 int
990 xchk_install_live_inode(
991 	struct xfs_scrub	*sc,
992 	struct xfs_inode	*ip)
993 {
994 	if (!igrab(VFS_I(ip))) {
995 		xchk_ino_set_corrupt(sc, ip->i_ino);
996 		return -EFSCORRUPTED;
997 	}
998 
999 	sc->ip = ip;
1000 	return 0;
1001 }
1002 
1003 /*
1004  * In preparation to scrub metadata structures that hang off of an inode,
1005  * grab either the inode referenced in the scrub control structure or the
1006  * inode passed in.  If the inumber does not reference an allocated inode
1007  * record, the function returns ENOENT to end the scrub early.  The inode
1008  * is not locked.
1009  */
1010 int
1011 xchk_iget_for_scrubbing(
1012 	struct xfs_scrub	*sc)
1013 {
1014 	struct xfs_imap		imap;
1015 	struct xfs_mount	*mp = sc->mp;
1016 	struct xfs_perag	*pag;
1017 	struct xfs_buf		*agi_bp;
1018 	struct xfs_inode	*ip_in = XFS_I(file_inode(sc->file));
1019 	struct xfs_inode	*ip = NULL;
1020 	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, sc->sm->sm_ino);
1021 	int			error;
1022 
1023 	ASSERT(sc->tp == NULL);
1024 
1025 	/* We want to scan the inode we already had opened. */
1026 	if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino)
1027 		return xchk_install_live_inode(sc, ip_in);
1028 
1029 	/*
1030 	 * On pre-metadir filesystems, reject internal metadata files.  For
1031 	 * metadir filesystems, limited scrubbing of any file in the metadata
1032 	 * directory tree by handle is allowed, because that is the only way to
1033 	 * validate the lack of parent pointers in the sb-root metadata inodes.
1034 	 */
1035 	if (!xfs_has_metadir(mp) && xfs_is_sb_inum(mp, sc->sm->sm_ino))
1036 		return -ENOENT;
1037 	/* Reject obviously bad inode numbers. */
1038 	if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino))
1039 		return -ENOENT;
1040 
1041 	/* Try a safe untrusted iget. */
1042 	error = xchk_iget_safe(sc, sc->sm->sm_ino, &ip);
1043 	if (!error)
1044 		return xchk_install_handle_inode(sc, ip);
1045 	if (error == -ENOENT)
1046 		return error;
1047 	if (error != -EINVAL)
1048 		goto out_error;
1049 
1050 	/*
1051 	 * EINVAL with IGET_UNTRUSTED probably means one of several things:
1052 	 * userspace gave us an inode number that doesn't correspond to fs
1053 	 * space; the inode btree lacks a record for this inode; or there is a
1054 	 * record, and it says this inode is free.
1055 	 *
1056 	 * We want to look up this inode in the inobt to distinguish two
1057 	 * scenarios: (1) the inobt says the inode is free, in which case
1058 	 * there's nothing to do; and (2) the inobt says the inode is
1059 	 * allocated, but loading it failed due to corruption.
1060 	 *
1061 	 * Allocate a transaction and grab the AGI to prevent inobt activity
1062 	 * in this AG.  Retry the iget in case someone allocated a new inode
1063 	 * after the first iget failed.
1064 	 */
1065 	error = xchk_trans_alloc(sc, 0);
1066 	if (error)
1067 		goto out_error;
1068 
1069 	error = xchk_iget_agi(sc, sc->sm->sm_ino, &agi_bp, &ip);
1070 	if (error == 0) {
1071 		/* Actually got the inode, so install it. */
1072 		xchk_trans_cancel(sc);
1073 		return xchk_install_handle_inode(sc, ip);
1074 	}
1075 	if (error == -ENOENT)
1076 		goto out_gone;
1077 	if (error != -EINVAL)
1078 		goto out_cancel;
1079 
1080 	/* Ensure that we have protected against inode allocation/freeing. */
1081 	if (agi_bp == NULL) {
1082 		ASSERT(agi_bp != NULL);
1083 		error = -ECANCELED;
1084 		goto out_cancel;
1085 	}
1086 
1087 	/*
1088 	 * Untrusted iget failed a second time.  Let's try an inobt lookup.
1089 	 * If the inobt thinks this inode cannot exist inside the filesystem
1090 	 * or is not allocated, return ENOENT to signal that the check
1091 	 * can be skipped.
1092 	 *
1093 	 * If the lookup returns corruption, we'll mark this inode corrupt and
1094 	 * exit to userspace.  There's little chance of fixing anything until
1095 	 * the inobt is straightened out, but there's nothing we can do here.
1096 	 *
1097 	 * If the lookup encounters any other error, exit to userspace.
1098 	 *
1099 	 * If the lookup succeeds, something else must be very wrong in the fs
1100 	 * such that setting up the incore inode failed in some strange way.
1101 	 * Treat those as corruptions.
1102 	 */
1103 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino));
1104 	if (!pag) {
1105 		error = -EFSCORRUPTED;
1106 		goto out_cancel;
1107 	}
1108 
1109 	error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap,
1110 			XFS_IGET_UNTRUSTED);
1111 	xfs_perag_put(pag);
1112 	if (error == -EINVAL || error == -ENOENT)
1113 		goto out_gone;
1114 	if (!error)
1115 		error = -EFSCORRUPTED;
1116 
1117 out_cancel:
1118 	xchk_trans_cancel(sc);
1119 out_error:
1120 	trace_xchk_op_error(sc, agno, XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino),
1121 			error, __return_address);
1122 	return error;
1123 out_gone:
1124 	/* The file is gone, so there's nothing to check. */
1125 	xchk_trans_cancel(sc);
1126 	return -ENOENT;
1127 }
1128 
1129 /* Release an inode, possibly dropping it in the process. */
1130 void
1131 xchk_irele(
1132 	struct xfs_scrub	*sc,
1133 	struct xfs_inode	*ip)
1134 {
1135 	if (sc->tp) {
1136 		/*
1137 		 * If we are in a transaction, we /cannot/ drop the inode
1138 		 * ourselves, because the VFS will trigger writeback, which
1139 		 * can require a transaction.  Clear DONTCACHE to force the
1140 		 * inode to the LRU, where someone else can take care of
1141 		 * dropping it.
1142 		 *
1143 		 * Note that when we grabbed our reference to the inode, it
1144 		 * could have had an active ref and DONTCACHE set if a sysadmin
1145 		 * is trying to coerce a change in file access mode.  icache
1146 		 * hits do not clear DONTCACHE, so we must do it here.
1147 		 */
1148 		spin_lock(&VFS_I(ip)->i_lock);
1149 		VFS_I(ip)->i_state &= ~I_DONTCACHE;
1150 		spin_unlock(&VFS_I(ip)->i_lock);
1151 	}
1152 
1153 	xfs_irele(ip);
1154 }
1155 
1156 /*
1157  * Set us up to scrub metadata mapped by a file's fork.  Callers must not use
1158  * this to operate on user-accessible regular file data because the MMAPLOCK is
1159  * not taken.
1160  */
1161 int
1162 xchk_setup_inode_contents(
1163 	struct xfs_scrub	*sc,
1164 	unsigned int		resblks)
1165 {
1166 	int			error;
1167 
1168 	error = xchk_iget_for_scrubbing(sc);
1169 	if (error)
1170 		return error;
1171 
1172 	error = xrep_tempfile_adjust_directory_tree(sc);
1173 	if (error)
1174 		return error;
1175 
1176 	/* Lock the inode so the VFS cannot touch this file. */
1177 	xchk_ilock(sc, XFS_IOLOCK_EXCL);
1178 
1179 	error = xchk_trans_alloc(sc, resblks);
1180 	if (error)
1181 		goto out;
1182 
1183 	error = xchk_ino_dqattach(sc);
1184 	if (error)
1185 		goto out;
1186 
1187 	xchk_ilock(sc, XFS_ILOCK_EXCL);
1188 out:
1189 	/* scrub teardown will unlock and release the inode for us */
1190 	return error;
1191 }
1192 
1193 void
1194 xchk_ilock(
1195 	struct xfs_scrub	*sc,
1196 	unsigned int		ilock_flags)
1197 {
1198 	xfs_ilock(sc->ip, ilock_flags);
1199 	sc->ilock_flags |= ilock_flags;
1200 }
1201 
1202 bool
1203 xchk_ilock_nowait(
1204 	struct xfs_scrub	*sc,
1205 	unsigned int		ilock_flags)
1206 {
1207 	if (xfs_ilock_nowait(sc->ip, ilock_flags)) {
1208 		sc->ilock_flags |= ilock_flags;
1209 		return true;
1210 	}
1211 
1212 	return false;
1213 }
1214 
1215 void
1216 xchk_iunlock(
1217 	struct xfs_scrub	*sc,
1218 	unsigned int		ilock_flags)
1219 {
1220 	sc->ilock_flags &= ~ilock_flags;
1221 	xfs_iunlock(sc->ip, ilock_flags);
1222 }
1223 
1224 /*
1225  * Predicate that decides if we need to evaluate the cross-reference check.
1226  * If there was an error accessing the cross-reference btree, just delete
1227  * the cursor and skip the check.
1228  */
1229 bool
1230 xchk_should_check_xref(
1231 	struct xfs_scrub	*sc,
1232 	int			*error,
1233 	struct xfs_btree_cur	**curpp)
1234 {
1235 	/* No point in xref if we already know we're corrupt. */
1236 	if (xchk_skip_xref(sc->sm))
1237 		return false;
1238 
1239 	if (*error == 0)
1240 		return true;
1241 
1242 	if (curpp) {
1243 		/* If we've already given up on xref, just bail out. */
1244 		if (!*curpp)
1245 			return false;
1246 
1247 		/* xref error, delete cursor and bail out. */
1248 		xfs_btree_del_cursor(*curpp, XFS_BTREE_ERROR);
1249 		*curpp = NULL;
1250 	}
1251 
1252 	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL;
1253 	trace_xchk_xref_error(sc, *error, __return_address);
1254 
1255 	/*
1256 	 * Errors encountered during cross-referencing with another
1257 	 * data structure should not cause this scrubber to abort.
1258 	 */
1259 	*error = 0;
1260 	return false;
1261 }
1262 
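/*
 * Hypothetical cross-reference sketch: an xref helper typically bails
 * out quietly if the cursor was never set up or a previous xref already
 * failed, and otherwise routes errors through xchk_should_check_xref so
 * that an xref problem never aborts the main check.  Here
 * "xchk_example_xref_query" is a stand-in for a real btree query:
 *
 *	if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm))
 *		return;
 *
 *	error = xchk_example_xref_query(sc->sa.rmap_cur);
 *	if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
 *		return;
 */
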
1263 /* Run the structure verifiers on in-memory buffers to detect bad memory. */
1264 void
1265 xchk_buffer_recheck(
1266 	struct xfs_scrub	*sc,
1267 	struct xfs_buf		*bp)
1268 {
1269 	xfs_failaddr_t		fa;
1270 
1271 	if (bp->b_ops == NULL) {
1272 		xchk_block_set_corrupt(sc, bp);
1273 		return;
1274 	}
1275 	if (bp->b_ops->verify_struct == NULL) {
1276 		xchk_set_incomplete(sc);
1277 		return;
1278 	}
1279 	fa = bp->b_ops->verify_struct(bp);
1280 	if (!fa)
1281 		return;
1282 	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
1283 	trace_xchk_block_error(sc, xfs_buf_daddr(bp), fa);
1284 }
1285 
1286 static inline int
1287 xchk_metadata_inode_subtype(
1288 	struct xfs_scrub	*sc,
1289 	unsigned int		scrub_type)
1290 {
1291 	struct xfs_scrub_subord	*sub;
1292 	int			error;
1293 
1294 	sub = xchk_scrub_create_subord(sc, scrub_type);
1295 	error = sub->sc.ops->scrub(&sub->sc);
1296 	xchk_scrub_free_subord(sub);
1297 	return error;
1298 }
1299 
1300 /*
1301  * Scrub the attr/data forks of a metadata inode.  The metadata inode must be
1302  * pointed to by sc->ip and the ILOCK must be held.
1303  */
1304 int
1305 xchk_metadata_inode_forks(
1306 	struct xfs_scrub	*sc)
1307 {
1308 	bool			shared;
1309 	int			error;
1310 
1311 	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
1312 		return 0;
1313 
1314 	/* Check the inode record. */
1315 	error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE);
1316 	if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
1317 		return error;
1318 
1319 	/* Metadata inodes don't live on the rt device. */
1320 	if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) {
1321 		xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1322 		return 0;
1323 	}
1324 
1325 	/* They should never participate in reflink. */
1326 	if (xfs_is_reflink_inode(sc->ip)) {
1327 		xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1328 		return 0;
1329 	}
1330 
1331 	/* Invoke the data fork scrubber. */
1332 	error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD);
1333 	if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
1334 		return error;
1335 
1336 	/* Look for incorrect shared blocks. */
1337 	if (xfs_has_reflink(sc->mp)) {
1338 		error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip,
1339 				&shared);
1340 		if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0,
1341 				&error))
1342 			return error;
1343 		if (shared)
1344 			xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1345 	}
1346 
1347 	/*
1348 	 * Metadata files can only have extended attributes on metadir
1349 	 * filesystems, either for parent pointers or for actual xattr data.
1350 	 */
1351 	if (xfs_inode_hasattr(sc->ip)) {
1352 		if (!xfs_has_metadir(sc->mp)) {
1353 			xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1354 			return 0;
1355 		}
1356 
1357 		error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA);
1358 		if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
1359 			return error;
1360 	}
1361 
1362 	return 0;
1363 }
1364 
1365 /*
1366  * Enable filesystem hooks (i.e. runtime code patching) before starting a scrub
1367  * operation.  Callers must not hold any locks that intersect with the CPU
1368  * hotplug lock (e.g. writeback locks) because code patching must halt the CPUs
1369  * to change kernel code.
1370  */
1371 void
1372 xchk_fsgates_enable(
1373 	struct xfs_scrub	*sc,
1374 	unsigned int		scrub_fsgates)
1375 {
1376 	ASSERT(!(scrub_fsgates & ~XCHK_FSGATES_ALL));
1377 	ASSERT(!(sc->flags & scrub_fsgates));
1378 
1379 	trace_xchk_fsgates_enable(sc, scrub_fsgates);
1380 
1381 	if (scrub_fsgates & XCHK_FSGATES_DRAIN)
1382 		xfs_drain_wait_enable();
1383 
1384 	if (scrub_fsgates & XCHK_FSGATES_QUOTA)
1385 		xfs_dqtrx_hook_enable();
1386 
1387 	if (scrub_fsgates & XCHK_FSGATES_DIRENTS)
1388 		xfs_dir_hook_enable();
1389 
1390 	if (scrub_fsgates & XCHK_FSGATES_RMAP)
1391 		xfs_rmap_hook_enable();
1392 
1393 	sc->flags |= scrub_fsgates;
1394 }
1395 
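/*
 * Hypothetical caller sketch: a setup function that needs the intent
 * drain (see xchk_perag_drain_and_lock above) would enable the gate
 * before taking any locks.  XCHK_TRY_HARDER is assumed here to be the
 * usual "second pass" flag:
 *
 *	if (sc->flags & XCHK_TRY_HARDER)
 *		xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
 *
 * The first pass returns -ECHRNG instead of waiting for the drain, so
 * the scrub core can retry the operation with the gate enabled.
 */
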
1396 /*
1397  * Decide if this is a cached inode that's also allocated.  The caller
1398  * must hold a reference to an AG and the AGI buffer lock to prevent inodes
1399  * from being allocated or freed.
1400  *
1401  * Look up an inode by number in the given file system.  If the inode number
1402  * is invalid, return -EINVAL.  If the inode is not in cache, return -ENODATA.
1403  * If the inode is being reclaimed, return -ENODATA because we know the inode
1404  * cache cannot be updating the ondisk metadata.
1405  *
1406  * Otherwise, the incore inode is the one we want, and it is either live,
1407  * somewhere in the inactivation machinery, or reclaimable.  The inode is
1408  * allocated if i_mode is nonzero.  In all three cases, the cached inode will
1409  * be more up to date than the ondisk inode buffer, so we must use the incore
1410  * i_mode.
1411  */
1412 int
1413 xchk_inode_is_allocated(
1414 	struct xfs_scrub	*sc,
1415 	xfs_agino_t		agino,
1416 	bool			*inuse)
1417 {
1418 	struct xfs_mount	*mp = sc->mp;
1419 	struct xfs_perag	*pag = sc->sa.pag;
1420 	xfs_ino_t		ino;
1421 	struct xfs_inode	*ip;
1422 	int			error;
1423 
1424 	/* caller must hold perag reference */
1425 	if (pag == NULL) {
1426 		ASSERT(pag != NULL);
1427 		return -EINVAL;
1428 	}
1429 
1430 	/* caller must have AGI buffer */
1431 	if (sc->sa.agi_bp == NULL) {
1432 		ASSERT(sc->sa.agi_bp != NULL);
1433 		return -EINVAL;
1434 	}
1435 
1436 	/* reject inode numbers outside existing AGs */
1437 	ino = xfs_agino_to_ino(pag, agino);
1438 	if (!xfs_verify_ino(mp, ino))
1439 		return -EINVAL;
1440 
1441 	error = -ENODATA;
1442 	rcu_read_lock();
1443 	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
1444 	if (!ip) {
1445 		/* cache miss */
1446 		goto out_rcu;
1447 	}
1448 
1449 	/*
1450 	 * If the inode number doesn't match, the incore inode got reused
1451 	 * during an RCU grace period and the radix tree hasn't been updated.
1452 	 * This isn't the inode we want.
1453 	 */
1454 	spin_lock(&ip->i_flags_lock);
1455 	if (ip->i_ino != ino)
1456 		goto out_skip;
1457 
1458 	trace_xchk_inode_is_allocated(ip);
1459 
1460 	/*
1461 	 * We have an incore inode that matches the inode we want, and the
1462 	 * caller holds the perag structure and the AGI buffer.  Let's check
1463 	 * our assumptions below:
1464 	 */
1465 
1466 #ifdef DEBUG
1467 	/*
1468 	 * (1) If the incore inode is live (i.e. referenced from the dcache),
1469 	 * it will not be INEW, nor will it be in the inactivation or reclaim
1470 	 * machinery.  The ondisk inode had better be allocated.  This is the
1471 	 * most trivial case.
1472 	 */
1473 	if (!(ip->i_flags & (XFS_NEED_INACTIVE | XFS_INEW | XFS_IRECLAIMABLE |
1474 			     XFS_INACTIVATING))) {
1475 		/* live inode */
1476 		ASSERT(VFS_I(ip)->i_mode != 0);
1477 	}
1478 
1479 	/*
1480 	 * If the incore inode is INEW, there are several possibilities:
1481 	 *
1482 	 * (2) For a file that is being created, note that we allocate the
1483 	 * ondisk inode before allocating, initializing, and adding the incore
1484 	 * inode to the radix tree.
1485 	 *
1486 	 * (3) If the incore inode is being recycled, the inode has to be
1487 	 * allocated because we don't allow freed inodes to be recycled.
1488 	 * Recycling doesn't touch i_mode.
1489 	 */
1490 	if (ip->i_flags & XFS_INEW) {
1491 		/* created on disk already or recycling */
1492 		ASSERT(VFS_I(ip)->i_mode != 0);
1493 	}
1494 
1495 	/*
1496 	 * (4) If the inode is queued for inactivation (NEED_INACTIVE) but
1497 	 * inactivation has not started (!INACTIVATING), it is still allocated.
1498 	 */
1499 	if ((ip->i_flags & XFS_NEED_INACTIVE) &&
1500 	    !(ip->i_flags & XFS_INACTIVATING)) {
1501 		/* definitely before difree */
1502 		ASSERT(VFS_I(ip)->i_mode != 0);
1503 	}
1504 #endif
1505 
1506 	/*
1507 	 * If the incore inode is undergoing inactivation (INACTIVATING), there
1508 	 * are two possibilities:
1509 	 *
1510 	 * (5) It is before the point where it would get freed ondisk, in which
1511 	 * case i_mode is still nonzero.
1512 	 *
1513 	 * (6) It has already been freed, in which case i_mode is zero.
1514 	 *
1515 	 * We don't take the ILOCK here, but difree and dialloc update the AGI,
1516 	 * and we've taken the AGI buffer lock, which prevents that from
1517 	 * happening.
1518 	 */
1519 
1520 	/*
1521 	 * (7) Inodes undergoing inactivation (INACTIVATING) or queued for
1522 	 * reclaim (IRECLAIMABLE) could be allocated or free.  i_mode still
1523 	 * reflects the ondisk state.
1524 	 */
1525 
1526 	/*
1527 	 * (8) If the inode is in IFLUSHING, it's safe to query i_mode because
1528 	 * the flush code uses i_mode to format the ondisk inode.
1529 	 */
1530 
1531 	/*
1532 	 * (9) If the inode is in IRECLAIM and was reachable via the radix
1533 	 * tree, it still has the same i_mode as it did before it entered
1534 	 * reclaim.  The inode object is still alive because we hold the RCU
1535 	 * read lock.
1536 	 */
1537 
1538 	*inuse = VFS_I(ip)->i_mode != 0;
1539 	error = 0;
1540 
1541 out_skip:
1542 	spin_unlock(&ip->i_flags_lock);
1543 out_rcu:
1544 	rcu_read_unlock();
1545 	return error;
1546 }
1547 
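/*
 * Hypothetical caller sketch: an inobt scrubber that already holds
 * sc->sa.pag and the locked sc->sa.agi_bp can cross-check an ondisk
 * inode record against the inode cache.  "xchk_example_fallback" is a
 * stand-in for whatever the caller does when the inode isn't cached:
 *
 *	bool	inuse;
 *
 *	error = xchk_inode_is_allocated(sc, agino, &inuse);
 *	if (error == -ENODATA)
 *		return xchk_example_fallback(sc, agino);
 *	if (error)
 *		return error;
 *
 * On success, *inuse reflects the incore i_mode, which is at least as
 * current as the ondisk inode buffer.
 */
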
1548 /* Is this inode a root directory for either tree? */
1549 bool
1550 xchk_inode_is_dirtree_root(const struct xfs_inode *ip)
1551 {
1552 	struct xfs_mount	*mp = ip->i_mount;
1553 
1554 	return ip == mp->m_rootip ||
1555 		(xfs_has_metadir(mp) && ip == mp->m_metadirip);
1556 }
1557 
1558 /* Does the superblock point down to this inode? */
1559 bool
1560 xchk_inode_is_sb_rooted(const struct xfs_inode *ip)
1561 {
1562 	return xchk_inode_is_dirtree_root(ip) ||
1563 	       xfs_is_sb_inum(ip->i_mount, ip->i_ino);
1564 }
1565 
1566 /* What is the root directory inumber for this inode? */
1567 xfs_ino_t
1568 xchk_inode_rootdir_inum(const struct xfs_inode *ip)
1569 {
1570 	struct xfs_mount	*mp = ip->i_mount;
1571 
1572 	if (xfs_is_metadir_inode(ip))
1573 		return mp->m_metadirip->i_ino;
1574 	return mp->m_rootip->i_ino;
1575 }
1576