xref: /linux/fs/xfs/scrub/common.c (revision 58154dbda4345299bff30eb78cbce6bc6dafcf84)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_btree.h"
13 #include "xfs_log_format.h"
14 #include "xfs_trans.h"
15 #include "xfs_inode.h"
16 #include "xfs_icache.h"
17 #include "xfs_alloc.h"
18 #include "xfs_alloc_btree.h"
19 #include "xfs_ialloc.h"
20 #include "xfs_ialloc_btree.h"
21 #include "xfs_refcount_btree.h"
22 #include "xfs_rmap.h"
23 #include "xfs_rmap_btree.h"
24 #include "xfs_log.h"
25 #include "xfs_trans_priv.h"
26 #include "xfs_da_format.h"
27 #include "xfs_da_btree.h"
28 #include "xfs_dir2_priv.h"
29 #include "xfs_attr.h"
30 #include "xfs_reflink.h"
31 #include "xfs_ag.h"
32 #include "xfs_error.h"
33 #include "xfs_quota.h"
34 #include "xfs_exchmaps.h"
35 #include "xfs_rtbitmap.h"
36 #include "scrub/scrub.h"
37 #include "scrub/common.h"
38 #include "scrub/trace.h"
39 #include "scrub/repair.h"
40 #include "scrub/health.h"
41 
42 /* Common code for the metadata scrubbers. */
43 
44 /*
45  * Handling operational errors.
46  *
47  * The *_process_error() family of functions are used to process error return
48  * codes from functions called as part of a scrub operation.
49  *
50  * If there's no error, we return true to tell the caller that it's ok
51  * to move on to the next check in its list.
52  *
53  * For non-verifier errors (e.g. ENOMEM) we return false to tell the
54  * caller that something bad happened, and we preserve *error so that
55  * the caller can return the *error up the stack to userspace.
56  *
57  * Verifier errors (EFSBADCRC/EFSCORRUPTED) are recorded by setting
58  * OFLAG_CORRUPT in sm_flags and the *error is cleared.  In other words,
59  * we track verifier errors (and failed scrub checks) via OFLAG_CORRUPT,
60  * not via return codes.  We return false to tell the caller that
61  * something bad happened.  Since the error has been cleared, the caller
62  * will (presumably) return that zero and scrubbing will move on to
63  * whatever's next.
64  *
65  * ftrace can be used to record the precise metadata location and the
66  * approximate code location of the failed operation.
67  */
68 
69 /* Check for operational errors. */
70 static bool
71 __xchk_process_error(
72 	struct xfs_scrub	*sc,
73 	xfs_agnumber_t		agno,
74 	xfs_agblock_t		bno,
75 	int			*error,
76 	__u32			errflag,
77 	void			*ret_ip)
78 {
79 	switch (*error) {
80 	case 0:
81 		return true;
82 	case -EDEADLOCK:
83 	case -ECHRNG:
84 		/* Used to restart an op with deadlock avoidance. */
85 		trace_xchk_deadlock_retry(
86 				sc->ip ? sc->ip : XFS_I(file_inode(sc->file)),
87 				sc->sm, *error);
88 		break;
89 	case -ECANCELED:
90 		/*
91 		 * ECANCELED here means that the caller set one of the scrub
92 		 * outcome flags (corrupt, xfail, xcorrupt) and wants to exit
93 		 * quickly.  Set error to zero and do not continue.
94 		 */
95 		trace_xchk_op_error(sc, agno, bno, *error, ret_ip);
96 		*error = 0;
97 		break;
98 	case -EFSBADCRC:
99 	case -EFSCORRUPTED:
100 		/* Note the badness but don't abort. */
101 		sc->sm->sm_flags |= errflag;
102 		*error = 0;
103 		fallthrough;
104 	default:
105 		trace_xchk_op_error(sc, agno, bno, *error, ret_ip);
106 		break;
107 	}
108 	return false;
109 }
110 
111 bool
112 xchk_process_error(
113 	struct xfs_scrub	*sc,
114 	xfs_agnumber_t		agno,
115 	xfs_agblock_t		bno,
116 	int			*error)
117 {
118 	return __xchk_process_error(sc, agno, bno, error,
119 			XFS_SCRUB_OFLAG_CORRUPT, __return_address);
120 }
121 
122 bool
123 xchk_xref_process_error(
124 	struct xfs_scrub	*sc,
125 	xfs_agnumber_t		agno,
126 	xfs_agblock_t		bno,
127 	int			*error)
128 {
129 	return __xchk_process_error(sc, agno, bno, error,
130 			XFS_SCRUB_OFLAG_XFAIL, __return_address);
131 }
132 
133 /* Check for operational errors for a file offset. */
134 static bool
135 __xchk_fblock_process_error(
136 	struct xfs_scrub	*sc,
137 	int			whichfork,
138 	xfs_fileoff_t		offset,
139 	int			*error,
140 	__u32			errflag,
141 	void			*ret_ip)
142 {
143 	switch (*error) {
144 	case 0:
145 		return true;
146 	case -EDEADLOCK:
147 	case -ECHRNG:
148 		/* Used to restart an op with deadlock avoidance. */
149 		trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
150 		break;
151 	case -ECANCELED:
152 		/*
153 		 * ECANCELED here means that the caller set one of the scrub
154 		 * outcome flags (corrupt, xfail, xcorrupt) and wants to exit
155 		 * quickly.  Set error to zero and do not continue.
156 		 */
157 		trace_xchk_file_op_error(sc, whichfork, offset, *error,
158 				ret_ip);
159 		*error = 0;
160 		break;
161 	case -EFSBADCRC:
162 	case -EFSCORRUPTED:
163 		/* Note the badness but don't abort. */
164 		sc->sm->sm_flags |= errflag;
165 		*error = 0;
166 		fallthrough;
167 	default:
168 		trace_xchk_file_op_error(sc, whichfork, offset, *error,
169 				ret_ip);
170 		break;
171 	}
172 	return false;
173 }
174 
175 bool
176 xchk_fblock_process_error(
177 	struct xfs_scrub	*sc,
178 	int			whichfork,
179 	xfs_fileoff_t		offset,
180 	int			*error)
181 {
182 	return __xchk_fblock_process_error(sc, whichfork, offset, error,
183 			XFS_SCRUB_OFLAG_CORRUPT, __return_address);
184 }
185 
186 bool
187 xchk_fblock_xref_process_error(
188 	struct xfs_scrub	*sc,
189 	int			whichfork,
190 	xfs_fileoff_t		offset,
191 	int			*error)
192 {
193 	return __xchk_fblock_process_error(sc, whichfork, offset, error,
194 			XFS_SCRUB_OFLAG_XFAIL, __return_address);
195 }
196 
197 /*
198  * Handling scrub corruption/optimization/warning checks.
199  *
200  * The *_set_{corrupt,preen,warning}() family of functions are used to
201  * record the presence of metadata that is incorrect (corrupt), could be
202  * optimized somehow (preen), or should be flagged for administrative
203  * review but is not incorrect (warn).
204  *
205  * ftrace can be used to record the precise metadata location and
206  * approximate code location of the failed check.
207  */
208 
209 /* Record a block which could be optimized. */
210 void
211 xchk_block_set_preen(
212 	struct xfs_scrub	*sc,
213 	struct xfs_buf		*bp)
214 {
215 	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
216 	trace_xchk_block_preen(sc, xfs_buf_daddr(bp), __return_address);
217 }
218 
219 /*
220  * Record an inode which could be optimized.  The trace data will
221  * include the block given by bp if bp is given; otherwise it will use
222  * the block location of the inode record itself.
223  */
224 void
225 xchk_ino_set_preen(
226 	struct xfs_scrub	*sc,
227 	xfs_ino_t		ino)
228 {
229 	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
230 	trace_xchk_ino_preen(sc, ino, __return_address);
231 }
232 
233 /* Record something being wrong with the filesystem primary superblock. */
234 void
235 xchk_set_corrupt(
236 	struct xfs_scrub	*sc)
237 {
238 	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
239 	trace_xchk_fs_error(sc, 0, __return_address);
240 }
241 
242 /* Record a corrupt block. */
243 void
244 xchk_block_set_corrupt(
245 	struct xfs_scrub	*sc,
246 	struct xfs_buf		*bp)
247 {
248 	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
249 	trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
250 }
251 
252 #ifdef CONFIG_XFS_QUOTA
253 /* Record a corrupt quota counter. */
254 void
255 xchk_qcheck_set_corrupt(
256 	struct xfs_scrub	*sc,
257 	unsigned int		dqtype,
258 	xfs_dqid_t		id)
259 {
260 	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
261 	trace_xchk_qcheck_error(sc, dqtype, id, __return_address);
262 }
263 #endif
264 
265 /* Record a corruption while cross-referencing. */
266 void
267 xchk_block_xref_set_corrupt(
268 	struct xfs_scrub	*sc,
269 	struct xfs_buf		*bp)
270 {
271 	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
272 	trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
273 }
274 
275 /*
276  * Record a corrupt inode.  The trace data will include the block given
277  * by bp if bp is given; otherwise it will use the block location of the
278  * inode record itself.
279  */
280 void
281 xchk_ino_set_corrupt(
282 	struct xfs_scrub	*sc,
283 	xfs_ino_t		ino)
284 {
285 	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
286 	trace_xchk_ino_error(sc, ino, __return_address);
287 }
288 
289 /* Record a corruption while cross-referencing with an inode. */
290 void
291 xchk_ino_xref_set_corrupt(
292 	struct xfs_scrub	*sc,
293 	xfs_ino_t		ino)
294 {
295 	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
296 	trace_xchk_ino_error(sc, ino, __return_address);
297 }
298 
299 /* Record corruption in a block indexed by a file fork. */
300 void
301 xchk_fblock_set_corrupt(
302 	struct xfs_scrub	*sc,
303 	int			whichfork,
304 	xfs_fileoff_t		offset)
305 {
306 	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
307 	trace_xchk_fblock_error(sc, whichfork, offset, __return_address);
308 }
309 
310 /* Record a corruption while cross-referencing a fork block. */
311 void
312 xchk_fblock_xref_set_corrupt(
313 	struct xfs_scrub	*sc,
314 	int			whichfork,
315 	xfs_fileoff_t		offset)
316 {
317 	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
318 	trace_xchk_fblock_error(sc, whichfork, offset, __return_address);
319 }
320 
321 /*
322  * Warn about inodes that need administrative review but is not
323  * incorrect.
324  */
325 void
326 xchk_ino_set_warning(
327 	struct xfs_scrub	*sc,
328 	xfs_ino_t		ino)
329 {
330 	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
331 	trace_xchk_ino_warning(sc, ino, __return_address);
332 }
333 
334 /* Warn about a block indexed by a file fork that needs review. */
335 void
336 xchk_fblock_set_warning(
337 	struct xfs_scrub	*sc,
338 	int			whichfork,
339 	xfs_fileoff_t		offset)
340 {
341 	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
342 	trace_xchk_fblock_warning(sc, whichfork, offset, __return_address);
343 }
344 
345 /* Signal an incomplete scrub. */
346 void
347 xchk_set_incomplete(
348 	struct xfs_scrub	*sc)
349 {
350 	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_INCOMPLETE;
351 	trace_xchk_incomplete(sc, __return_address);
352 }
353 
354 /*
355  * rmap scrubbing -- compute the number of blocks with a given owner,
356  * at least according to the reverse mapping data.
357  */
358 
/* Context handed to the rmap query callback that counts owned blocks. */
struct xchk_rmap_ownedby_info {
	/* Owner (and fork flags) whose reverse mappings are being counted. */
	const struct xfs_owner_info	*oinfo;
	/* Running total of matching blocks; points at the caller's counter. */
	xfs_filblks_t			*blocks;
};
363 
364 STATIC int
365 xchk_count_rmap_ownedby_irec(
366 	struct xfs_btree_cur		*cur,
367 	const struct xfs_rmap_irec	*rec,
368 	void				*priv)
369 {
370 	struct xchk_rmap_ownedby_info	*sroi = priv;
371 	bool				irec_attr;
372 	bool				oinfo_attr;
373 
374 	irec_attr = rec->rm_flags & XFS_RMAP_ATTR_FORK;
375 	oinfo_attr = sroi->oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK;
376 
377 	if (rec->rm_owner != sroi->oinfo->oi_owner)
378 		return 0;
379 
380 	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || irec_attr == oinfo_attr)
381 		(*sroi->blocks) += rec->rm_blockcount;
382 
383 	return 0;
384 }
385 
386 /*
387  * Calculate the number of blocks the rmap thinks are owned by something.
388  * The caller should pass us an rmapbt cursor.
389  */
390 int
391 xchk_count_rmap_ownedby_ag(
392 	struct xfs_scrub		*sc,
393 	struct xfs_btree_cur		*cur,
394 	const struct xfs_owner_info	*oinfo,
395 	xfs_filblks_t			*blocks)
396 {
397 	struct xchk_rmap_ownedby_info	sroi = {
398 		.oinfo			= oinfo,
399 		.blocks			= blocks,
400 	};
401 
402 	*blocks = 0;
403 	return xfs_rmap_query_all(cur, xchk_count_rmap_ownedby_irec,
404 			&sroi);
405 }
406 
407 /*
408  * AG scrubbing
409  *
410  * These helpers facilitate locking an allocation group's header
411  * buffers, setting up cursors for all btrees that are present, and
412  * cleaning everything up once we're through.
413  */
414 
415 /* Decide if we want to return an AG header read failure. */
416 static inline bool
417 want_ag_read_header_failure(
418 	struct xfs_scrub	*sc,
419 	unsigned int		type)
420 {
421 	/* Return all AG header read failures when scanning btrees. */
422 	if (sc->sm->sm_type != XFS_SCRUB_TYPE_AGF &&
423 	    sc->sm->sm_type != XFS_SCRUB_TYPE_AGFL &&
424 	    sc->sm->sm_type != XFS_SCRUB_TYPE_AGI)
425 		return true;
426 	/*
427 	 * If we're scanning a given type of AG header, we only want to
428 	 * see read failures from that specific header.  We'd like the
429 	 * other headers to cross-check them, but this isn't required.
430 	 */
431 	if (sc->sm->sm_type == type)
432 		return true;
433 	return false;
434 }
435 
436 /*
437  * Grab the AG header buffers for the attached perag structure.
438  *
439  * The headers should be released by xchk_ag_free, but as a fail safe we attach
440  * all the buffers we grab to the scrub transaction so they'll all be freed
441  * when we cancel it.
442  */
443 static inline int
444 xchk_perag_read_headers(
445 	struct xfs_scrub	*sc,
446 	struct xchk_ag		*sa)
447 {
448 	int			error;
449 
450 	error = xfs_ialloc_read_agi(sa->pag, sc->tp, 0, &sa->agi_bp);
451 	if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
452 		return error;
453 
454 	error = xfs_alloc_read_agf(sa->pag, sc->tp, 0, &sa->agf_bp);
455 	if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF))
456 		return error;
457 
458 	return 0;
459 }
460 
461 /*
462  * Grab the AG headers for the attached perag structure and wait for pending
463  * intents to drain.
464  */
465 int
466 xchk_perag_drain_and_lock(
467 	struct xfs_scrub	*sc)
468 {
469 	struct xchk_ag		*sa = &sc->sa;
470 	int			error = 0;
471 
472 	ASSERT(sa->pag != NULL);
473 	ASSERT(sa->agi_bp == NULL);
474 	ASSERT(sa->agf_bp == NULL);
475 
476 	do {
477 		if (xchk_should_terminate(sc, &error))
478 			return error;
479 
480 		error = xchk_perag_read_headers(sc, sa);
481 		if (error)
482 			return error;
483 
484 		/*
485 		 * If we've grabbed an inode for scrubbing then we assume that
486 		 * holding its ILOCK will suffice to coordinate with any intent
487 		 * chains involving this inode.
488 		 */
489 		if (sc->ip)
490 			return 0;
491 
492 		/*
493 		 * Decide if this AG is quiet enough for all metadata to be
494 		 * consistent with each other.  XFS allows the AG header buffer
495 		 * locks to cycle across transaction rolls while processing
496 		 * chains of deferred ops, which means that there could be
497 		 * other threads in the middle of processing a chain of
498 		 * deferred ops.  For regular operations we are careful about
499 		 * ordering operations to prevent collisions between threads
500 		 * (which is why we don't need a per-AG lock), but scrub and
501 		 * repair have to serialize against chained operations.
502 		 *
503 		 * We just locked all the AG headers buffers; now take a look
504 		 * to see if there are any intents in progress.  If there are,
505 		 * drop the AG headers and wait for the intents to drain.
506 		 * Since we hold all the AG header locks for the duration of
507 		 * the scrub, this is the only time we have to sample the
508 		 * intents counter; any threads increasing it after this point
509 		 * can't possibly be in the middle of a chain of AG metadata
510 		 * updates.
511 		 *
512 		 * Obviously, this should be slanted against scrub and in favor
513 		 * of runtime threads.
514 		 */
515 		if (!xfs_perag_intent_busy(sa->pag))
516 			return 0;
517 
518 		if (sa->agf_bp) {
519 			xfs_trans_brelse(sc->tp, sa->agf_bp);
520 			sa->agf_bp = NULL;
521 		}
522 
523 		if (sa->agi_bp) {
524 			xfs_trans_brelse(sc->tp, sa->agi_bp);
525 			sa->agi_bp = NULL;
526 		}
527 
528 		if (!(sc->flags & XCHK_FSGATES_DRAIN))
529 			return -ECHRNG;
530 		error = xfs_perag_intent_drain(sa->pag);
531 		if (error == -ERESTARTSYS)
532 			error = -EINTR;
533 	} while (!error);
534 
535 	return error;
536 }
537 
538 /*
539  * Grab the per-AG structure, grab all AG header buffers, and wait until there
540  * aren't any pending intents.  Returns -ENOENT if we can't grab the perag
541  * structure.
542  */
543 int
544 xchk_ag_read_headers(
545 	struct xfs_scrub	*sc,
546 	xfs_agnumber_t		agno,
547 	struct xchk_ag		*sa)
548 {
549 	struct xfs_mount	*mp = sc->mp;
550 
551 	ASSERT(!sa->pag);
552 	sa->pag = xfs_perag_get(mp, agno);
553 	if (!sa->pag)
554 		return -ENOENT;
555 
556 	return xchk_perag_drain_and_lock(sc);
557 }
558 
559 /* Release all the AG btree cursors. */
560 void
561 xchk_ag_btcur_free(
562 	struct xchk_ag		*sa)
563 {
564 	if (sa->refc_cur)
565 		xfs_btree_del_cursor(sa->refc_cur, XFS_BTREE_ERROR);
566 	if (sa->rmap_cur)
567 		xfs_btree_del_cursor(sa->rmap_cur, XFS_BTREE_ERROR);
568 	if (sa->fino_cur)
569 		xfs_btree_del_cursor(sa->fino_cur, XFS_BTREE_ERROR);
570 	if (sa->ino_cur)
571 		xfs_btree_del_cursor(sa->ino_cur, XFS_BTREE_ERROR);
572 	if (sa->cnt_cur)
573 		xfs_btree_del_cursor(sa->cnt_cur, XFS_BTREE_ERROR);
574 	if (sa->bno_cur)
575 		xfs_btree_del_cursor(sa->bno_cur, XFS_BTREE_ERROR);
576 
577 	sa->refc_cur = NULL;
578 	sa->rmap_cur = NULL;
579 	sa->fino_cur = NULL;
580 	sa->ino_cur = NULL;
581 	sa->bno_cur = NULL;
582 	sa->cnt_cur = NULL;
583 }
584 
585 /* Initialize all the btree cursors for an AG. */
586 void
587 xchk_ag_btcur_init(
588 	struct xfs_scrub	*sc,
589 	struct xchk_ag		*sa)
590 {
591 	struct xfs_mount	*mp = sc->mp;
592 
593 	if (sa->agf_bp) {
594 		/* Set up a bnobt cursor for cross-referencing. */
595 		sa->bno_cur = xfs_bnobt_init_cursor(mp, sc->tp, sa->agf_bp,
596 				sa->pag);
597 		xchk_ag_btree_del_cursor_if_sick(sc, &sa->bno_cur,
598 				XFS_SCRUB_TYPE_BNOBT);
599 
600 		/* Set up a cntbt cursor for cross-referencing. */
601 		sa->cnt_cur = xfs_cntbt_init_cursor(mp, sc->tp, sa->agf_bp,
602 				sa->pag);
603 		xchk_ag_btree_del_cursor_if_sick(sc, &sa->cnt_cur,
604 				XFS_SCRUB_TYPE_CNTBT);
605 
606 		/* Set up a rmapbt cursor for cross-referencing. */
607 		if (xfs_has_rmapbt(mp)) {
608 			sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp,
609 					sa->agf_bp, sa->pag);
610 			xchk_ag_btree_del_cursor_if_sick(sc, &sa->rmap_cur,
611 					XFS_SCRUB_TYPE_RMAPBT);
612 		}
613 
614 		/* Set up a refcountbt cursor for cross-referencing. */
615 		if (xfs_has_reflink(mp)) {
616 			sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
617 					sa->agf_bp, sa->pag);
618 			xchk_ag_btree_del_cursor_if_sick(sc, &sa->refc_cur,
619 					XFS_SCRUB_TYPE_REFCNTBT);
620 		}
621 	}
622 
623 	if (sa->agi_bp) {
624 		/* Set up a inobt cursor for cross-referencing. */
625 		sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp,
626 				sa->agi_bp);
627 		xchk_ag_btree_del_cursor_if_sick(sc, &sa->ino_cur,
628 				XFS_SCRUB_TYPE_INOBT);
629 
630 		/* Set up a finobt cursor for cross-referencing. */
631 		if (xfs_has_finobt(mp)) {
632 			sa->fino_cur = xfs_finobt_init_cursor(sa->pag, sc->tp,
633 					sa->agi_bp);
634 			xchk_ag_btree_del_cursor_if_sick(sc, &sa->fino_cur,
635 					XFS_SCRUB_TYPE_FINOBT);
636 		}
637 	}
638 }
639 
640 /* Release the AG header context and btree cursors. */
641 void
642 xchk_ag_free(
643 	struct xfs_scrub	*sc,
644 	struct xchk_ag		*sa)
645 {
646 	xchk_ag_btcur_free(sa);
647 	xrep_reset_perag_resv(sc);
648 	if (sa->agf_bp) {
649 		xfs_trans_brelse(sc->tp, sa->agf_bp);
650 		sa->agf_bp = NULL;
651 	}
652 	if (sa->agi_bp) {
653 		xfs_trans_brelse(sc->tp, sa->agi_bp);
654 		sa->agi_bp = NULL;
655 	}
656 	if (sa->pag) {
657 		xfs_perag_put(sa->pag);
658 		sa->pag = NULL;
659 	}
660 }
661 
662 /*
663  * For scrub, grab the perag structure, the AGI, and the AGF headers, in that
664  * order.  Locking order requires us to get the AGI before the AGF.  We use the
665  * transaction to avoid deadlocking on crosslinked metadata buffers; either the
666  * caller passes one in (bmap scrub) or we have to create a transaction
667  * ourselves.  Returns ENOENT if the perag struct cannot be grabbed.
668  */
669 int
670 xchk_ag_init(
671 	struct xfs_scrub	*sc,
672 	xfs_agnumber_t		agno,
673 	struct xchk_ag		*sa)
674 {
675 	int			error;
676 
677 	error = xchk_ag_read_headers(sc, agno, sa);
678 	if (error)
679 		return error;
680 
681 	xchk_ag_btcur_init(sc, sa);
682 	return 0;
683 }
684 
685 /* Per-scrubber setup functions */
686 
687 void
688 xchk_trans_cancel(
689 	struct xfs_scrub	*sc)
690 {
691 	xfs_trans_cancel(sc->tp);
692 	sc->tp = NULL;
693 }
694 
695 int
696 xchk_trans_alloc_empty(
697 	struct xfs_scrub	*sc)
698 {
699 	return xfs_trans_alloc_empty(sc->mp, &sc->tp);
700 }
701 
702 /*
703  * Grab an empty transaction so that we can re-grab locked buffers if
704  * one of our btrees turns out to be cyclic.
705  *
706  * If we're going to repair something, we need to ask for the largest possible
707  * log reservation so that we can handle the worst case scenario for metadata
708  * updates while rebuilding a metadata item.  We also need to reserve as many
709  * blocks in the head transaction as we think we're going to need to rebuild
710  * the metadata object.
711  */
712 int
713 xchk_trans_alloc(
714 	struct xfs_scrub	*sc,
715 	uint			resblks)
716 {
717 	if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
718 		return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
719 				resblks, 0, 0, &sc->tp);
720 
721 	return xchk_trans_alloc_empty(sc);
722 }
723 
724 /* Set us up with a transaction and an empty context. */
725 int
726 xchk_setup_fs(
727 	struct xfs_scrub	*sc)
728 {
729 	uint			resblks;
730 
731 	resblks = xrep_calc_ag_resblks(sc);
732 	return xchk_trans_alloc(sc, resblks);
733 }
734 
735 /* Set us up with AG headers and btree cursors. */
736 int
737 xchk_setup_ag_btree(
738 	struct xfs_scrub	*sc,
739 	bool			force_log)
740 {
741 	struct xfs_mount	*mp = sc->mp;
742 	int			error;
743 
744 	/*
745 	 * If the caller asks us to checkpont the log, do so.  This
746 	 * expensive operation should be performed infrequently and only
747 	 * as a last resort.  Any caller that sets force_log should
748 	 * document why they need to do so.
749 	 */
750 	if (force_log) {
751 		error = xchk_checkpoint_log(mp);
752 		if (error)
753 			return error;
754 	}
755 
756 	error = xchk_setup_fs(sc);
757 	if (error)
758 		return error;
759 
760 	return xchk_ag_init(sc, sc->sm->sm_agno, &sc->sa);
761 }
762 
763 /* Push everything out of the log onto disk. */
764 int
765 xchk_checkpoint_log(
766 	struct xfs_mount	*mp)
767 {
768 	int			error;
769 
770 	error = xfs_log_force(mp, XFS_LOG_SYNC);
771 	if (error)
772 		return error;
773 	xfs_ail_push_all_sync(mp->m_ail);
774 	return 0;
775 }
776 
777 /* Verify that an inode is allocated ondisk, then return its cached inode. */
778 int
779 xchk_iget(
780 	struct xfs_scrub	*sc,
781 	xfs_ino_t		inum,
782 	struct xfs_inode	**ipp)
783 {
784 	ASSERT(sc->tp != NULL);
785 
786 	return xfs_iget(sc->mp, sc->tp, inum, XCHK_IGET_FLAGS, 0, ipp);
787 }
788 
789 /*
790  * Try to grab an inode in a manner that avoids races with physical inode
791  * allocation.  If we can't, return the locked AGI buffer so that the caller
792  * can single-step the loading process to see where things went wrong.
793  * Callers must have a valid scrub transaction.
794  *
795  * If the iget succeeds, return 0, a NULL AGI, and the inode.
796  *
797  * If the iget fails, return the error, the locked AGI, and a NULL inode.  This
798  * can include -EINVAL and -ENOENT for invalid inode numbers or inodes that are
799  * no longer allocated; or any other corruption or runtime error.
800  *
801  * If the AGI read fails, return the error, a NULL AGI, and NULL inode.
802  *
803  * If a fatal signal is pending, return -EINTR, a NULL AGI, and a NULL inode.
804  */
805 int
806 xchk_iget_agi(
807 	struct xfs_scrub	*sc,
808 	xfs_ino_t		inum,
809 	struct xfs_buf		**agi_bpp,
810 	struct xfs_inode	**ipp)
811 {
812 	struct xfs_mount	*mp = sc->mp;
813 	struct xfs_trans	*tp = sc->tp;
814 	struct xfs_perag	*pag;
815 	int			error;
816 
817 	ASSERT(sc->tp != NULL);
818 
819 again:
820 	*agi_bpp = NULL;
821 	*ipp = NULL;
822 	error = 0;
823 
824 	if (xchk_should_terminate(sc, &error))
825 		return error;
826 
827 	/*
828 	 * Attach the AGI buffer to the scrub transaction to avoid deadlocks
829 	 * in the iget cache miss path.
830 	 */
831 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
832 	error = xfs_ialloc_read_agi(pag, tp, 0, agi_bpp);
833 	xfs_perag_put(pag);
834 	if (error)
835 		return error;
836 
837 	error = xfs_iget(mp, tp, inum, XFS_IGET_NORETRY | XCHK_IGET_FLAGS, 0,
838 			ipp);
839 	if (error == -EAGAIN) {
840 		/*
841 		 * The inode may be in core but temporarily unavailable and may
842 		 * require the AGI buffer before it can be returned.  Drop the
843 		 * AGI buffer and retry the lookup.
844 		 *
845 		 * Incore lookup will fail with EAGAIN on a cache hit if the
846 		 * inode is queued to the inactivation list.  The inactivation
847 		 * worker may remove the inode from the unlinked list and hence
848 		 * needs the AGI.
849 		 *
850 		 * Hence xchk_iget_agi() needs to drop the AGI lock on EAGAIN
851 		 * to allow inodegc to make progress and move the inode to
852 		 * IRECLAIMABLE state where xfs_iget will be able to return it
853 		 * again if it can lock the inode.
854 		 */
855 		xfs_trans_brelse(tp, *agi_bpp);
856 		delay(1);
857 		goto again;
858 	}
859 	if (error)
860 		return error;
861 
862 	/* We got the inode, so we can release the AGI. */
863 	ASSERT(*ipp != NULL);
864 	xfs_trans_brelse(tp, *agi_bpp);
865 	*agi_bpp = NULL;
866 	return 0;
867 }
868 
869 #ifdef CONFIG_XFS_QUOTA
870 /*
871  * Try to attach dquots to this inode if we think we might want to repair it.
872  * Callers must not hold any ILOCKs.  If the dquots are broken and cannot be
873  * attached, a quotacheck will be scheduled.
874  */
875 int
876 xchk_ino_dqattach(
877 	struct xfs_scrub	*sc)
878 {
879 	ASSERT(sc->tp != NULL);
880 	ASSERT(sc->ip != NULL);
881 
882 	if (!xchk_could_repair(sc))
883 		return 0;
884 
885 	return xrep_ino_dqattach(sc);
886 }
887 #endif
888 
889 /* Install an inode that we opened by handle for scrubbing. */
890 int
891 xchk_install_handle_inode(
892 	struct xfs_scrub	*sc,
893 	struct xfs_inode	*ip)
894 {
895 	if (VFS_I(ip)->i_generation != sc->sm->sm_gen) {
896 		xchk_irele(sc, ip);
897 		return -ENOENT;
898 	}
899 
900 	sc->ip = ip;
901 	return 0;
902 }
903 
904 /*
905  * Install an already-referenced inode for scrubbing.  Get our own reference to
906  * the inode to make disposal simpler.  The inode must not be in I_FREEING or
907  * I_WILL_FREE state!
908  */
909 int
910 xchk_install_live_inode(
911 	struct xfs_scrub	*sc,
912 	struct xfs_inode	*ip)
913 {
914 	if (!igrab(VFS_I(ip))) {
915 		xchk_ino_set_corrupt(sc, ip->i_ino);
916 		return -EFSCORRUPTED;
917 	}
918 
919 	sc->ip = ip;
920 	return 0;
921 }
922 
923 /*
924  * In preparation to scrub metadata structures that hang off of an inode,
925  * grab either the inode referenced in the scrub control structure or the
926  * inode passed in.  If the inumber does not reference an allocated inode
927  * record, the function returns ENOENT to end the scrub early.  The inode
928  * is not locked.
929  */
930 int
931 xchk_iget_for_scrubbing(
932 	struct xfs_scrub	*sc)
933 {
934 	struct xfs_imap		imap;
935 	struct xfs_mount	*mp = sc->mp;
936 	struct xfs_perag	*pag;
937 	struct xfs_buf		*agi_bp;
938 	struct xfs_inode	*ip_in = XFS_I(file_inode(sc->file));
939 	struct xfs_inode	*ip = NULL;
940 	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, sc->sm->sm_ino);
941 	int			error;
942 
943 	ASSERT(sc->tp == NULL);
944 
945 	/* We want to scan the inode we already had opened. */
946 	if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino)
947 		return xchk_install_live_inode(sc, ip_in);
948 
949 	/* Reject internal metadata files and obviously bad inode numbers. */
950 	if (xfs_internal_inum(mp, sc->sm->sm_ino))
951 		return -ENOENT;
952 	if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino))
953 		return -ENOENT;
954 
955 	/* Try a safe untrusted iget. */
956 	error = xchk_iget_safe(sc, sc->sm->sm_ino, &ip);
957 	if (!error)
958 		return xchk_install_handle_inode(sc, ip);
959 	if (error == -ENOENT)
960 		return error;
961 	if (error != -EINVAL)
962 		goto out_error;
963 
964 	/*
965 	 * EINVAL with IGET_UNTRUSTED probably means one of several things:
966 	 * userspace gave us an inode number that doesn't correspond to fs
967 	 * space; the inode btree lacks a record for this inode; or there is a
968 	 * record, and it says this inode is free.
969 	 *
970 	 * We want to look up this inode in the inobt to distinguish two
971 	 * scenarios: (1) the inobt says the inode is free, in which case
972 	 * there's nothing to do; and (2) the inobt says the inode is
973 	 * allocated, but loading it failed due to corruption.
974 	 *
975 	 * Allocate a transaction and grab the AGI to prevent inobt activity
976 	 * in this AG.  Retry the iget in case someone allocated a new inode
977 	 * after the first iget failed.
978 	 */
979 	error = xchk_trans_alloc(sc, 0);
980 	if (error)
981 		goto out_error;
982 
983 	error = xchk_iget_agi(sc, sc->sm->sm_ino, &agi_bp, &ip);
984 	if (error == 0) {
985 		/* Actually got the inode, so install it. */
986 		xchk_trans_cancel(sc);
987 		return xchk_install_handle_inode(sc, ip);
988 	}
989 	if (error == -ENOENT)
990 		goto out_gone;
991 	if (error != -EINVAL)
992 		goto out_cancel;
993 
994 	/* Ensure that we have protected against inode allocation/freeing. */
995 	if (agi_bp == NULL) {
996 		ASSERT(agi_bp != NULL);
997 		error = -ECANCELED;
998 		goto out_cancel;
999 	}
1000 
1001 	/*
1002 	 * Untrusted iget failed a second time.  Let's try an inobt lookup.
1003 	 * If the inobt thinks this the inode neither can exist inside the
1004 	 * filesystem nor is allocated, return ENOENT to signal that the check
1005 	 * can be skipped.
1006 	 *
1007 	 * If the lookup returns corruption, we'll mark this inode corrupt and
1008 	 * exit to userspace.  There's little chance of fixing anything until
1009 	 * the inobt is straightened out, but there's nothing we can do here.
1010 	 *
1011 	 * If the lookup encounters any other error, exit to userspace.
1012 	 *
1013 	 * If the lookup succeeds, something else must be very wrong in the fs
1014 	 * such that setting up the incore inode failed in some strange way.
1015 	 * Treat those as corruptions.
1016 	 */
1017 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino));
1018 	if (!pag) {
1019 		error = -EFSCORRUPTED;
1020 		goto out_cancel;
1021 	}
1022 
1023 	error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap,
1024 			XFS_IGET_UNTRUSTED);
1025 	xfs_perag_put(pag);
1026 	if (error == -EINVAL || error == -ENOENT)
1027 		goto out_gone;
1028 	if (!error)
1029 		error = -EFSCORRUPTED;
1030 
1031 out_cancel:
1032 	xchk_trans_cancel(sc);
1033 out_error:
1034 	trace_xchk_op_error(sc, agno, XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino),
1035 			error, __return_address);
1036 	return error;
1037 out_gone:
1038 	/* The file is gone, so there's nothing to check. */
1039 	xchk_trans_cancel(sc);
1040 	return -ENOENT;
1041 }
1042 
1043 /* Release an inode, possibly dropping it in the process. */
1044 void
1045 xchk_irele(
1046 	struct xfs_scrub	*sc,
1047 	struct xfs_inode	*ip)
1048 {
1049 	if (sc->tp) {
1050 		/*
1051 		 * If we are in a transaction, we /cannot/ drop the inode
1052 		 * ourselves, because the VFS will trigger writeback, which
1053 		 * can require a transaction.  Clear DONTCACHE to force the
1054 		 * inode to the LRU, where someone else can take care of
1055 		 * dropping it.
1056 		 *
1057 		 * Note that when we grabbed our reference to the inode, it
1058 		 * could have had an active ref and DONTCACHE set if a sysadmin
1059 		 * is trying to coerce a change in file access mode.  icache
1060 		 * hits do not clear DONTCACHE, so we must do it here.
1061 		 */
1062 		spin_lock(&VFS_I(ip)->i_lock);
1063 		VFS_I(ip)->i_state &= ~I_DONTCACHE;
1064 		spin_unlock(&VFS_I(ip)->i_lock);
1065 	}
1066 
1067 	xfs_irele(ip);
1068 }
1069 
1070 /*
1071  * Set us up to scrub metadata mapped by a file's fork.  Callers must not use
1072  * this to operate on user-accessible regular file data because the MMAPLOCK is
1073  * not taken.
1074  */
1075 int
1076 xchk_setup_inode_contents(
1077 	struct xfs_scrub	*sc,
1078 	unsigned int		resblks)
1079 {
1080 	int			error;
1081 
1082 	error = xchk_iget_for_scrubbing(sc);
1083 	if (error)
1084 		return error;
1085 
1086 	/* Lock the inode so the VFS cannot touch this file. */
1087 	xchk_ilock(sc, XFS_IOLOCK_EXCL);
1088 
1089 	error = xchk_trans_alloc(sc, resblks);
1090 	if (error)
1091 		goto out;
1092 
1093 	error = xchk_ino_dqattach(sc);
1094 	if (error)
1095 		goto out;
1096 
1097 	xchk_ilock(sc, XFS_ILOCK_EXCL);
1098 out:
1099 	/* scrub teardown will unlock and release the inode for us */
1100 	return error;
1101 }
1102 
1103 void
1104 xchk_ilock(
1105 	struct xfs_scrub	*sc,
1106 	unsigned int		ilock_flags)
1107 {
1108 	xfs_ilock(sc->ip, ilock_flags);
1109 	sc->ilock_flags |= ilock_flags;
1110 }
1111 
1112 bool
1113 xchk_ilock_nowait(
1114 	struct xfs_scrub	*sc,
1115 	unsigned int		ilock_flags)
1116 {
1117 	if (xfs_ilock_nowait(sc->ip, ilock_flags)) {
1118 		sc->ilock_flags |= ilock_flags;
1119 		return true;
1120 	}
1121 
1122 	return false;
1123 }
1124 
1125 void
1126 xchk_iunlock(
1127 	struct xfs_scrub	*sc,
1128 	unsigned int		ilock_flags)
1129 {
1130 	sc->ilock_flags &= ~ilock_flags;
1131 	xfs_iunlock(sc->ip, ilock_flags);
1132 }
1133 
1134 /*
1135  * Predicate that decides if we need to evaluate the cross-reference check.
1136  * If there was an error accessing the cross-reference btree, just delete
1137  * the cursor and skip the check.
1138  */
1139 bool
1140 xchk_should_check_xref(
1141 	struct xfs_scrub	*sc,
1142 	int			*error,
1143 	struct xfs_btree_cur	**curpp)
1144 {
1145 	/* No point in xref if we already know we're corrupt. */
1146 	if (xchk_skip_xref(sc->sm))
1147 		return false;
1148 
1149 	if (*error == 0)
1150 		return true;
1151 
1152 	if (curpp) {
1153 		/* If we've already given up on xref, just bail out. */
1154 		if (!*curpp)
1155 			return false;
1156 
1157 		/* xref error, delete cursor and bail out. */
1158 		xfs_btree_del_cursor(*curpp, XFS_BTREE_ERROR);
1159 		*curpp = NULL;
1160 	}
1161 
1162 	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL;
1163 	trace_xchk_xref_error(sc, *error, __return_address);
1164 
1165 	/*
1166 	 * Errors encountered during cross-referencing with another
1167 	 * data structure should not cause this scrubber to abort.
1168 	 */
1169 	*error = 0;
1170 	return false;
1171 }
1172 
1173 /* Run the structure verifiers on in-memory buffers to detect bad memory. */
1174 void
1175 xchk_buffer_recheck(
1176 	struct xfs_scrub	*sc,
1177 	struct xfs_buf		*bp)
1178 {
1179 	xfs_failaddr_t		fa;
1180 
1181 	if (bp->b_ops == NULL) {
1182 		xchk_block_set_corrupt(sc, bp);
1183 		return;
1184 	}
1185 	if (bp->b_ops->verify_struct == NULL) {
1186 		xchk_set_incomplete(sc);
1187 		return;
1188 	}
1189 	fa = bp->b_ops->verify_struct(bp);
1190 	if (!fa)
1191 		return;
1192 	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
1193 	trace_xchk_block_error(sc, xfs_buf_daddr(bp), fa);
1194 }
1195 
1196 static inline int
1197 xchk_metadata_inode_subtype(
1198 	struct xfs_scrub	*sc,
1199 	unsigned int		scrub_type)
1200 {
1201 	struct xfs_scrub_subord	*sub;
1202 	int			error;
1203 
1204 	sub = xchk_scrub_create_subord(sc, scrub_type);
1205 	error = sub->sc.ops->scrub(&sub->sc);
1206 	xchk_scrub_free_subord(sub);
1207 	return error;
1208 }
1209 
1210 /*
1211  * Scrub the attr/data forks of a metadata inode.  The metadata inode must be
1212  * pointed to by sc->ip and the ILOCK must be held.
1213  */
1214 int
1215 xchk_metadata_inode_forks(
1216 	struct xfs_scrub	*sc)
1217 {
1218 	bool			shared;
1219 	int			error;
1220 
1221 	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
1222 		return 0;
1223 
1224 	/* Check the inode record. */
1225 	error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE);
1226 	if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
1227 		return error;
1228 
1229 	/* Metadata inodes don't live on the rt device. */
1230 	if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) {
1231 		xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1232 		return 0;
1233 	}
1234 
1235 	/* They should never participate in reflink. */
1236 	if (xfs_is_reflink_inode(sc->ip)) {
1237 		xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1238 		return 0;
1239 	}
1240 
1241 	/* They also should never have extended attributes. */
1242 	if (xfs_inode_hasattr(sc->ip)) {
1243 		xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1244 		return 0;
1245 	}
1246 
1247 	/* Invoke the data fork scrubber. */
1248 	error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD);
1249 	if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
1250 		return error;
1251 
1252 	/* Look for incorrect shared blocks. */
1253 	if (xfs_has_reflink(sc->mp)) {
1254 		error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip,
1255 				&shared);
1256 		if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0,
1257 				&error))
1258 			return error;
1259 		if (shared)
1260 			xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1261 	}
1262 
1263 	return 0;
1264 }
1265 
1266 /*
1267  * Enable filesystem hooks (i.e. runtime code patching) before starting a scrub
1268  * operation.  Callers must not hold any locks that intersect with the CPU
1269  * hotplug lock (e.g. writeback locks) because code patching must halt the CPUs
1270  * to change kernel code.
1271  */
1272 void
1273 xchk_fsgates_enable(
1274 	struct xfs_scrub	*sc,
1275 	unsigned int		scrub_fsgates)
1276 {
1277 	ASSERT(!(scrub_fsgates & ~XCHK_FSGATES_ALL));
1278 	ASSERT(!(sc->flags & scrub_fsgates));
1279 
1280 	trace_xchk_fsgates_enable(sc, scrub_fsgates);
1281 
1282 	if (scrub_fsgates & XCHK_FSGATES_DRAIN)
1283 		xfs_drain_wait_enable();
1284 
1285 	if (scrub_fsgates & XCHK_FSGATES_QUOTA)
1286 		xfs_dqtrx_hook_enable();
1287 
1288 	if (scrub_fsgates & XCHK_FSGATES_DIRENTS)
1289 		xfs_dir_hook_enable();
1290 
1291 	if (scrub_fsgates & XCHK_FSGATES_RMAP)
1292 		xfs_rmap_hook_enable();
1293 
1294 	sc->flags |= scrub_fsgates;
1295 }
1296 
1297 /*
1298  * Decide if this is this a cached inode that's also allocated.  The caller
1299  * must hold a reference to an AG and the AGI buffer lock to prevent inodes
1300  * from being allocated or freed.
1301  *
1302  * Look up an inode by number in the given file system.  If the inode number
1303  * is invalid, return -EINVAL.  If the inode is not in cache, return -ENODATA.
1304  * If the inode is being reclaimed, return -ENODATA because we know the inode
1305  * cache cannot be updating the ondisk metadata.
1306  *
1307  * Otherwise, the incore inode is the one we want, and it is either live,
1308  * somewhere in the inactivation machinery, or reclaimable.  The inode is
1309  * allocated if i_mode is nonzero.  In all three cases, the cached inode will
1310  * be more up to date than the ondisk inode buffer, so we must use the incore
1311  * i_mode.
1312  */
1313 int
1314 xchk_inode_is_allocated(
1315 	struct xfs_scrub	*sc,
1316 	xfs_agino_t		agino,
1317 	bool			*inuse)
1318 {
1319 	struct xfs_mount	*mp = sc->mp;
1320 	struct xfs_perag	*pag = sc->sa.pag;
1321 	xfs_ino_t		ino;
1322 	struct xfs_inode	*ip;
1323 	int			error;
1324 
1325 	/* caller must hold perag reference */
1326 	if (pag == NULL) {
1327 		ASSERT(pag != NULL);
1328 		return -EINVAL;
1329 	}
1330 
1331 	/* caller must have AGI buffer */
1332 	if (sc->sa.agi_bp == NULL) {
1333 		ASSERT(sc->sa.agi_bp != NULL);
1334 		return -EINVAL;
1335 	}
1336 
1337 	/* reject inode numbers outside existing AGs */
1338 	ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino);
1339 	if (!xfs_verify_ino(mp, ino))
1340 		return -EINVAL;
1341 
1342 	error = -ENODATA;
1343 	rcu_read_lock();
1344 	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
1345 	if (!ip) {
1346 		/* cache miss */
1347 		goto out_rcu;
1348 	}
1349 
1350 	/*
1351 	 * If the inode number doesn't match, the incore inode got reused
1352 	 * during an RCU grace period and the radix tree hasn't been updated.
1353 	 * This isn't the inode we want.
1354 	 */
1355 	spin_lock(&ip->i_flags_lock);
1356 	if (ip->i_ino != ino)
1357 		goto out_skip;
1358 
1359 	trace_xchk_inode_is_allocated(ip);
1360 
1361 	/*
1362 	 * We have an incore inode that matches the inode we want, and the
1363 	 * caller holds the perag structure and the AGI buffer.  Let's check
1364 	 * our assumptions below:
1365 	 */
1366 
1367 #ifdef DEBUG
1368 	/*
1369 	 * (1) If the incore inode is live (i.e. referenced from the dcache),
1370 	 * it will not be INEW, nor will it be in the inactivation or reclaim
1371 	 * machinery.  The ondisk inode had better be allocated.  This is the
1372 	 * most trivial case.
1373 	 */
1374 	if (!(ip->i_flags & (XFS_NEED_INACTIVE | XFS_INEW | XFS_IRECLAIMABLE |
1375 			     XFS_INACTIVATING))) {
1376 		/* live inode */
1377 		ASSERT(VFS_I(ip)->i_mode != 0);
1378 	}
1379 
1380 	/*
1381 	 * If the incore inode is INEW, there are several possibilities:
1382 	 *
1383 	 * (2) For a file that is being created, note that we allocate the
1384 	 * ondisk inode before allocating, initializing, and adding the incore
1385 	 * inode to the radix tree.
1386 	 *
1387 	 * (3) If the incore inode is being recycled, the inode has to be
1388 	 * allocated because we don't allow freed inodes to be recycled.
1389 	 * Recycling doesn't touch i_mode.
1390 	 */
1391 	if (ip->i_flags & XFS_INEW) {
1392 		/* created on disk already or recycling */
1393 		ASSERT(VFS_I(ip)->i_mode != 0);
1394 	}
1395 
1396 	/*
1397 	 * (4) If the inode is queued for inactivation (NEED_INACTIVE) but
1398 	 * inactivation has not started (!INACTIVATING), it is still allocated.
1399 	 */
1400 	if ((ip->i_flags & XFS_NEED_INACTIVE) &&
1401 	    !(ip->i_flags & XFS_INACTIVATING)) {
1402 		/* definitely before difree */
1403 		ASSERT(VFS_I(ip)->i_mode != 0);
1404 	}
1405 #endif
1406 
1407 	/*
1408 	 * If the incore inode is undergoing inactivation (INACTIVATING), there
1409 	 * are two possibilities:
1410 	 *
1411 	 * (5) It is before the point where it would get freed ondisk, in which
1412 	 * case i_mode is still nonzero.
1413 	 *
1414 	 * (6) It has already been freed, in which case i_mode is zero.
1415 	 *
1416 	 * We don't take the ILOCK here, but difree and dialloc update the AGI,
1417 	 * and we've taken the AGI buffer lock, which prevents that from
1418 	 * happening.
1419 	 */
1420 
1421 	/*
1422 	 * (7) Inodes undergoing inactivation (INACTIVATING) or queued for
1423 	 * reclaim (IRECLAIMABLE) could be allocated or free.  i_mode still
1424 	 * reflects the ondisk state.
1425 	 */
1426 
1427 	/*
1428 	 * (8) If the inode is in IFLUSHING, it's safe to query i_mode because
1429 	 * the flush code uses i_mode to format the ondisk inode.
1430 	 */
1431 
1432 	/*
1433 	 * (9) If the inode is in IRECLAIM and was reachable via the radix
1434 	 * tree, it still has the same i_mode as it did before it entered
1435 	 * reclaim.  The inode object is still alive because we hold the RCU
1436 	 * read lock.
1437 	 */
1438 
1439 	*inuse = VFS_I(ip)->i_mode != 0;
1440 	error = 0;
1441 
1442 out_skip:
1443 	spin_unlock(&ip->i_flags_lock);
1444 out_rcu:
1445 	rcu_read_unlock();
1446 	return error;
1447 }
1448