xref: /linux/fs/xfs/scrub/agheader_repair.c (revision 3ff78451b8e446e9a548b98a0d4dd8d24dc5780b)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_btree.h"
13 #include "xfs_log_format.h"
14 #include "xfs_trans.h"
15 #include "xfs_sb.h"
16 #include "xfs_alloc.h"
17 #include "xfs_alloc_btree.h"
18 #include "xfs_ialloc.h"
19 #include "xfs_ialloc_btree.h"
20 #include "xfs_rmap.h"
21 #include "xfs_rmap_btree.h"
22 #include "xfs_refcount_btree.h"
23 #include "xfs_ag.h"
24 #include "xfs_inode.h"
25 #include "xfs_iunlink_item.h"
26 #include "scrub/scrub.h"
27 #include "scrub/common.h"
28 #include "scrub/trace.h"
29 #include "scrub/repair.h"
30 #include "scrub/bitmap.h"
31 #include "scrub/agb_bitmap.h"
32 #include "scrub/agino_bitmap.h"
33 #include "scrub/reap.h"
34 #include "scrub/xfile.h"
35 #include "scrub/xfarray.h"
36 
37 /* Superblock */
38 
39 /* Repair the superblock. */
40 int
41 xrep_superblock(
42 	struct xfs_scrub	*sc)
43 {
44 	struct xfs_mount	*mp = sc->mp;
45 	struct xfs_buf		*bp;
46 	xfs_agnumber_t		agno;
47 	int			error;
48 
49 	/* Don't try to repair AG 0's sb; let xfs_repair deal with it. */
50 	agno = sc->sm->sm_agno;
51 	if (agno == 0)
52 		return -EOPNOTSUPP;
53 
54 	error = xfs_sb_get_secondary(mp, sc->tp, agno, &bp);
55 	if (error)
56 		return error;
57 
58 	/* Last chance to abort before we start committing fixes. */
59 	if (xchk_should_terminate(sc, &error))
60 		return error;
61 
62 	/* Copy AG 0's superblock to this one. */
63 	xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
64 	xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
65 
66 	/*
67 	 * Don't write out a secondary super with NEEDSREPAIR or log incompat
68 	 * features set, since both are ignored when set on a secondary.
69 	 */
70 	if (xfs_has_crc(mp)) {
71 		struct xfs_dsb		*sb = bp->b_addr;
72 
73 		sb->sb_features_incompat &=
74 				~cpu_to_be32(XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR);
75 		sb->sb_features_log_incompat = 0;
76 	}
77 
78 	/* Write this to disk. */
79 	xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF);
80 	xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1);
81 	return 0;
82 }
83 
84 /* AGF */
85 
86 struct xrep_agf_allocbt {
87 	struct xfs_scrub	*sc;
88 	xfs_agblock_t		freeblks;
89 	xfs_agblock_t		longest;
90 };
91 
92 /* Record free space shape information. */
93 STATIC int
94 xrep_agf_walk_allocbt(
95 	struct xfs_btree_cur		*cur,
96 	const struct xfs_alloc_rec_incore *rec,
97 	void				*priv)
98 {
99 	struct xrep_agf_allocbt		*raa = priv;
100 	int				error = 0;
101 
102 	if (xchk_should_terminate(raa->sc, &error))
103 		return error;
104 
105 	raa->freeblks += rec->ar_blockcount;
106 	if (rec->ar_blockcount > raa->longest)
107 		raa->longest = rec->ar_blockcount;
108 	return error;
109 }
110 
111 /* Does this AGFL block look sane? */
112 STATIC int
113 xrep_agf_check_agfl_block(
114 	struct xfs_mount	*mp,
115 	xfs_agblock_t		agbno,
116 	void			*priv)
117 {
118 	struct xfs_scrub	*sc = priv;
119 
120 	if (!xfs_verify_agbno(sc->sa.pag, agbno))
121 		return -EFSCORRUPTED;
122 	return 0;
123 }
124 
125 /*
126  * Offset within the xrep_find_ag_btree array for each btree type.  Avoid the
127  * XFS_BTNUM_ names here to avoid creating a sparse array.
128  */
129 enum {
130 	XREP_AGF_BNOBT = 0,
131 	XREP_AGF_CNTBT,
132 	XREP_AGF_RMAPBT,
133 	XREP_AGF_REFCOUNTBT,
134 	XREP_AGF_END,
135 	XREP_AGF_MAX
136 };
137 
138 /* Check a btree root candidate. */
139 static inline bool
140 xrep_check_btree_root(
141 	struct xfs_scrub		*sc,
142 	struct xrep_find_ag_btree	*fab)
143 {
144 	return xfs_verify_agbno(sc->sa.pag, fab->root) &&
145 	       fab->height <= fab->maxlevels;
146 }
147 
148 /*
149  * Given the btree roots described by *fab, find the roots, check them for
150  * sanity, and pass the root data back out via *fab.
151  *
152  * This is /also/ a chicken and egg problem because we have to use the rmapbt
153  * (rooted in the AGF) to find the btrees rooted in the AGF.  We also have no
154  * idea if the btrees make any sense.  If we hit obvious corruptions in those
155  * btrees we'll bail out.
156  */
157 STATIC int
158 xrep_agf_find_btrees(
159 	struct xfs_scrub		*sc,
160 	struct xfs_buf			*agf_bp,
161 	struct xrep_find_ag_btree	*fab,
162 	struct xfs_buf			*agfl_bp)
163 {
164 	struct xfs_agf			*old_agf = agf_bp->b_addr;
165 	int				error;
166 
167 	/* Go find the root data. */
168 	error = xrep_find_ag_btree_roots(sc, agf_bp, fab, agfl_bp);
169 	if (error)
170 		return error;
171 
172 	/* We must find the bnobt, cntbt, and rmapbt roots. */
173 	if (!xrep_check_btree_root(sc, &fab[XREP_AGF_BNOBT]) ||
174 	    !xrep_check_btree_root(sc, &fab[XREP_AGF_CNTBT]) ||
175 	    !xrep_check_btree_root(sc, &fab[XREP_AGF_RMAPBT]))
176 		return -EFSCORRUPTED;
177 
178 	/*
179 	 * We relied on the rmapbt to reconstruct the AGF.  If we get a
180 	 * different root then something's seriously wrong.
181 	 */
182 	if (fab[XREP_AGF_RMAPBT].root != be32_to_cpu(old_agf->agf_rmap_root))
183 		return -EFSCORRUPTED;
184 
185 	/* We must find the refcountbt root if that feature is enabled. */
186 	if (xfs_has_reflink(sc->mp) &&
187 	    !xrep_check_btree_root(sc, &fab[XREP_AGF_REFCOUNTBT]))
188 		return -EFSCORRUPTED;
189 
190 	return 0;
191 }
192 
193 /*
194  * Reinitialize the AGF header, making an in-core copy of the old contents so
195  * that we know which in-core state needs to be reinitialized.
196  */
197 STATIC void
198 xrep_agf_init_header(
199 	struct xfs_scrub	*sc,
200 	struct xfs_buf		*agf_bp,
201 	struct xfs_agf		*old_agf)
202 {
203 	struct xfs_mount	*mp = sc->mp;
204 	struct xfs_perag	*pag = sc->sa.pag;
205 	struct xfs_agf		*agf = agf_bp->b_addr;
206 
207 	memcpy(old_agf, agf, sizeof(*old_agf));
208 	memset(agf, 0, BBTOB(agf_bp->b_length));
209 	agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
210 	agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
211 	agf->agf_seqno = cpu_to_be32(pag->pag_agno);
212 	agf->agf_length = cpu_to_be32(pag->block_count);
213 	agf->agf_flfirst = old_agf->agf_flfirst;
214 	agf->agf_fllast = old_agf->agf_fllast;
215 	agf->agf_flcount = old_agf->agf_flcount;
216 	if (xfs_has_crc(mp))
217 		uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid);
218 
219 	/* Mark the incore AGF data stale until we're done fixing things. */
220 	ASSERT(xfs_perag_initialised_agf(pag));
221 	clear_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
222 }
223 
224 /* Set btree root information in an AGF. */
225 STATIC void
226 xrep_agf_set_roots(
227 	struct xfs_scrub		*sc,
228 	struct xfs_agf			*agf,
229 	struct xrep_find_ag_btree	*fab)
230 {
231 	agf->agf_bno_root = cpu_to_be32(fab[XREP_AGF_BNOBT].root);
232 	agf->agf_bno_level = cpu_to_be32(fab[XREP_AGF_BNOBT].height);
233 
234 	agf->agf_cnt_root = cpu_to_be32(fab[XREP_AGF_CNTBT].root);
235 	agf->agf_cnt_level = cpu_to_be32(fab[XREP_AGF_CNTBT].height);
236 
237 	agf->agf_rmap_root = cpu_to_be32(fab[XREP_AGF_RMAPBT].root);
238 	agf->agf_rmap_level = cpu_to_be32(fab[XREP_AGF_RMAPBT].height);
239 
240 	if (xfs_has_reflink(sc->mp)) {
241 		agf->agf_refcount_root =
242 				cpu_to_be32(fab[XREP_AGF_REFCOUNTBT].root);
243 		agf->agf_refcount_level =
244 				cpu_to_be32(fab[XREP_AGF_REFCOUNTBT].height);
245 	}
246 }
247 
248 /* Update all AGF fields which derive from btree contents. */
249 STATIC int
250 xrep_agf_calc_from_btrees(
251 	struct xfs_scrub	*sc,
252 	struct xfs_buf		*agf_bp)
253 {
254 	struct xrep_agf_allocbt	raa = { .sc = sc };
255 	struct xfs_btree_cur	*cur = NULL;
256 	struct xfs_agf		*agf = agf_bp->b_addr;
257 	struct xfs_mount	*mp = sc->mp;
258 	xfs_agblock_t		btreeblks;
259 	xfs_agblock_t		blocks;
260 	int			error;
261 
262 	/* Update the AGF counters from the bnobt. */
263 	cur = xfs_bnobt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
264 	error = xfs_alloc_query_all(cur, xrep_agf_walk_allocbt, &raa);
265 	if (error)
266 		goto err;
267 	error = xfs_btree_count_blocks(cur, &blocks);
268 	if (error)
269 		goto err;
270 	xfs_btree_del_cursor(cur, error);
271 	btreeblks = blocks - 1;
272 	agf->agf_freeblks = cpu_to_be32(raa.freeblks);
273 	agf->agf_longest = cpu_to_be32(raa.longest);
274 
275 	/* Update the AGF counters from the cntbt. */
276 	cur = xfs_cntbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
277 	error = xfs_btree_count_blocks(cur, &blocks);
278 	if (error)
279 		goto err;
280 	xfs_btree_del_cursor(cur, error);
281 	btreeblks += blocks - 1;
282 
283 	/* Update the AGF counters from the rmapbt. */
284 	cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
285 	error = xfs_btree_count_blocks(cur, &blocks);
286 	if (error)
287 		goto err;
288 	xfs_btree_del_cursor(cur, error);
289 	agf->agf_rmap_blocks = cpu_to_be32(blocks);
290 	btreeblks += blocks - 1;
291 
292 	agf->agf_btreeblks = cpu_to_be32(btreeblks);
293 
294 	/* Update the AGF counters from the refcountbt. */
295 	if (xfs_has_reflink(mp)) {
296 		cur = xfs_refcountbt_init_cursor(mp, sc->tp, agf_bp,
297 				sc->sa.pag);
298 		error = xfs_btree_count_blocks(cur, &blocks);
299 		if (error)
300 			goto err;
301 		xfs_btree_del_cursor(cur, error);
302 		agf->agf_refcount_blocks = cpu_to_be32(blocks);
303 	}
304 
305 	return 0;
306 err:
307 	xfs_btree_del_cursor(cur, error);
308 	return error;
309 }
310 
311 /* Commit the new AGF and reinitialize the incore state. */
312 STATIC int
313 xrep_agf_commit_new(
314 	struct xfs_scrub	*sc,
315 	struct xfs_buf		*agf_bp)
316 {
317 	struct xfs_perag	*pag;
318 	struct xfs_agf		*agf = agf_bp->b_addr;
319 
320 	/* Trigger fdblocks recalculation */
321 	xfs_force_summary_recalc(sc->mp);
322 
323 	/* Write this to disk. */
324 	xfs_trans_buf_set_type(sc->tp, agf_bp, XFS_BLFT_AGF_BUF);
325 	xfs_trans_log_buf(sc->tp, agf_bp, 0, BBTOB(agf_bp->b_length) - 1);
326 
327 	/* Now reinitialize the in-core counters we changed. */
328 	pag = sc->sa.pag;
329 	pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
330 	pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
331 	pag->pagf_longest = be32_to_cpu(agf->agf_longest);
332 	pag->pagf_bno_level = be32_to_cpu(agf->agf_bno_level);
333 	pag->pagf_cnt_level = be32_to_cpu(agf->agf_cnt_level);
334 	pag->pagf_rmap_level = be32_to_cpu(agf->agf_rmap_level);
335 	pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
336 	set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
337 
338 	return xrep_roll_ag_trans(sc);
339 }
340 
341 /* Repair the AGF. v5 filesystems only. */
342 int
343 xrep_agf(
344 	struct xfs_scrub		*sc)
345 {
346 	struct xrep_find_ag_btree	fab[XREP_AGF_MAX] = {
347 		[XREP_AGF_BNOBT] = {
348 			.rmap_owner = XFS_RMAP_OWN_AG,
349 			.buf_ops = &xfs_bnobt_buf_ops,
350 			.maxlevels = sc->mp->m_alloc_maxlevels,
351 		},
352 		[XREP_AGF_CNTBT] = {
353 			.rmap_owner = XFS_RMAP_OWN_AG,
354 			.buf_ops = &xfs_cntbt_buf_ops,
355 			.maxlevels = sc->mp->m_alloc_maxlevels,
356 		},
357 		[XREP_AGF_RMAPBT] = {
358 			.rmap_owner = XFS_RMAP_OWN_AG,
359 			.buf_ops = &xfs_rmapbt_buf_ops,
360 			.maxlevels = sc->mp->m_rmap_maxlevels,
361 		},
362 		[XREP_AGF_REFCOUNTBT] = {
363 			.rmap_owner = XFS_RMAP_OWN_REFC,
364 			.buf_ops = &xfs_refcountbt_buf_ops,
365 			.maxlevels = sc->mp->m_refc_maxlevels,
366 		},
367 		[XREP_AGF_END] = {
368 			.buf_ops = NULL,
369 		},
370 	};
371 	struct xfs_agf			old_agf;
372 	struct xfs_mount		*mp = sc->mp;
373 	struct xfs_buf			*agf_bp;
374 	struct xfs_buf			*agfl_bp;
375 	struct xfs_agf			*agf;
376 	int				error;
377 
378 	/* We require the rmapbt to rebuild anything. */
379 	if (!xfs_has_rmapbt(mp))
380 		return -EOPNOTSUPP;
381 
382 	/*
383 	 * Make sure we have the AGF buffer, as scrub might have decided it
384 	 * was corrupt after xfs_alloc_read_agf failed with -EFSCORRUPTED.
385 	 */
386 	error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
387 			XFS_AG_DADDR(mp, sc->sa.pag->pag_agno,
388 						XFS_AGF_DADDR(mp)),
389 			XFS_FSS_TO_BB(mp, 1), 0, &agf_bp, NULL);
390 	if (error)
391 		return error;
392 	agf_bp->b_ops = &xfs_agf_buf_ops;
393 	agf = agf_bp->b_addr;
394 
395 	/*
396 	 * Load the AGFL so that we can screen out OWN_AG blocks that are on
397 	 * the AGFL now; these blocks might have once been part of the
398 	 * bno/cnt/rmap btrees but are not now.  This is a chicken and egg
399 	 * problem: the AGF is corrupt, so we have to trust the AGFL contents
400 	 * because we can't do any serious cross-referencing with any of the
401 	 * btrees rooted in the AGF.  If the AGFL contents are obviously bad
402 	 * then we'll bail out.
403 	 */
404 	error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
405 	if (error)
406 		return error;
407 
408 	/*
409 	 * Spot-check the AGFL blocks; if they're obviously corrupt then
410 	 * there's nothing we can do but bail out.
411 	 */
412 	error = xfs_agfl_walk(sc->mp, agf_bp->b_addr, agfl_bp,
413 			xrep_agf_check_agfl_block, sc);
414 	if (error)
415 		return error;
416 
417 	/*
418 	 * Find the AGF btree roots.  This is also a chicken-and-egg situation;
419 	 * see the function for more details.
420 	 */
421 	error = xrep_agf_find_btrees(sc, agf_bp, fab, agfl_bp);
422 	if (error)
423 		return error;
424 
425 	/* Last chance to abort before we start committing fixes. */
426 	if (xchk_should_terminate(sc, &error))
427 		return error;
428 
429 	/* Start rewriting the header and implant the btrees we found. */
430 	xrep_agf_init_header(sc, agf_bp, &old_agf);
431 	xrep_agf_set_roots(sc, agf, fab);
432 	error = xrep_agf_calc_from_btrees(sc, agf_bp);
433 	if (error)
434 		goto out_revert;
435 
436 	/* Commit the changes and reinitialize incore state. */
437 	return xrep_agf_commit_new(sc, agf_bp);
438 
439 out_revert:
440 	/* Mark the incore AGF state stale and revert the AGF. */
441 	clear_bit(XFS_AGSTATE_AGF_INIT, &sc->sa.pag->pag_opstate);
442 	memcpy(agf, &old_agf, sizeof(old_agf));
443 	return error;
444 }
445 
446 /* AGFL */
447 
448 struct xrep_agfl {
449 	/* Bitmap of alleged AGFL blocks that we're not going to add. */
450 	struct xagb_bitmap	crossed;
451 
452 	/* Bitmap of other OWN_AG metadata blocks. */
453 	struct xagb_bitmap	agmetablocks;
454 
455 	/* Bitmap of free space. */
456 	struct xagb_bitmap	*freesp;
457 
458 	/* rmapbt cursor for finding crosslinked blocks */
459 	struct xfs_btree_cur	*rmap_cur;
460 
461 	struct xfs_scrub	*sc;
462 };
463 
464 /* Record all OWN_AG (free space btree) information from the rmap data. */
465 STATIC int
466 xrep_agfl_walk_rmap(
467 	struct xfs_btree_cur	*cur,
468 	const struct xfs_rmap_irec *rec,
469 	void			*priv)
470 {
471 	struct xrep_agfl	*ra = priv;
472 	int			error = 0;
473 
474 	if (xchk_should_terminate(ra->sc, &error))
475 		return error;
476 
477 	/* Record all the OWN_AG blocks. */
478 	if (rec->rm_owner == XFS_RMAP_OWN_AG) {
479 		error = xagb_bitmap_set(ra->freesp, rec->rm_startblock,
480 				rec->rm_blockcount);
481 		if (error)
482 			return error;
483 	}
484 
485 	return xagb_bitmap_set_btcur_path(&ra->agmetablocks, cur);
486 }
487 
488 /* Strike out the blocks that are cross-linked according to the rmapbt. */
489 STATIC int
490 xrep_agfl_check_extent(
491 	uint32_t		agbno,
492 	uint32_t		len,
493 	void			*priv)
494 {
495 	struct xrep_agfl	*ra = priv;
496 	xfs_agblock_t		last_agbno = agbno + len - 1;
497 	int			error;
498 
499 	while (agbno <= last_agbno) {
500 		bool		other_owners;
501 
502 		error = xfs_rmap_has_other_keys(ra->rmap_cur, agbno, 1,
503 				&XFS_RMAP_OINFO_AG, &other_owners);
504 		if (error)
505 			return error;
506 
507 		if (other_owners) {
508 			error = xagb_bitmap_set(&ra->crossed, agbno, 1);
509 			if (error)
510 				return error;
511 		}
512 
513 		if (xchk_should_terminate(ra->sc, &error))
514 			return error;
515 		agbno++;
516 	}
517 
518 	return 0;
519 }
520 
521 /*
522  * Map out all the non-AGFL OWN_AG space in this AG so that we can deduce
523  * which blocks belong to the AGFL.
524  *
525  * Compute the set of old AGFL blocks by subtracting from the list of OWN_AG
526  * blocks the list of blocks owned by all other OWN_AG metadata (bnobt, cntbt,
527  * rmapbt).  These are the old AGFL blocks, so return that list and the number
528  * of blocks we're actually going to put back on the AGFL.
529  */
530 STATIC int
531 xrep_agfl_collect_blocks(
532 	struct xfs_scrub	*sc,
533 	struct xfs_buf		*agf_bp,
534 	struct xagb_bitmap	*agfl_extents,
535 	xfs_agblock_t		*flcount)
536 {
537 	struct xrep_agfl	ra;
538 	struct xfs_mount	*mp = sc->mp;
539 	struct xfs_btree_cur	*cur;
540 	int			error;
541 
542 	ra.sc = sc;
543 	ra.freesp = agfl_extents;
544 	xagb_bitmap_init(&ra.agmetablocks);
545 	xagb_bitmap_init(&ra.crossed);
546 
547 	/* Find all space used by the free space btrees & rmapbt. */
548 	cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
549 	error = xfs_rmap_query_all(cur, xrep_agfl_walk_rmap, &ra);
550 	xfs_btree_del_cursor(cur, error);
551 	if (error)
552 		goto out_bmp;
553 
554 	/* Find all blocks currently being used by the bnobt. */
555 	cur = xfs_bnobt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
556 	error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur);
557 	xfs_btree_del_cursor(cur, error);
558 	if (error)
559 		goto out_bmp;
560 
561 	/* Find all blocks currently being used by the cntbt. */
562 	cur = xfs_cntbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
563 	error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur);
564 	xfs_btree_del_cursor(cur, error);
565 	if (error)
566 		goto out_bmp;
567 
568 	/*
569 	 * Drop the freesp meta blocks that are in use by btrees.
570 	 * The remaining blocks /should/ be AGFL blocks.
571 	 */
572 	error = xagb_bitmap_disunion(agfl_extents, &ra.agmetablocks);
573 	if (error)
574 		goto out_bmp;
575 
576 	/* Strike out the blocks that are cross-linked. */
577 	ra.rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
578 	error = xagb_bitmap_walk(agfl_extents, xrep_agfl_check_extent, &ra);
579 	xfs_btree_del_cursor(ra.rmap_cur, error);
580 	if (error)
581 		goto out_bmp;
582 	error = xagb_bitmap_disunion(agfl_extents, &ra.crossed);
583 	if (error)
584 		goto out_bmp;
585 
586 	/*
587 	 * Calculate the new AGFL size.  If we found more blocks than fit in
588 	 * the AGFL we'll free them later.
589 	 */
590 	*flcount = min_t(uint64_t, xagb_bitmap_hweight(agfl_extents),
591 			 xfs_agfl_size(mp));
592 
593 out_bmp:
594 	xagb_bitmap_destroy(&ra.crossed);
595 	xagb_bitmap_destroy(&ra.agmetablocks);
596 	return error;
597 }
598 
599 /* Update the AGF and reset the in-core state. */
600 STATIC void
601 xrep_agfl_update_agf(
602 	struct xfs_scrub	*sc,
603 	struct xfs_buf		*agf_bp,
604 	xfs_agblock_t		flcount)
605 {
606 	struct xfs_agf		*agf = agf_bp->b_addr;
607 
608 	ASSERT(flcount <= xfs_agfl_size(sc->mp));
609 
610 	/* Trigger fdblocks recalculation */
611 	xfs_force_summary_recalc(sc->mp);
612 
613 	/* Update the AGF counters. */
614 	if (xfs_perag_initialised_agf(sc->sa.pag)) {
615 		sc->sa.pag->pagf_flcount = flcount;
616 		clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET,
617 				&sc->sa.pag->pag_opstate);
618 	}
619 	agf->agf_flfirst = cpu_to_be32(0);
620 	agf->agf_flcount = cpu_to_be32(flcount);
621 	if (flcount)
622 		agf->agf_fllast = cpu_to_be32(flcount - 1);
623 	else
624 		agf->agf_fllast = cpu_to_be32(xfs_agfl_size(sc->mp) - 1);
625 
626 	xfs_alloc_log_agf(sc->tp, agf_bp,
627 			XFS_AGF_FLFIRST | XFS_AGF_FLLAST | XFS_AGF_FLCOUNT);
628 }
629 
630 struct xrep_agfl_fill {
631 	struct xagb_bitmap	used_extents;
632 	struct xfs_scrub	*sc;
633 	__be32			*agfl_bno;
634 	xfs_agblock_t		flcount;
635 	unsigned int		fl_off;
636 };
637 
638 /* Fill the AGFL with whatever blocks are in this extent. */
639 static int
640 xrep_agfl_fill(
641 	uint32_t		start,
642 	uint32_t		len,
643 	void			*priv)
644 {
645 	struct xrep_agfl_fill	*af = priv;
646 	struct xfs_scrub	*sc = af->sc;
647 	xfs_agblock_t		agbno = start;
648 	int			error;
649 
650 	trace_xrep_agfl_insert(sc->sa.pag, agbno, len);
651 
652 	while (agbno < start + len && af->fl_off < af->flcount)
653 		af->agfl_bno[af->fl_off++] = cpu_to_be32(agbno++);
654 
655 	error = xagb_bitmap_set(&af->used_extents, start, agbno - 1);
656 	if (error)
657 		return error;
658 
659 	if (af->fl_off == af->flcount)
660 		return -ECANCELED;
661 
662 	return 0;
663 }
664 
665 /* Write out a totally new AGFL. */
666 STATIC int
667 xrep_agfl_init_header(
668 	struct xfs_scrub	*sc,
669 	struct xfs_buf		*agfl_bp,
670 	struct xagb_bitmap	*agfl_extents,
671 	xfs_agblock_t		flcount)
672 {
673 	struct xrep_agfl_fill	af = {
674 		.sc		= sc,
675 		.flcount	= flcount,
676 	};
677 	struct xfs_mount	*mp = sc->mp;
678 	struct xfs_agfl		*agfl;
679 	int			error;
680 
681 	ASSERT(flcount <= xfs_agfl_size(mp));
682 
683 	/*
684 	 * Start rewriting the header by setting the bno[] array to
685 	 * NULLAGBLOCK, then setting AGFL header fields.
686 	 */
687 	agfl = XFS_BUF_TO_AGFL(agfl_bp);
688 	memset(agfl, 0xFF, BBTOB(agfl_bp->b_length));
689 	agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
690 	agfl->agfl_seqno = cpu_to_be32(sc->sa.pag->pag_agno);
691 	uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid);
692 
693 	/*
694 	 * Fill the AGFL with the remaining blocks.  If agfl_extents has more
695 	 * blocks than fit in the AGFL, they will be freed in a subsequent
696 	 * step.
697 	 */
698 	xagb_bitmap_init(&af.used_extents);
699 	af.agfl_bno = xfs_buf_to_agfl_bno(agfl_bp),
700 	xagb_bitmap_walk(agfl_extents, xrep_agfl_fill, &af);
701 	error = xagb_bitmap_disunion(agfl_extents, &af.used_extents);
702 	if (error)
703 		return error;
704 
705 	/* Write new AGFL to disk. */
706 	xfs_trans_buf_set_type(sc->tp, agfl_bp, XFS_BLFT_AGFL_BUF);
707 	xfs_trans_log_buf(sc->tp, agfl_bp, 0, BBTOB(agfl_bp->b_length) - 1);
708 	xagb_bitmap_destroy(&af.used_extents);
709 	return 0;
710 }
711 
712 /* Repair the AGFL. */
713 int
714 xrep_agfl(
715 	struct xfs_scrub	*sc)
716 {
717 	struct xagb_bitmap	agfl_extents;
718 	struct xfs_mount	*mp = sc->mp;
719 	struct xfs_buf		*agf_bp;
720 	struct xfs_buf		*agfl_bp;
721 	xfs_agblock_t		flcount;
722 	int			error;
723 
724 	/* We require the rmapbt to rebuild anything. */
725 	if (!xfs_has_rmapbt(mp))
726 		return -EOPNOTSUPP;
727 
728 	xagb_bitmap_init(&agfl_extents);
729 
730 	/*
731 	 * Read the AGF so that we can query the rmapbt.  We hope that there's
732 	 * nothing wrong with the AGF, but all the AG header repair functions
733 	 * have this chicken-and-egg problem.
734 	 */
735 	error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp);
736 	if (error)
737 		return error;
738 
739 	/*
740 	 * Make sure we have the AGFL buffer, as scrub might have decided it
741 	 * was corrupt after xfs_alloc_read_agfl failed with -EFSCORRUPTED.
742 	 */
743 	error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
744 			XFS_AG_DADDR(mp, sc->sa.pag->pag_agno,
745 						XFS_AGFL_DADDR(mp)),
746 			XFS_FSS_TO_BB(mp, 1), 0, &agfl_bp, NULL);
747 	if (error)
748 		return error;
749 	agfl_bp->b_ops = &xfs_agfl_buf_ops;
750 
751 	/* Gather all the extents we're going to put on the new AGFL. */
752 	error = xrep_agfl_collect_blocks(sc, agf_bp, &agfl_extents, &flcount);
753 	if (error)
754 		goto err;
755 
756 	/* Last chance to abort before we start committing fixes. */
757 	if (xchk_should_terminate(sc, &error))
758 		goto err;
759 
760 	/*
761 	 * Update AGF and AGFL.  We reset the global free block counter when
762 	 * we adjust the AGF flcount (which can fail) so avoid updating any
763 	 * buffers until we know that part works.
764 	 */
765 	xrep_agfl_update_agf(sc, agf_bp, flcount);
766 	error = xrep_agfl_init_header(sc, agfl_bp, &agfl_extents, flcount);
767 	if (error)
768 		goto err;
769 
770 	/*
771 	 * Ok, the AGFL should be ready to go now.  Roll the transaction to
772 	 * make the new AGFL permanent before we start using it to return
773 	 * freespace overflow to the freespace btrees.
774 	 */
775 	sc->sa.agf_bp = agf_bp;
776 	error = xrep_roll_ag_trans(sc);
777 	if (error)
778 		goto err;
779 
780 	/* Dump any AGFL overflow. */
781 	error = xrep_reap_agblocks(sc, &agfl_extents, &XFS_RMAP_OINFO_AG,
782 			XFS_AG_RESV_AGFL);
783 	if (error)
784 		goto err;
785 
786 err:
787 	xagb_bitmap_destroy(&agfl_extents);
788 	return error;
789 }
790 
791 /* AGI */
792 
793 /*
794  * Offset within the xrep_find_ag_btree array for each btree type.  Avoid the
795  * XFS_BTNUM_ names here to avoid creating a sparse array.
796  */
797 enum {
798 	XREP_AGI_INOBT = 0,
799 	XREP_AGI_FINOBT,
800 	XREP_AGI_END,
801 	XREP_AGI_MAX
802 };
803 
804 #define XREP_AGI_LOOKUP_BATCH		32
805 
806 struct xrep_agi {
807 	struct xfs_scrub		*sc;
808 
809 	/* AGI buffer, tracked separately */
810 	struct xfs_buf			*agi_bp;
811 
812 	/* context for finding btree roots */
813 	struct xrep_find_ag_btree	fab[XREP_AGI_MAX];
814 
815 	/* old AGI contents in case we have to revert */
816 	struct xfs_agi			old_agi;
817 
818 	/* bitmap of which inodes are unlinked */
819 	struct xagino_bitmap		iunlink_bmp;
820 
821 	/* heads of the unlinked inode bucket lists */
822 	xfs_agino_t			iunlink_heads[XFS_AGI_UNLINKED_BUCKETS];
823 
824 	/* scratchpad for batched lookups of the radix tree */
825 	struct xfs_inode		*lookup_batch[XREP_AGI_LOOKUP_BATCH];
826 
827 	/* Map of ino -> next_ino for unlinked inode processing. */
828 	struct xfarray			*iunlink_next;
829 
830 	/* Map of ino -> prev_ino for unlinked inode processing. */
831 	struct xfarray			*iunlink_prev;
832 };
833 
834 static void
835 xrep_agi_buf_cleanup(
836 	void		*buf)
837 {
838 	struct xrep_agi	*ragi = buf;
839 
840 	xfarray_destroy(ragi->iunlink_prev);
841 	xfarray_destroy(ragi->iunlink_next);
842 	xagino_bitmap_destroy(&ragi->iunlink_bmp);
843 }
844 
845 /*
846  * Given the inode btree roots described by *fab, find the roots, check them
847  * for sanity, and pass the root data back out via *fab.
848  */
849 STATIC int
850 xrep_agi_find_btrees(
851 	struct xrep_agi			*ragi)
852 {
853 	struct xfs_scrub		*sc = ragi->sc;
854 	struct xrep_find_ag_btree	*fab = ragi->fab;
855 	struct xfs_buf			*agf_bp;
856 	struct xfs_mount		*mp = sc->mp;
857 	int				error;
858 
859 	/* Read the AGF. */
860 	error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp);
861 	if (error)
862 		return error;
863 
864 	/* Find the btree roots. */
865 	error = xrep_find_ag_btree_roots(sc, agf_bp, fab, NULL);
866 	if (error)
867 		return error;
868 
869 	/* We must find the inobt root. */
870 	if (!xrep_check_btree_root(sc, &fab[XREP_AGI_INOBT]))
871 		return -EFSCORRUPTED;
872 
873 	/* We must find the finobt root if that feature is enabled. */
874 	if (xfs_has_finobt(mp) &&
875 	    !xrep_check_btree_root(sc, &fab[XREP_AGI_FINOBT]))
876 		return -EFSCORRUPTED;
877 
878 	return 0;
879 }
880 
881 /*
882  * Reinitialize the AGI header, making an in-core copy of the old contents so
883  * that we know which in-core state needs to be reinitialized.
884  */
885 STATIC void
886 xrep_agi_init_header(
887 	struct xrep_agi		*ragi)
888 {
889 	struct xfs_scrub	*sc = ragi->sc;
890 	struct xfs_buf		*agi_bp = ragi->agi_bp;
891 	struct xfs_agi		*old_agi = &ragi->old_agi;
892 	struct xfs_agi		*agi = agi_bp->b_addr;
893 	struct xfs_perag	*pag = sc->sa.pag;
894 	struct xfs_mount	*mp = sc->mp;
895 
896 	memcpy(old_agi, agi, sizeof(*old_agi));
897 	memset(agi, 0, BBTOB(agi_bp->b_length));
898 	agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
899 	agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
900 	agi->agi_seqno = cpu_to_be32(pag->pag_agno);
901 	agi->agi_length = cpu_to_be32(pag->block_count);
902 	agi->agi_newino = cpu_to_be32(NULLAGINO);
903 	agi->agi_dirino = cpu_to_be32(NULLAGINO);
904 	if (xfs_has_crc(mp))
905 		uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid);
906 
907 	/* Mark the incore AGF data stale until we're done fixing things. */
908 	ASSERT(xfs_perag_initialised_agi(pag));
909 	clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
910 }
911 
912 /* Set btree root information in an AGI. */
913 STATIC void
914 xrep_agi_set_roots(
915 	struct xrep_agi			*ragi)
916 {
917 	struct xfs_scrub		*sc = ragi->sc;
918 	struct xfs_agi			*agi = ragi->agi_bp->b_addr;
919 	struct xrep_find_ag_btree	*fab = ragi->fab;
920 
921 	agi->agi_root = cpu_to_be32(fab[XREP_AGI_INOBT].root);
922 	agi->agi_level = cpu_to_be32(fab[XREP_AGI_INOBT].height);
923 
924 	if (xfs_has_finobt(sc->mp)) {
925 		agi->agi_free_root = cpu_to_be32(fab[XREP_AGI_FINOBT].root);
926 		agi->agi_free_level = cpu_to_be32(fab[XREP_AGI_FINOBT].height);
927 	}
928 }
929 
930 /* Update the AGI counters. */
931 STATIC int
932 xrep_agi_calc_from_btrees(
933 	struct xrep_agi		*ragi)
934 {
935 	struct xfs_scrub	*sc = ragi->sc;
936 	struct xfs_buf		*agi_bp = ragi->agi_bp;
937 	struct xfs_btree_cur	*cur;
938 	struct xfs_agi		*agi = agi_bp->b_addr;
939 	struct xfs_mount	*mp = sc->mp;
940 	xfs_agino_t		count;
941 	xfs_agino_t		freecount;
942 	int			error;
943 
944 	cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp);
945 	error = xfs_ialloc_count_inodes(cur, &count, &freecount);
946 	if (error)
947 		goto err;
948 	if (xfs_has_inobtcounts(mp)) {
949 		xfs_agblock_t	blocks;
950 
951 		error = xfs_btree_count_blocks(cur, &blocks);
952 		if (error)
953 			goto err;
954 		agi->agi_iblocks = cpu_to_be32(blocks);
955 	}
956 	xfs_btree_del_cursor(cur, error);
957 
958 	agi->agi_count = cpu_to_be32(count);
959 	agi->agi_freecount = cpu_to_be32(freecount);
960 
961 	if (xfs_has_finobt(mp) && xfs_has_inobtcounts(mp)) {
962 		xfs_agblock_t	blocks;
963 
964 		cur = xfs_finobt_init_cursor(sc->sa.pag, sc->tp, agi_bp);
965 		error = xfs_btree_count_blocks(cur, &blocks);
966 		if (error)
967 			goto err;
968 		xfs_btree_del_cursor(cur, error);
969 		agi->agi_fblocks = cpu_to_be32(blocks);
970 	}
971 
972 	return 0;
973 err:
974 	xfs_btree_del_cursor(cur, error);
975 	return error;
976 }
977 
978 /*
979  * Record a forwards unlinked chain pointer from agino -> next_agino in our
980  * staging information.
981  */
982 static inline int
983 xrep_iunlink_store_next(
984 	struct xrep_agi		*ragi,
985 	xfs_agino_t		agino,
986 	xfs_agino_t		next_agino)
987 {
988 	ASSERT(next_agino != 0);
989 
990 	return xfarray_store(ragi->iunlink_next, agino, &next_agino);
991 }
992 
993 /*
994  * Record a backwards unlinked chain pointer from prev_ino <- agino in our
995  * staging information.
996  */
997 static inline int
998 xrep_iunlink_store_prev(
999 	struct xrep_agi		*ragi,
1000 	xfs_agino_t		agino,
1001 	xfs_agino_t		prev_agino)
1002 {
1003 	ASSERT(prev_agino != 0);
1004 
1005 	return xfarray_store(ragi->iunlink_prev, agino, &prev_agino);
1006 }
1007 
1008 /*
1009  * Given an @agino, look up the next inode in the iunlink bucket.  Returns
1010  * NULLAGINO if we're at the end of the chain, 0 if @agino is not in memory
1011  * like it should be, or a per-AG inode number.
1012  */
1013 static inline xfs_agino_t
1014 xrep_iunlink_next(
1015 	struct xfs_scrub	*sc,
1016 	xfs_agino_t		agino)
1017 {
1018 	struct xfs_inode	*ip;
1019 
1020 	ip = xfs_iunlink_lookup(sc->sa.pag, agino);
1021 	if (!ip)
1022 		return 0;
1023 
1024 	return ip->i_next_unlinked;
1025 }
1026 
1027 /*
1028  * Load the inode @agino into memory, set its i_prev_unlinked, and drop the
1029  * inode so it can be inactivated.  Returns NULLAGINO if we're at the end of
1030  * the chain or if we should stop walking the chain due to corruption; or a
1031  * per-AG inode number.
1032  */
1033 STATIC xfs_agino_t
1034 xrep_iunlink_reload_next(
1035 	struct xrep_agi		*ragi,
1036 	xfs_agino_t		prev_agino,
1037 	xfs_agino_t		agino)
1038 {
1039 	struct xfs_scrub	*sc = ragi->sc;
1040 	struct xfs_inode	*ip;
1041 	xfs_ino_t		ino;
1042 	xfs_agino_t		ret = NULLAGINO;
1043 	int			error;
1044 
1045 	ino = XFS_AGINO_TO_INO(sc->mp, sc->sa.pag->pag_agno, agino);
1046 	error = xchk_iget(ragi->sc, ino, &ip);
1047 	if (error)
1048 		return ret;
1049 
1050 	trace_xrep_iunlink_reload_next(ip, prev_agino);
1051 
1052 	/* If this is a linked inode, stop processing the chain. */
1053 	if (VFS_I(ip)->i_nlink != 0) {
1054 		xrep_iunlink_store_next(ragi, agino, NULLAGINO);
1055 		goto rele;
1056 	}
1057 
1058 	ip->i_prev_unlinked = prev_agino;
1059 	ret = ip->i_next_unlinked;
1060 
1061 	/*
1062 	 * Drop the inode reference that we just took.  We hold the AGI, so
1063 	 * this inode cannot move off the unlinked list and hence cannot be
1064 	 * reclaimed.
1065 	 */
1066 rele:
1067 	xchk_irele(sc, ip);
1068 	return ret;
1069 }
1070 
1071 /*
1072  * Walk an AGI unlinked bucket's list to load incore any unlinked inodes that
1073  * still existed at mount time.  This can happen if iunlink processing fails
1074  * during log recovery.
1075  */
1076 STATIC int
1077 xrep_iunlink_walk_ondisk_bucket(
1078 	struct xrep_agi		*ragi,
1079 	unsigned int		bucket)
1080 {
1081 	struct xfs_scrub	*sc = ragi->sc;
1082 	struct xfs_agi		*agi = sc->sa.agi_bp->b_addr;
1083 	xfs_agino_t		prev_agino = NULLAGINO;
1084 	xfs_agino_t		next_agino;
1085 	int			error = 0;
1086 
1087 	next_agino = be32_to_cpu(agi->agi_unlinked[bucket]);
1088 	while (next_agino != NULLAGINO) {
1089 		xfs_agino_t	agino = next_agino;
1090 
1091 		if (xchk_should_terminate(ragi->sc, &error))
1092 			return error;
1093 
1094 		trace_xrep_iunlink_walk_ondisk_bucket(sc->sa.pag, bucket,
1095 				prev_agino, agino);
1096 
1097 		if (bucket != agino % XFS_AGI_UNLINKED_BUCKETS)
1098 			break;
1099 
1100 		next_agino = xrep_iunlink_next(sc, agino);
1101 		if (!next_agino)
1102 			next_agino = xrep_iunlink_reload_next(ragi, prev_agino,
1103 					agino);
1104 
1105 		prev_agino = agino;
1106 	}
1107 
1108 	return 0;
1109 }
1110 
1111 /* Decide if this is an unlinked inode in this AG. */
1112 STATIC bool
1113 xrep_iunlink_igrab(
1114 	struct xfs_perag	*pag,
1115 	struct xfs_inode	*ip)
1116 {
1117 	struct xfs_mount	*mp = pag->pag_mount;
1118 
1119 	if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
1120 		return false;
1121 
1122 	if (!xfs_inode_on_unlinked_list(ip))
1123 		return false;
1124 
1125 	return true;
1126 }
1127 
1128 /*
1129  * Mark the given inode in the lookup batch in our unlinked inode bitmap, and
1130  * remember if this inode is the start of the unlinked chain.
1131  */
1132 STATIC int
1133 xrep_iunlink_visit(
1134 	struct xrep_agi		*ragi,
1135 	unsigned int		batch_idx)
1136 {
1137 	struct xfs_mount	*mp = ragi->sc->mp;
1138 	struct xfs_inode	*ip = ragi->lookup_batch[batch_idx];
1139 	xfs_agino_t		agino;
1140 	unsigned int		bucket;
1141 	int			error;
1142 
1143 	ASSERT(XFS_INO_TO_AGNO(mp, ip->i_ino) == ragi->sc->sa.pag->pag_agno);
1144 	ASSERT(xfs_inode_on_unlinked_list(ip));
1145 
1146 	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1147 	bucket = agino % XFS_AGI_UNLINKED_BUCKETS;
1148 
1149 	trace_xrep_iunlink_visit(ragi->sc->sa.pag, bucket,
1150 			ragi->iunlink_heads[bucket], ip);
1151 
1152 	error = xagino_bitmap_set(&ragi->iunlink_bmp, agino, 1);
1153 	if (error)
1154 		return error;
1155 
1156 	if (ip->i_prev_unlinked == NULLAGINO) {
1157 		if (ragi->iunlink_heads[bucket] == NULLAGINO)
1158 			ragi->iunlink_heads[bucket] = agino;
1159 	}
1160 
1161 	return 0;
1162 }
1163 
1164 /*
1165  * Find all incore unlinked inodes so that we can rebuild the unlinked buckets.
1166  * We hold the AGI so there should not be any modifications to the unlinked
1167  * list.
1168  */
1169 STATIC int
1170 xrep_iunlink_mark_incore(
1171 	struct xrep_agi		*ragi)
1172 {
1173 	struct xfs_perag	*pag = ragi->sc->sa.pag;
1174 	struct xfs_mount	*mp = pag->pag_mount;
1175 	uint32_t		first_index = 0;
1176 	bool			done = false;
1177 	unsigned int		nr_found = 0;
1178 
1179 	do {
1180 		unsigned int	i;
1181 		int		error = 0;
1182 
1183 		if (xchk_should_terminate(ragi->sc, &error))
1184 			return error;
1185 
1186 		rcu_read_lock();
1187 
1188 		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
1189 				(void **)&ragi->lookup_batch, first_index,
1190 				XREP_AGI_LOOKUP_BATCH);
1191 		if (!nr_found) {
1192 			rcu_read_unlock();
1193 			return 0;
1194 		}
1195 
1196 		for (i = 0; i < nr_found; i++) {
1197 			struct xfs_inode *ip = ragi->lookup_batch[i];
1198 
1199 			if (done || !xrep_iunlink_igrab(pag, ip))
1200 				ragi->lookup_batch[i] = NULL;
1201 
1202 			/*
1203 			 * Update the index for the next lookup. Catch
1204 			 * overflows into the next AG range which can occur if
1205 			 * we have inodes in the last block of the AG and we
1206 			 * are currently pointing to the last inode.
1207 			 *
1208 			 * Because we may see inodes that are from the wrong AG
1209 			 * due to RCU freeing and reallocation, only update the
1210 			 * index if it lies in this AG. It was a race that lead
1211 			 * us to see this inode, so another lookup from the
1212 			 * same index will not find it again.
1213 			 */
1214 			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
1215 				continue;
1216 			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
1217 			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
1218 				done = true;
1219 		}
1220 
1221 		/* unlock now we've grabbed the inodes. */
1222 		rcu_read_unlock();
1223 
1224 		for (i = 0; i < nr_found; i++) {
1225 			if (!ragi->lookup_batch[i])
1226 				continue;
1227 			error = xrep_iunlink_visit(ragi, i);
1228 			if (error)
1229 				return error;
1230 		}
1231 	} while (!done);
1232 
1233 	return 0;
1234 }
1235 
1236 /* Mark all the unlinked ondisk inodes in this inobt record in iunlink_bmp. */
1237 STATIC int
1238 xrep_iunlink_mark_ondisk_rec(
1239 	struct xfs_btree_cur		*cur,
1240 	const union xfs_btree_rec	*rec,
1241 	void				*priv)
1242 {
1243 	struct xfs_inobt_rec_incore	irec;
1244 	struct xrep_agi			*ragi = priv;
1245 	struct xfs_scrub		*sc = ragi->sc;
1246 	struct xfs_mount		*mp = cur->bc_mp;
1247 	xfs_agino_t			agino;
1248 	unsigned int			i;
1249 	int				error = 0;
1250 
1251 	xfs_inobt_btrec_to_irec(mp, rec, &irec);
1252 
1253 	for (i = 0, agino = irec.ir_startino;
1254 	     i < XFS_INODES_PER_CHUNK;
1255 	     i++, agino++) {
1256 		struct xfs_inode	*ip;
1257 		unsigned int		len = 1;
1258 
1259 		/* Skip free inodes */
1260 		if (XFS_INOBT_MASK(i) & irec.ir_free)
1261 			continue;
1262 		/* Skip inodes we've seen before */
1263 		if (xagino_bitmap_test(&ragi->iunlink_bmp, agino, &len))
1264 			continue;
1265 
1266 		/*
1267 		 * Skip incore inodes; these were already picked up by
1268 		 * the _mark_incore step.
1269 		 */
1270 		rcu_read_lock();
1271 		ip = radix_tree_lookup(&sc->sa.pag->pag_ici_root, agino);
1272 		rcu_read_unlock();
1273 		if (ip)
1274 			continue;
1275 
1276 		/*
1277 		 * Try to look up this inode.  If we can't get it, just move
1278 		 * on because we haven't actually scrubbed the inobt or the
1279 		 * inodes yet.
1280 		 */
1281 		error = xchk_iget(ragi->sc,
1282 				XFS_AGINO_TO_INO(mp, sc->sa.pag->pag_agno,
1283 						 agino),
1284 				&ip);
1285 		if (error)
1286 			continue;
1287 
1288 		trace_xrep_iunlink_reload_ondisk(ip);
1289 
1290 		if (VFS_I(ip)->i_nlink == 0)
1291 			error = xagino_bitmap_set(&ragi->iunlink_bmp, agino, 1);
1292 		xchk_irele(sc, ip);
1293 		if (error)
1294 			break;
1295 	}
1296 
1297 	return error;
1298 }
1299 
1300 /*
1301  * Find ondisk inodes that are unlinked and not in cache, and mark them in
1302  * iunlink_bmp.   We haven't checked the inobt yet, so we don't error out if
1303  * the btree is corrupt.
1304  */
1305 STATIC void
1306 xrep_iunlink_mark_ondisk(
1307 	struct xrep_agi		*ragi)
1308 {
1309 	struct xfs_scrub	*sc = ragi->sc;
1310 	struct xfs_buf		*agi_bp = ragi->agi_bp;
1311 	struct xfs_btree_cur	*cur;
1312 	int			error;
1313 
1314 	cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp);
1315 	error = xfs_btree_query_all(cur, xrep_iunlink_mark_ondisk_rec, ragi);
1316 	xfs_btree_del_cursor(cur, error);
1317 }
1318 
1319 /*
1320  * Walk an iunlink bucket's inode list.  For each inode that should be on this
1321  * chain, clear its entry in in iunlink_bmp because it's ok and we don't need
1322  * to touch it further.
1323  */
1324 STATIC int
1325 xrep_iunlink_resolve_bucket(
1326 	struct xrep_agi		*ragi,
1327 	unsigned int		bucket)
1328 {
1329 	struct xfs_scrub	*sc = ragi->sc;
1330 	struct xfs_inode	*ip;
1331 	xfs_agino_t		prev_agino = NULLAGINO;
1332 	xfs_agino_t		next_agino = ragi->iunlink_heads[bucket];
1333 	int			error = 0;
1334 
1335 	while (next_agino != NULLAGINO) {
1336 		if (xchk_should_terminate(ragi->sc, &error))
1337 			return error;
1338 
1339 		/* Find the next inode in the chain. */
1340 		ip = xfs_iunlink_lookup(sc->sa.pag, next_agino);
1341 		if (!ip) {
1342 			/* Inode not incore?  Terminate the chain. */
1343 			trace_xrep_iunlink_resolve_uncached(sc->sa.pag,
1344 					bucket, prev_agino, next_agino);
1345 
1346 			next_agino = NULLAGINO;
1347 			break;
1348 		}
1349 
1350 		if (next_agino % XFS_AGI_UNLINKED_BUCKETS != bucket) {
1351 			/*
1352 			 * Inode is in the wrong bucket.  Advance the list,
1353 			 * but pretend we didn't see this inode.
1354 			 */
1355 			trace_xrep_iunlink_resolve_wronglist(sc->sa.pag,
1356 					bucket, prev_agino, next_agino);
1357 
1358 			next_agino = ip->i_next_unlinked;
1359 			continue;
1360 		}
1361 
1362 		if (!xfs_inode_on_unlinked_list(ip)) {
1363 			/*
1364 			 * Incore inode doesn't think this inode is on an
1365 			 * unlinked list.  This is probably because we reloaded
1366 			 * it from disk.  Advance the list, but pretend we
1367 			 * didn't see this inode; we'll fix that later.
1368 			 */
1369 			trace_xrep_iunlink_resolve_nolist(sc->sa.pag,
1370 					bucket, prev_agino, next_agino);
1371 			next_agino = ip->i_next_unlinked;
1372 			continue;
1373 		}
1374 
1375 		trace_xrep_iunlink_resolve_ok(sc->sa.pag, bucket, prev_agino,
1376 				next_agino);
1377 
1378 		/*
1379 		 * Otherwise, this inode's unlinked pointers are ok.  Clear it
1380 		 * from the unlinked bitmap since we're done with it, and make
1381 		 * sure the chain is still correct.
1382 		 */
1383 		error = xagino_bitmap_clear(&ragi->iunlink_bmp, next_agino, 1);
1384 		if (error)
1385 			return error;
1386 
1387 		/* Remember the previous inode's next pointer. */
1388 		if (prev_agino != NULLAGINO) {
1389 			error = xrep_iunlink_store_next(ragi, prev_agino,
1390 					next_agino);
1391 			if (error)
1392 				return error;
1393 		}
1394 
1395 		/* Remember this inode's previous pointer. */
1396 		error = xrep_iunlink_store_prev(ragi, next_agino, prev_agino);
1397 		if (error)
1398 			return error;
1399 
1400 		/* Advance the list and remember this inode. */
1401 		prev_agino = next_agino;
1402 		next_agino = ip->i_next_unlinked;
1403 	}
1404 
1405 	/* Update the previous inode's next pointer. */
1406 	if (prev_agino != NULLAGINO) {
1407 		error = xrep_iunlink_store_next(ragi, prev_agino, next_agino);
1408 		if (error)
1409 			return error;
1410 	}
1411 
1412 	return 0;
1413 }
1414 
1415 /* Reinsert this unlinked inode into the head of the staged bucket list. */
1416 STATIC int
1417 xrep_iunlink_add_to_bucket(
1418 	struct xrep_agi		*ragi,
1419 	xfs_agino_t		agino)
1420 {
1421 	xfs_agino_t		current_head;
1422 	unsigned int		bucket;
1423 	int			error;
1424 
1425 	bucket = agino % XFS_AGI_UNLINKED_BUCKETS;
1426 
1427 	/* Point this inode at the current head of the bucket list. */
1428 	current_head = ragi->iunlink_heads[bucket];
1429 
1430 	trace_xrep_iunlink_add_to_bucket(ragi->sc->sa.pag, bucket, agino,
1431 			current_head);
1432 
1433 	error = xrep_iunlink_store_next(ragi, agino, current_head);
1434 	if (error)
1435 		return error;
1436 
1437 	/* Remember the head inode's previous pointer. */
1438 	if (current_head != NULLAGINO) {
1439 		error = xrep_iunlink_store_prev(ragi, current_head, agino);
1440 		if (error)
1441 			return error;
1442 	}
1443 
1444 	ragi->iunlink_heads[bucket] = agino;
1445 	return 0;
1446 }
1447 
1448 /* Reinsert unlinked inodes into the staged iunlink buckets. */
1449 STATIC int
1450 xrep_iunlink_add_lost_inodes(
1451 	uint32_t		start,
1452 	uint32_t		len,
1453 	void			*priv)
1454 {
1455 	struct xrep_agi		*ragi = priv;
1456 	int			error;
1457 
1458 	for (; len > 0; start++, len--) {
1459 		error = xrep_iunlink_add_to_bucket(ragi, start);
1460 		if (error)
1461 			return error;
1462 	}
1463 
1464 	return 0;
1465 }
1466 
1467 /*
1468  * Figure out the iunlink bucket values and find inodes that need to be
1469  * reinserted into the list.
1470  */
1471 STATIC int
1472 xrep_iunlink_rebuild_buckets(
1473 	struct xrep_agi		*ragi)
1474 {
1475 	unsigned int		i;
1476 	int			error;
1477 
1478 	/*
1479 	 * Walk the ondisk AGI unlinked list to find inodes that are on the
1480 	 * list but aren't in memory.  This can happen if a past log recovery
1481 	 * tried to clear the iunlinked list but failed.  Our scan rebuilds the
1482 	 * unlinked list using incore inodes, so we must load and link them
1483 	 * properly.
1484 	 */
1485 	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
1486 		error = xrep_iunlink_walk_ondisk_bucket(ragi, i);
1487 		if (error)
1488 			return error;
1489 	}
1490 
1491 	/*
1492 	 * Record all the incore unlinked inodes in iunlink_bmp that we didn't
1493 	 * find by walking the ondisk iunlink buckets.  This shouldn't happen,
1494 	 * but we can't risk forgetting an inode somewhere.
1495 	 */
1496 	error = xrep_iunlink_mark_incore(ragi);
1497 	if (error)
1498 		return error;
1499 
1500 	/*
1501 	 * If there are ondisk inodes that are unlinked and are not been loaded
1502 	 * into cache, record them in iunlink_bmp.
1503 	 */
1504 	xrep_iunlink_mark_ondisk(ragi);
1505 
1506 	/*
1507 	 * Walk each iunlink bucket to (re)construct as much of the incore list
1508 	 * as would be correct.  For each inode that survives this step, mark
1509 	 * it clear in iunlink_bmp; we're done with those inodes.
1510 	 */
1511 	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
1512 		error = xrep_iunlink_resolve_bucket(ragi, i);
1513 		if (error)
1514 			return error;
1515 	}
1516 
1517 	/*
1518 	 * Any unlinked inodes that we didn't find through the bucket list
1519 	 * walk (or was ignored by the walk) must be inserted into the bucket
1520 	 * list.  Stage this in memory for now.
1521 	 */
1522 	return xagino_bitmap_walk(&ragi->iunlink_bmp,
1523 			xrep_iunlink_add_lost_inodes, ragi);
1524 }
1525 
1526 /* Update i_next_iunlinked for the inode @agino. */
1527 STATIC int
1528 xrep_iunlink_relink_next(
1529 	struct xrep_agi		*ragi,
1530 	xfarray_idx_t		idx,
1531 	xfs_agino_t		next_agino)
1532 {
1533 	struct xfs_scrub	*sc = ragi->sc;
1534 	struct xfs_perag	*pag = sc->sa.pag;
1535 	struct xfs_inode	*ip;
1536 	xfarray_idx_t		agino = idx - 1;
1537 	bool			want_rele = false;
1538 	int			error = 0;
1539 
1540 	ip = xfs_iunlink_lookup(pag, agino);
1541 	if (!ip) {
1542 		xfs_ino_t	ino;
1543 		xfs_agino_t	prev_agino;
1544 
1545 		/*
1546 		 * No inode exists in cache.  Load it off the disk so that we
1547 		 * can reinsert it into the incore unlinked list.
1548 		 */
1549 		ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino);
1550 		error = xchk_iget(sc, ino, &ip);
1551 		if (error)
1552 			return -EFSCORRUPTED;
1553 
1554 		want_rele = true;
1555 
1556 		/* Set the backward pointer since this just came off disk. */
1557 		error = xfarray_load(ragi->iunlink_prev, agino, &prev_agino);
1558 		if (error)
1559 			goto out_rele;
1560 
1561 		trace_xrep_iunlink_relink_prev(ip, prev_agino);
1562 		ip->i_prev_unlinked = prev_agino;
1563 	}
1564 
1565 	/* Update the forward pointer. */
1566 	if (ip->i_next_unlinked != next_agino) {
1567 		error = xfs_iunlink_log_inode(sc->tp, ip, pag, next_agino);
1568 		if (error)
1569 			goto out_rele;
1570 
1571 		trace_xrep_iunlink_relink_next(ip, next_agino);
1572 		ip->i_next_unlinked = next_agino;
1573 	}
1574 
1575 out_rele:
1576 	/*
1577 	 * The iunlink lookup doesn't igrab because we hold the AGI buffer lock
1578 	 * and the inode cannot be reclaimed.  However, if we used iget to load
1579 	 * a missing inode, we must irele it here.
1580 	 */
1581 	if (want_rele)
1582 		xchk_irele(sc, ip);
1583 	return error;
1584 }
1585 
1586 /* Update i_prev_iunlinked for the inode @agino. */
1587 STATIC int
1588 xrep_iunlink_relink_prev(
1589 	struct xrep_agi		*ragi,
1590 	xfarray_idx_t		idx,
1591 	xfs_agino_t		prev_agino)
1592 {
1593 	struct xfs_scrub	*sc = ragi->sc;
1594 	struct xfs_perag	*pag = sc->sa.pag;
1595 	struct xfs_inode	*ip;
1596 	xfarray_idx_t		agino = idx - 1;
1597 	bool			want_rele = false;
1598 	int			error = 0;
1599 
1600 	ASSERT(prev_agino != 0);
1601 
1602 	ip = xfs_iunlink_lookup(pag, agino);
1603 	if (!ip) {
1604 		xfs_ino_t	ino;
1605 		xfs_agino_t	next_agino;
1606 
1607 		/*
1608 		 * No inode exists in cache.  Load it off the disk so that we
1609 		 * can reinsert it into the incore unlinked list.
1610 		 */
1611 		ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino);
1612 		error = xchk_iget(sc, ino, &ip);
1613 		if (error)
1614 			return -EFSCORRUPTED;
1615 
1616 		want_rele = true;
1617 
1618 		/* Set the forward pointer since this just came off disk. */
1619 		error = xfarray_load(ragi->iunlink_prev, agino, &next_agino);
1620 		if (error)
1621 			goto out_rele;
1622 
1623 		error = xfs_iunlink_log_inode(sc->tp, ip, pag, next_agino);
1624 		if (error)
1625 			goto out_rele;
1626 
1627 		trace_xrep_iunlink_relink_next(ip, next_agino);
1628 		ip->i_next_unlinked = next_agino;
1629 	}
1630 
1631 	/* Update the backward pointer. */
1632 	if (ip->i_prev_unlinked != prev_agino) {
1633 		trace_xrep_iunlink_relink_prev(ip, prev_agino);
1634 		ip->i_prev_unlinked = prev_agino;
1635 	}
1636 
1637 out_rele:
1638 	/*
1639 	 * The iunlink lookup doesn't igrab because we hold the AGI buffer lock
1640 	 * and the inode cannot be reclaimed.  However, if we used iget to load
1641 	 * a missing inode, we must irele it here.
1642 	 */
1643 	if (want_rele)
1644 		xchk_irele(sc, ip);
1645 	return error;
1646 }
1647 
1648 /* Log all the iunlink updates we need to finish regenerating the AGI. */
1649 STATIC int
1650 xrep_iunlink_commit(
1651 	struct xrep_agi		*ragi)
1652 {
1653 	struct xfs_agi		*agi = ragi->agi_bp->b_addr;
1654 	xfarray_idx_t		idx = XFARRAY_CURSOR_INIT;
1655 	xfs_agino_t		agino;
1656 	unsigned int		i;
1657 	int			error;
1658 
1659 	/* Fix all the forward links */
1660 	while ((error = xfarray_iter(ragi->iunlink_next, &idx, &agino)) == 1) {
1661 		error = xrep_iunlink_relink_next(ragi, idx, agino);
1662 		if (error)
1663 			return error;
1664 	}
1665 
1666 	/* Fix all the back links */
1667 	idx = XFARRAY_CURSOR_INIT;
1668 	while ((error = xfarray_iter(ragi->iunlink_prev, &idx, &agino)) == 1) {
1669 		error = xrep_iunlink_relink_prev(ragi, idx, agino);
1670 		if (error)
1671 			return error;
1672 	}
1673 
1674 	/* Copy the staged iunlink buckets to the new AGI. */
1675 	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
1676 		trace_xrep_iunlink_commit_bucket(ragi->sc->sa.pag, i,
1677 				be32_to_cpu(ragi->old_agi.agi_unlinked[i]),
1678 				ragi->iunlink_heads[i]);
1679 
1680 		agi->agi_unlinked[i] = cpu_to_be32(ragi->iunlink_heads[i]);
1681 	}
1682 
1683 	return 0;
1684 }
1685 
1686 /* Trigger reinitialization of the in-core data. */
1687 STATIC int
1688 xrep_agi_commit_new(
1689 	struct xrep_agi		*ragi)
1690 {
1691 	struct xfs_scrub	*sc = ragi->sc;
1692 	struct xfs_buf		*agi_bp = ragi->agi_bp;
1693 	struct xfs_perag	*pag;
1694 	struct xfs_agi		*agi = agi_bp->b_addr;
1695 
1696 	/* Trigger inode count recalculation */
1697 	xfs_force_summary_recalc(sc->mp);
1698 
1699 	/* Write this to disk. */
1700 	xfs_trans_buf_set_type(sc->tp, agi_bp, XFS_BLFT_AGI_BUF);
1701 	xfs_trans_log_buf(sc->tp, agi_bp, 0, BBTOB(agi_bp->b_length) - 1);
1702 
1703 	/* Now reinitialize the in-core counters if necessary. */
1704 	pag = sc->sa.pag;
1705 	pag->pagi_count = be32_to_cpu(agi->agi_count);
1706 	pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
1707 	set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
1708 
1709 	return xrep_roll_ag_trans(sc);
1710 }
1711 
1712 /* Repair the AGI. */
1713 int
1714 xrep_agi(
1715 	struct xfs_scrub	*sc)
1716 {
1717 	struct xrep_agi		*ragi;
1718 	struct xfs_mount	*mp = sc->mp;
1719 	char			*descr;
1720 	unsigned int		i;
1721 	int			error;
1722 
1723 	/* We require the rmapbt to rebuild anything. */
1724 	if (!xfs_has_rmapbt(mp))
1725 		return -EOPNOTSUPP;
1726 
1727 	sc->buf = kzalloc(sizeof(struct xrep_agi), XCHK_GFP_FLAGS);
1728 	if (!sc->buf)
1729 		return -ENOMEM;
1730 	ragi = sc->buf;
1731 	ragi->sc = sc;
1732 
1733 	ragi->fab[XREP_AGI_INOBT] = (struct xrep_find_ag_btree){
1734 		.rmap_owner	= XFS_RMAP_OWN_INOBT,
1735 		.buf_ops	= &xfs_inobt_buf_ops,
1736 		.maxlevels	= M_IGEO(sc->mp)->inobt_maxlevels,
1737 	};
1738 	ragi->fab[XREP_AGI_FINOBT] = (struct xrep_find_ag_btree){
1739 		.rmap_owner	= XFS_RMAP_OWN_INOBT,
1740 		.buf_ops	= &xfs_finobt_buf_ops,
1741 		.maxlevels	= M_IGEO(sc->mp)->inobt_maxlevels,
1742 	};
1743 	ragi->fab[XREP_AGI_END] = (struct xrep_find_ag_btree){
1744 		.buf_ops	= NULL,
1745 	};
1746 
1747 	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++)
1748 		ragi->iunlink_heads[i] = NULLAGINO;
1749 
1750 	xagino_bitmap_init(&ragi->iunlink_bmp);
1751 	sc->buf_cleanup = xrep_agi_buf_cleanup;
1752 
1753 	descr = xchk_xfile_ag_descr(sc, "iunlinked next pointers");
1754 	error = xfarray_create(descr, 0, sizeof(xfs_agino_t),
1755 			&ragi->iunlink_next);
1756 	kfree(descr);
1757 	if (error)
1758 		return error;
1759 
1760 	descr = xchk_xfile_ag_descr(sc, "iunlinked prev pointers");
1761 	error = xfarray_create(descr, 0, sizeof(xfs_agino_t),
1762 			&ragi->iunlink_prev);
1763 	kfree(descr);
1764 	if (error)
1765 		return error;
1766 
1767 	/*
1768 	 * Make sure we have the AGI buffer, as scrub might have decided it
1769 	 * was corrupt after xfs_ialloc_read_agi failed with -EFSCORRUPTED.
1770 	 */
1771 	error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
1772 			XFS_AG_DADDR(mp, sc->sa.pag->pag_agno,
1773 						XFS_AGI_DADDR(mp)),
1774 			XFS_FSS_TO_BB(mp, 1), 0, &ragi->agi_bp, NULL);
1775 	if (error)
1776 		return error;
1777 	ragi->agi_bp->b_ops = &xfs_agi_buf_ops;
1778 
1779 	/* Find the AGI btree roots. */
1780 	error = xrep_agi_find_btrees(ragi);
1781 	if (error)
1782 		return error;
1783 
1784 	error = xrep_iunlink_rebuild_buckets(ragi);
1785 	if (error)
1786 		return error;
1787 
1788 	/* Last chance to abort before we start committing fixes. */
1789 	if (xchk_should_terminate(sc, &error))
1790 		return error;
1791 
1792 	/* Start rewriting the header and implant the btrees we found. */
1793 	xrep_agi_init_header(ragi);
1794 	xrep_agi_set_roots(ragi);
1795 	error = xrep_agi_calc_from_btrees(ragi);
1796 	if (error)
1797 		goto out_revert;
1798 	error = xrep_iunlink_commit(ragi);
1799 	if (error)
1800 		goto out_revert;
1801 
1802 	/* Reinitialize in-core state. */
1803 	return xrep_agi_commit_new(ragi);
1804 
1805 out_revert:
1806 	/* Mark the incore AGI state stale and revert the AGI. */
1807 	clear_bit(XFS_AGSTATE_AGI_INIT, &sc->sa.pag->pag_opstate);
1808 	memcpy(ragi->agi_bp->b_addr, &ragi->old_agi, sizeof(struct xfs_agi));
1809 	return error;
1810 }
1811