xref: /linux/fs/xfs/scrub/agheader_repair.c (revision 6f7e6393d1ce636bb7ec77a7fe7b77458fddf701)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs_platform.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_btree.h"
13 #include "xfs_log_format.h"
14 #include "xfs_trans.h"
15 #include "xfs_sb.h"
16 #include "xfs_alloc.h"
17 #include "xfs_alloc_btree.h"
18 #include "xfs_ialloc.h"
19 #include "xfs_ialloc_btree.h"
20 #include "xfs_rmap.h"
21 #include "xfs_rmap_btree.h"
22 #include "xfs_refcount_btree.h"
23 #include "xfs_ag.h"
24 #include "xfs_inode.h"
25 #include "xfs_iunlink_item.h"
26 #include "scrub/scrub.h"
27 #include "scrub/common.h"
28 #include "scrub/trace.h"
29 #include "scrub/repair.h"
30 #include "scrub/bitmap.h"
31 #include "scrub/agb_bitmap.h"
32 #include "scrub/agino_bitmap.h"
33 #include "scrub/reap.h"
34 #include "scrub/xfile.h"
35 #include "scrub/xfarray.h"
36 
37 /* Superblock */
38 
39 /* Repair the superblock. */
40 int
41 xrep_superblock(
42 	struct xfs_scrub	*sc)
43 {
44 	struct xfs_mount	*mp = sc->mp;
45 	struct xfs_buf		*bp;
46 	xfs_agnumber_t		agno;
47 	int			error;
48 
49 	/* Don't try to repair AG 0's sb; let xfs_repair deal with it. */
50 	agno = sc->sm->sm_agno;
51 	if (agno == 0)
52 		return -EOPNOTSUPP;
53 
54 	error = xfs_sb_get_secondary(mp, sc->tp, agno, &bp);
55 	if (error)
56 		return error;
57 
58 	/* Last chance to abort before we start committing fixes. */
59 	if (xchk_should_terminate(sc, &error))
60 		return error;
61 
62 	/* Copy AG 0's superblock to this one. */
63 	xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
64 	xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
65 
66 	/*
67 	 * Don't write out a secondary super with NEEDSREPAIR or log incompat
68 	 * features set, since both are ignored when set on a secondary.
69 	 */
70 	if (xfs_has_crc(mp)) {
71 		struct xfs_dsb		*sb = bp->b_addr;
72 
73 		sb->sb_features_incompat &=
74 				~cpu_to_be32(XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR);
75 		sb->sb_features_log_incompat = 0;
76 	}
77 
78 	/* Write this to disk. */
79 	xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF);
80 	xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1);
81 	return 0;
82 }
83 
84 /* AGF */
85 
86 struct xrep_agf_allocbt {
87 	struct xfs_scrub	*sc;
88 	xfs_agblock_t		freeblks;
89 	xfs_agblock_t		longest;
90 };
91 
92 /* Record free space shape information. */
93 STATIC int
94 xrep_agf_walk_allocbt(
95 	struct xfs_btree_cur		*cur,
96 	const struct xfs_alloc_rec_incore *rec,
97 	void				*priv)
98 {
99 	struct xrep_agf_allocbt		*raa = priv;
100 	int				error = 0;
101 
102 	if (xchk_should_terminate(raa->sc, &error))
103 		return error;
104 
105 	raa->freeblks += rec->ar_blockcount;
106 	if (rec->ar_blockcount > raa->longest)
107 		raa->longest = rec->ar_blockcount;
108 	return error;
109 }
110 
111 /* Does this AGFL block look sane? */
112 STATIC int
113 xrep_agf_check_agfl_block(
114 	struct xfs_mount	*mp,
115 	xfs_agblock_t		agbno,
116 	void			*priv)
117 {
118 	struct xfs_scrub	*sc = priv;
119 
120 	if (!xfs_verify_agbno(sc->sa.pag, agbno))
121 		return -EFSCORRUPTED;
122 	return 0;
123 }
124 
125 /*
126  * Offset within the xrep_find_ag_btree array for each btree type.  Avoid the
127  * XFS_BTNUM_ names here to avoid creating a sparse array.
128  */
129 enum {
130 	XREP_AGF_BNOBT = 0,
131 	XREP_AGF_CNTBT,
132 	XREP_AGF_RMAPBT,
133 	XREP_AGF_REFCOUNTBT,
134 	XREP_AGF_END,
135 	XREP_AGF_MAX
136 };
137 
138 /* Check a btree root candidate. */
139 static inline bool
140 xrep_check_btree_root(
141 	struct xfs_scrub		*sc,
142 	struct xrep_find_ag_btree	*fab)
143 {
144 	return xfs_verify_agbno(sc->sa.pag, fab->root) &&
145 	       fab->height <= fab->maxlevels;
146 }
147 
148 /*
149  * Given the btree roots described by *fab, find the roots, check them for
150  * sanity, and pass the root data back out via *fab.
151  *
152  * This is /also/ a chicken and egg problem because we have to use the rmapbt
153  * (rooted in the AGF) to find the btrees rooted in the AGF.  We also have no
154  * idea if the btrees make any sense.  If we hit obvious corruptions in those
155  * btrees we'll bail out.
156  */
157 STATIC int
158 xrep_agf_find_btrees(
159 	struct xfs_scrub		*sc,
160 	struct xfs_buf			*agf_bp,
161 	struct xrep_find_ag_btree	*fab,
162 	struct xfs_buf			*agfl_bp)
163 {
164 	struct xfs_agf			*old_agf = agf_bp->b_addr;
165 	int				error;
166 
167 	/* Go find the root data. */
168 	error = xrep_find_ag_btree_roots(sc, agf_bp, fab, agfl_bp);
169 	if (error)
170 		return error;
171 
172 	/* We must find the bnobt, cntbt, and rmapbt roots. */
173 	if (!xrep_check_btree_root(sc, &fab[XREP_AGF_BNOBT]) ||
174 	    !xrep_check_btree_root(sc, &fab[XREP_AGF_CNTBT]) ||
175 	    !xrep_check_btree_root(sc, &fab[XREP_AGF_RMAPBT]))
176 		return -EFSCORRUPTED;
177 
178 	/*
179 	 * We relied on the rmapbt to reconstruct the AGF.  If we get a
180 	 * different root then something's seriously wrong.
181 	 */
182 	if (fab[XREP_AGF_RMAPBT].root != be32_to_cpu(old_agf->agf_rmap_root))
183 		return -EFSCORRUPTED;
184 
185 	/* We must find the refcountbt root if that feature is enabled. */
186 	if (xfs_has_reflink(sc->mp) &&
187 	    !xrep_check_btree_root(sc, &fab[XREP_AGF_REFCOUNTBT]))
188 		return -EFSCORRUPTED;
189 
190 	return 0;
191 }
192 
193 /*
194  * Reinitialize the AGF header, making an in-core copy of the old contents so
195  * that we know which in-core state needs to be reinitialized.
196  */
197 STATIC void
198 xrep_agf_init_header(
199 	struct xfs_scrub	*sc,
200 	struct xfs_buf		*agf_bp,
201 	struct xfs_agf		*old_agf)
202 {
203 	struct xfs_mount	*mp = sc->mp;
204 	struct xfs_perag	*pag = sc->sa.pag;
205 	struct xfs_agf		*agf = agf_bp->b_addr;
206 
207 	memcpy(old_agf, agf, sizeof(*old_agf));
208 	memset(agf, 0, BBTOB(agf_bp->b_length));
209 	agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
210 	agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
211 	agf->agf_seqno = cpu_to_be32(pag_agno(pag));
212 	agf->agf_length = cpu_to_be32(pag_group(pag)->xg_block_count);
213 	agf->agf_flfirst = old_agf->agf_flfirst;
214 	agf->agf_fllast = old_agf->agf_fllast;
215 	agf->agf_flcount = old_agf->agf_flcount;
216 	if (xfs_has_crc(mp))
217 		uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid);
218 
219 	/* Mark the incore AGF data stale until we're done fixing things. */
220 	ASSERT(xfs_perag_initialised_agf(pag));
221 	clear_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
222 }
223 
224 /* Set btree root information in an AGF. */
225 STATIC void
226 xrep_agf_set_roots(
227 	struct xfs_scrub		*sc,
228 	struct xfs_agf			*agf,
229 	struct xrep_find_ag_btree	*fab)
230 {
231 	agf->agf_bno_root = cpu_to_be32(fab[XREP_AGF_BNOBT].root);
232 	agf->agf_bno_level = cpu_to_be32(fab[XREP_AGF_BNOBT].height);
233 
234 	agf->agf_cnt_root = cpu_to_be32(fab[XREP_AGF_CNTBT].root);
235 	agf->agf_cnt_level = cpu_to_be32(fab[XREP_AGF_CNTBT].height);
236 
237 	agf->agf_rmap_root = cpu_to_be32(fab[XREP_AGF_RMAPBT].root);
238 	agf->agf_rmap_level = cpu_to_be32(fab[XREP_AGF_RMAPBT].height);
239 
240 	if (xfs_has_reflink(sc->mp)) {
241 		agf->agf_refcount_root =
242 				cpu_to_be32(fab[XREP_AGF_REFCOUNTBT].root);
243 		agf->agf_refcount_level =
244 				cpu_to_be32(fab[XREP_AGF_REFCOUNTBT].height);
245 	}
246 }
247 
248 /* Update all AGF fields which derive from btree contents. */
249 STATIC int
250 xrep_agf_calc_from_btrees(
251 	struct xfs_scrub	*sc,
252 	struct xfs_buf		*agf_bp)
253 {
254 	struct xrep_agf_allocbt	raa = { .sc = sc };
255 	struct xfs_btree_cur	*cur = NULL;
256 	struct xfs_agf		*agf = agf_bp->b_addr;
257 	struct xfs_mount	*mp = sc->mp;
258 	xfs_agblock_t		btreeblks;
259 	xfs_filblks_t		blocks;
260 	int			error;
261 
262 	/* Update the AGF counters from the bnobt. */
263 	cur = xfs_bnobt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
264 	error = xfs_alloc_query_all(cur, xrep_agf_walk_allocbt, &raa);
265 	if (error)
266 		goto err;
267 	error = xfs_btree_count_blocks(cur, &blocks);
268 	if (error)
269 		goto err;
270 	xfs_btree_del_cursor(cur, error);
271 	btreeblks = blocks - 1;
272 	agf->agf_freeblks = cpu_to_be32(raa.freeblks);
273 	agf->agf_longest = cpu_to_be32(raa.longest);
274 
275 	/* Update the AGF counters from the cntbt. */
276 	cur = xfs_cntbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
277 	error = xfs_btree_count_blocks(cur, &blocks);
278 	if (error)
279 		goto err;
280 	xfs_btree_del_cursor(cur, error);
281 	btreeblks += blocks - 1;
282 
283 	/* Update the AGF counters from the rmapbt. */
284 	cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
285 	error = xfs_btree_count_blocks(cur, &blocks);
286 	if (error)
287 		goto err;
288 	xfs_btree_del_cursor(cur, error);
289 	agf->agf_rmap_blocks = cpu_to_be32(blocks);
290 	btreeblks += blocks - 1;
291 
292 	agf->agf_btreeblks = cpu_to_be32(btreeblks);
293 
294 	/* Update the AGF counters from the refcountbt. */
295 	if (xfs_has_reflink(mp)) {
296 		cur = xfs_refcountbt_init_cursor(mp, sc->tp, agf_bp,
297 				sc->sa.pag);
298 		error = xfs_btree_count_blocks(cur, &blocks);
299 		if (error)
300 			goto err;
301 		xfs_btree_del_cursor(cur, error);
302 		agf->agf_refcount_blocks = cpu_to_be32(blocks);
303 	}
304 
305 	return 0;
306 err:
307 	xfs_btree_del_cursor(cur, error);
308 	return error;
309 }
310 
311 /* Commit the new AGF and reinitialize the incore state. */
312 STATIC int
313 xrep_agf_commit_new(
314 	struct xfs_scrub	*sc,
315 	struct xfs_buf		*agf_bp)
316 {
317 	struct xfs_perag	*pag;
318 	struct xfs_agf		*agf = agf_bp->b_addr;
319 
320 	/* Trigger fdblocks recalculation */
321 	xfs_force_summary_recalc(sc->mp);
322 
323 	/* Write this to disk. */
324 	xfs_trans_buf_set_type(sc->tp, agf_bp, XFS_BLFT_AGF_BUF);
325 	xfs_trans_log_buf(sc->tp, agf_bp, 0, BBTOB(agf_bp->b_length) - 1);
326 
327 	/* Now reinitialize the in-core counters we changed. */
328 	pag = sc->sa.pag;
329 	pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
330 	pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
331 	pag->pagf_longest = be32_to_cpu(agf->agf_longest);
332 	pag->pagf_bno_level = be32_to_cpu(agf->agf_bno_level);
333 	pag->pagf_cnt_level = be32_to_cpu(agf->agf_cnt_level);
334 	pag->pagf_rmap_level = be32_to_cpu(agf->agf_rmap_level);
335 	pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
336 	set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
337 
338 	return xrep_roll_ag_trans(sc);
339 }
340 
341 /* Repair the AGF. v5 filesystems only. */
342 int
343 xrep_agf(
344 	struct xfs_scrub		*sc)
345 {
346 	struct xrep_find_ag_btree	fab[XREP_AGF_MAX] = {
347 		[XREP_AGF_BNOBT] = {
348 			.rmap_owner = XFS_RMAP_OWN_AG,
349 			.buf_ops = &xfs_bnobt_buf_ops,
350 			.maxlevels = sc->mp->m_alloc_maxlevels,
351 		},
352 		[XREP_AGF_CNTBT] = {
353 			.rmap_owner = XFS_RMAP_OWN_AG,
354 			.buf_ops = &xfs_cntbt_buf_ops,
355 			.maxlevels = sc->mp->m_alloc_maxlevels,
356 		},
357 		[XREP_AGF_RMAPBT] = {
358 			.rmap_owner = XFS_RMAP_OWN_AG,
359 			.buf_ops = &xfs_rmapbt_buf_ops,
360 			.maxlevels = sc->mp->m_rmap_maxlevels,
361 		},
362 		[XREP_AGF_REFCOUNTBT] = {
363 			.rmap_owner = XFS_RMAP_OWN_REFC,
364 			.buf_ops = &xfs_refcountbt_buf_ops,
365 			.maxlevels = sc->mp->m_refc_maxlevels,
366 		},
367 		[XREP_AGF_END] = {
368 			.buf_ops = NULL,
369 		},
370 	};
371 	struct xfs_agf			old_agf;
372 	struct xfs_mount		*mp = sc->mp;
373 	struct xfs_buf			*agf_bp;
374 	struct xfs_buf			*agfl_bp;
375 	struct xfs_agf			*agf;
376 	int				error;
377 
378 	/* We require the rmapbt to rebuild anything. */
379 	if (!xfs_has_rmapbt(mp))
380 		return -EOPNOTSUPP;
381 
382 	/*
383 	 * Make sure we have the AGF buffer, as scrub might have decided it
384 	 * was corrupt after xfs_alloc_read_agf failed with -EFSCORRUPTED.
385 	 */
386 	error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
387 			XFS_AG_DADDR(mp, pag_agno(sc->sa.pag),
388 						XFS_AGF_DADDR(mp)),
389 			XFS_FSS_TO_BB(mp, 1), 0, &agf_bp, NULL);
390 	if (error)
391 		return error;
392 	agf_bp->b_ops = &xfs_agf_buf_ops;
393 	agf = agf_bp->b_addr;
394 
395 	/*
396 	 * Load the AGFL so that we can screen out OWN_AG blocks that are on
397 	 * the AGFL now; these blocks might have once been part of the
398 	 * bno/cnt/rmap btrees but are not now.  This is a chicken and egg
399 	 * problem: the AGF is corrupt, so we have to trust the AGFL contents
400 	 * because we can't do any serious cross-referencing with any of the
401 	 * btrees rooted in the AGF.  If the AGFL contents are obviously bad
402 	 * then we'll bail out.
403 	 */
404 	error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
405 	if (error)
406 		return error;
407 
408 	/*
409 	 * Spot-check the AGFL blocks; if they're obviously corrupt then
410 	 * there's nothing we can do but bail out.
411 	 */
412 	error = xfs_agfl_walk(sc->mp, agf_bp->b_addr, agfl_bp,
413 			xrep_agf_check_agfl_block, sc);
414 	if (error)
415 		return error;
416 
417 	/*
418 	 * Find the AGF btree roots.  This is also a chicken-and-egg situation;
419 	 * see the function for more details.
420 	 */
421 	error = xrep_agf_find_btrees(sc, agf_bp, fab, agfl_bp);
422 	if (error)
423 		return error;
424 
425 	/* Last chance to abort before we start committing fixes. */
426 	if (xchk_should_terminate(sc, &error))
427 		return error;
428 
429 	/* Start rewriting the header and implant the btrees we found. */
430 	xrep_agf_init_header(sc, agf_bp, &old_agf);
431 	xrep_agf_set_roots(sc, agf, fab);
432 	error = xrep_agf_calc_from_btrees(sc, agf_bp);
433 	if (error)
434 		goto out_revert;
435 
436 	/* Commit the changes and reinitialize incore state. */
437 	return xrep_agf_commit_new(sc, agf_bp);
438 
439 out_revert:
440 	/* Mark the incore AGF state stale and revert the AGF. */
441 	clear_bit(XFS_AGSTATE_AGF_INIT, &sc->sa.pag->pag_opstate);
442 	memcpy(agf, &old_agf, sizeof(old_agf));
443 	return error;
444 }
445 
446 /* AGFL */
447 
448 struct xrep_agfl {
449 	/* Bitmap of alleged AGFL blocks that we're not going to add. */
450 	struct xagb_bitmap	crossed;
451 
452 	/* Bitmap of other OWN_AG metadata blocks. */
453 	struct xagb_bitmap	agmetablocks;
454 
455 	/* Bitmap of free space. */
456 	struct xagb_bitmap	*freesp;
457 
458 	/* rmapbt cursor for finding crosslinked blocks */
459 	struct xfs_btree_cur	*rmap_cur;
460 
461 	struct xfs_scrub	*sc;
462 };
463 
464 /* Record all OWN_AG (free space btree) information from the rmap data. */
465 STATIC int
466 xrep_agfl_walk_rmap(
467 	struct xfs_btree_cur	*cur,
468 	const struct xfs_rmap_irec *rec,
469 	void			*priv)
470 {
471 	struct xrep_agfl	*ra = priv;
472 	int			error = 0;
473 
474 	if (xchk_should_terminate(ra->sc, &error))
475 		return error;
476 
477 	/* Record all the OWN_AG blocks. */
478 	if (rec->rm_owner == XFS_RMAP_OWN_AG) {
479 		error = xagb_bitmap_set(ra->freesp, rec->rm_startblock,
480 				rec->rm_blockcount);
481 		if (error)
482 			return error;
483 	}
484 
485 	return xagb_bitmap_set_btcur_path(&ra->agmetablocks, cur);
486 }
487 
488 /* Strike out the blocks that are cross-linked according to the rmapbt. */
489 STATIC int
490 xrep_agfl_check_extent(
491 	uint32_t		agbno,
492 	uint32_t		len,
493 	void			*priv)
494 {
495 	struct xrep_agfl	*ra = priv;
496 	xfs_agblock_t		last_agbno = agbno + len - 1;
497 	int			error;
498 
499 	while (agbno <= last_agbno) {
500 		bool		other_owners;
501 
502 		error = xfs_rmap_has_other_keys(ra->rmap_cur, agbno, 1,
503 				&XFS_RMAP_OINFO_AG, &other_owners);
504 		if (error)
505 			return error;
506 
507 		if (other_owners) {
508 			error = xagb_bitmap_set(&ra->crossed, agbno, 1);
509 			if (error)
510 				return error;
511 		}
512 
513 		if (xchk_should_terminate(ra->sc, &error))
514 			return error;
515 		agbno++;
516 	}
517 
518 	return 0;
519 }
520 
521 /*
522  * Map out all the non-AGFL OWN_AG space in this AG so that we can deduce
523  * which blocks belong to the AGFL.
524  *
525  * Compute the set of old AGFL blocks by subtracting from the list of OWN_AG
526  * blocks the list of blocks owned by all other OWN_AG metadata (bnobt, cntbt,
527  * rmapbt).  These are the old AGFL blocks, so return that list and the number
528  * of blocks we're actually going to put back on the AGFL.
529  */
530 STATIC int
531 xrep_agfl_collect_blocks(
532 	struct xfs_scrub	*sc,
533 	struct xfs_buf		*agf_bp,
534 	struct xagb_bitmap	*agfl_extents,
535 	xfs_agblock_t		*flcount)
536 {
537 	struct xrep_agfl	ra;
538 	struct xfs_mount	*mp = sc->mp;
539 	struct xfs_btree_cur	*cur;
540 	int			error;
541 
542 	ra.sc = sc;
543 	ra.freesp = agfl_extents;
544 	xagb_bitmap_init(&ra.agmetablocks);
545 	xagb_bitmap_init(&ra.crossed);
546 
547 	/* Find all space used by the free space btrees & rmapbt. */
548 	cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
549 	error = xfs_rmap_query_all(cur, xrep_agfl_walk_rmap, &ra);
550 	xfs_btree_del_cursor(cur, error);
551 	if (error)
552 		goto out_bmp;
553 
554 	/* Find all blocks currently being used by the bnobt. */
555 	cur = xfs_bnobt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
556 	error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur);
557 	xfs_btree_del_cursor(cur, error);
558 	if (error)
559 		goto out_bmp;
560 
561 	/* Find all blocks currently being used by the cntbt. */
562 	cur = xfs_cntbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
563 	error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur);
564 	xfs_btree_del_cursor(cur, error);
565 	if (error)
566 		goto out_bmp;
567 
568 	/*
569 	 * Drop the freesp meta blocks that are in use by btrees.
570 	 * The remaining blocks /should/ be AGFL blocks.
571 	 */
572 	error = xagb_bitmap_disunion(agfl_extents, &ra.agmetablocks);
573 	if (error)
574 		goto out_bmp;
575 
576 	/* Strike out the blocks that are cross-linked. */
577 	ra.rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
578 	error = xagb_bitmap_walk(agfl_extents, xrep_agfl_check_extent, &ra);
579 	xfs_btree_del_cursor(ra.rmap_cur, error);
580 	if (error)
581 		goto out_bmp;
582 	error = xagb_bitmap_disunion(agfl_extents, &ra.crossed);
583 	if (error)
584 		goto out_bmp;
585 
586 	/*
587 	 * Calculate the new AGFL size.  If we found more blocks than fit in
588 	 * the AGFL we'll free them later.
589 	 */
590 	*flcount = min_t(uint64_t, xagb_bitmap_hweight(agfl_extents),
591 			 xfs_agfl_size(mp));
592 
593 out_bmp:
594 	xagb_bitmap_destroy(&ra.crossed);
595 	xagb_bitmap_destroy(&ra.agmetablocks);
596 	return error;
597 }
598 
599 /* Update the AGF and reset the in-core state. */
600 STATIC void
601 xrep_agfl_update_agf(
602 	struct xfs_scrub	*sc,
603 	struct xfs_buf		*agf_bp,
604 	xfs_agblock_t		flcount)
605 {
606 	struct xfs_agf		*agf = agf_bp->b_addr;
607 
608 	ASSERT(flcount <= xfs_agfl_size(sc->mp));
609 
610 	/* Trigger fdblocks recalculation */
611 	xfs_force_summary_recalc(sc->mp);
612 
613 	/* Update the AGF counters. */
614 	if (xfs_perag_initialised_agf(sc->sa.pag)) {
615 		sc->sa.pag->pagf_flcount = flcount;
616 		clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET,
617 				&sc->sa.pag->pag_opstate);
618 	}
619 	agf->agf_flfirst = cpu_to_be32(0);
620 	agf->agf_flcount = cpu_to_be32(flcount);
621 	if (flcount)
622 		agf->agf_fllast = cpu_to_be32(flcount - 1);
623 	else
624 		agf->agf_fllast = cpu_to_be32(xfs_agfl_size(sc->mp) - 1);
625 
626 	xfs_alloc_log_agf(sc->tp, agf_bp,
627 			XFS_AGF_FLFIRST | XFS_AGF_FLLAST | XFS_AGF_FLCOUNT);
628 }
629 
630 struct xrep_agfl_fill {
631 	struct xagb_bitmap	used_extents;
632 	struct xfs_scrub	*sc;
633 	__be32			*agfl_bno;
634 	xfs_agblock_t		flcount;
635 	unsigned int		fl_off;
636 };
637 
638 /* Fill the AGFL with whatever blocks are in this extent. */
639 static int
640 xrep_agfl_fill(
641 	uint32_t		start,
642 	uint32_t		len,
643 	void			*priv)
644 {
645 	struct xrep_agfl_fill	*af = priv;
646 	struct xfs_scrub	*sc = af->sc;
647 	xfs_agblock_t		agbno = start;
648 	int			error;
649 
650 	trace_xrep_agfl_insert(pag_group(sc->sa.pag), agbno, len);
651 
652 	while (agbno < start + len && af->fl_off < af->flcount)
653 		af->agfl_bno[af->fl_off++] = cpu_to_be32(agbno++);
654 
655 	error = xagb_bitmap_set(&af->used_extents, start, agbno - 1);
656 	if (error)
657 		return error;
658 
659 	if (af->fl_off == af->flcount)
660 		return -ECANCELED;
661 
662 	return 0;
663 }
664 
665 /* Write out a totally new AGFL. */
666 STATIC int
667 xrep_agfl_init_header(
668 	struct xfs_scrub	*sc,
669 	struct xfs_buf		*agfl_bp,
670 	struct xagb_bitmap	*agfl_extents,
671 	xfs_agblock_t		flcount)
672 {
673 	struct xrep_agfl_fill	af = {
674 		.sc		= sc,
675 		.flcount	= flcount,
676 	};
677 	struct xfs_mount	*mp = sc->mp;
678 	struct xfs_agfl		*agfl;
679 	int			error;
680 
681 	ASSERT(flcount <= xfs_agfl_size(mp));
682 
683 	/*
684 	 * Start rewriting the header by setting the bno[] array to
685 	 * NULLAGBLOCK, then setting AGFL header fields.
686 	 */
687 	agfl = XFS_BUF_TO_AGFL(agfl_bp);
688 	memset(agfl, 0xFF, BBTOB(agfl_bp->b_length));
689 	agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
690 	agfl->agfl_seqno = cpu_to_be32(pag_agno(sc->sa.pag));
691 	uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid);
692 
693 	/*
694 	 * Fill the AGFL with the remaining blocks.  If agfl_extents has more
695 	 * blocks than fit in the AGFL, they will be freed in a subsequent
696 	 * step.
697 	 */
698 	xagb_bitmap_init(&af.used_extents);
699 	af.agfl_bno = xfs_buf_to_agfl_bno(agfl_bp);
700 	xagb_bitmap_walk(agfl_extents, xrep_agfl_fill, &af);
701 	error = xagb_bitmap_disunion(agfl_extents, &af.used_extents);
702 	if (error)
703 		return error;
704 
705 	/* Write new AGFL to disk. */
706 	xfs_trans_buf_set_type(sc->tp, agfl_bp, XFS_BLFT_AGFL_BUF);
707 	xfs_trans_log_buf(sc->tp, agfl_bp, 0, BBTOB(agfl_bp->b_length) - 1);
708 	xagb_bitmap_destroy(&af.used_extents);
709 	return 0;
710 }
711 
712 /* Repair the AGFL. */
713 int
714 xrep_agfl(
715 	struct xfs_scrub	*sc)
716 {
717 	struct xagb_bitmap	agfl_extents;
718 	struct xfs_mount	*mp = sc->mp;
719 	struct xfs_buf		*agf_bp;
720 	struct xfs_buf		*agfl_bp;
721 	xfs_agblock_t		flcount;
722 	int			error;
723 
724 	/* We require the rmapbt to rebuild anything. */
725 	if (!xfs_has_rmapbt(mp))
726 		return -EOPNOTSUPP;
727 
728 	xagb_bitmap_init(&agfl_extents);
729 
730 	/*
731 	 * Read the AGF so that we can query the rmapbt.  We hope that there's
732 	 * nothing wrong with the AGF, but all the AG header repair functions
733 	 * have this chicken-and-egg problem.
734 	 */
735 	error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp);
736 	if (error)
737 		return error;
738 
739 	/*
740 	 * Make sure we have the AGFL buffer, as scrub might have decided it
741 	 * was corrupt after xfs_alloc_read_agfl failed with -EFSCORRUPTED.
742 	 */
743 	error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
744 			XFS_AG_DADDR(mp, pag_agno(sc->sa.pag),
745 						XFS_AGFL_DADDR(mp)),
746 			XFS_FSS_TO_BB(mp, 1), 0, &agfl_bp, NULL);
747 	if (error)
748 		return error;
749 	agfl_bp->b_ops = &xfs_agfl_buf_ops;
750 
751 	/* Gather all the extents we're going to put on the new AGFL. */
752 	error = xrep_agfl_collect_blocks(sc, agf_bp, &agfl_extents, &flcount);
753 	if (error)
754 		goto err;
755 
756 	/* Last chance to abort before we start committing fixes. */
757 	if (xchk_should_terminate(sc, &error))
758 		goto err;
759 
760 	/*
761 	 * Update AGF and AGFL.  We reset the global free block counter when
762 	 * we adjust the AGF flcount (which can fail) so avoid updating any
763 	 * buffers until we know that part works.
764 	 */
765 	xrep_agfl_update_agf(sc, agf_bp, flcount);
766 	error = xrep_agfl_init_header(sc, agfl_bp, &agfl_extents, flcount);
767 	if (error)
768 		goto err;
769 
770 	/*
771 	 * Ok, the AGFL should be ready to go now.  Roll the transaction to
772 	 * make the new AGFL permanent before we start using it to return
773 	 * freespace overflow to the freespace btrees.
774 	 */
775 	sc->sa.agf_bp = agf_bp;
776 	error = xrep_roll_ag_trans(sc);
777 	if (error)
778 		goto err;
779 
780 	/* Dump any AGFL overflow. */
781 	error = xrep_reap_agblocks(sc, &agfl_extents, &XFS_RMAP_OINFO_AG,
782 			XFS_AG_RESV_AGFL);
783 	if (error)
784 		goto err;
785 
786 err:
787 	xagb_bitmap_destroy(&agfl_extents);
788 	return error;
789 }
790 
791 /* AGI */
792 
793 /*
794  * Offset within the xrep_find_ag_btree array for each btree type.  Avoid the
795  * XFS_BTNUM_ names here to avoid creating a sparse array.
796  */
797 enum {
798 	XREP_AGI_INOBT = 0,
799 	XREP_AGI_FINOBT,
800 	XREP_AGI_END,
801 	XREP_AGI_MAX
802 };
803 
804 #define XREP_AGI_LOOKUP_BATCH		32
805 
806 struct xrep_agi {
807 	struct xfs_scrub		*sc;
808 
809 	/* AGI buffer, tracked separately */
810 	struct xfs_buf			*agi_bp;
811 
812 	/* context for finding btree roots */
813 	struct xrep_find_ag_btree	fab[XREP_AGI_MAX];
814 
815 	/* old AGI contents in case we have to revert */
816 	struct xfs_agi			old_agi;
817 
818 	/* bitmap of which inodes are unlinked */
819 	struct xagino_bitmap		iunlink_bmp;
820 
821 	/* heads of the unlinked inode bucket lists */
822 	xfs_agino_t			iunlink_heads[XFS_AGI_UNLINKED_BUCKETS];
823 
824 	/* scratchpad for batched lookups of the radix tree */
825 	struct xfs_inode		*lookup_batch[XREP_AGI_LOOKUP_BATCH];
826 
827 	/* Map of ino -> next_ino for unlinked inode processing. */
828 	struct xfarray			*iunlink_next;
829 
830 	/* Map of ino -> prev_ino for unlinked inode processing. */
831 	struct xfarray			*iunlink_prev;
832 };
833 
834 static void
835 xrep_agi_buf_cleanup(
836 	void		*buf)
837 {
838 	struct xrep_agi	*ragi = buf;
839 
840 	if (ragi->iunlink_prev)
841 		xfarray_destroy(ragi->iunlink_prev);
842 	ragi->iunlink_prev = NULL;
843 	if (ragi->iunlink_next)
844 		xfarray_destroy(ragi->iunlink_next);
845 	ragi->iunlink_next = NULL;
846 	xagino_bitmap_destroy(&ragi->iunlink_bmp);
847 }
848 
849 /*
850  * Given the inode btree roots described by *fab, find the roots, check them
851  * for sanity, and pass the root data back out via *fab.
852  */
853 STATIC int
854 xrep_agi_find_btrees(
855 	struct xrep_agi			*ragi)
856 {
857 	struct xfs_scrub		*sc = ragi->sc;
858 	struct xrep_find_ag_btree	*fab = ragi->fab;
859 	struct xfs_buf			*agf_bp;
860 	struct xfs_mount		*mp = sc->mp;
861 	int				error;
862 
863 	/* Read the AGF. */
864 	error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp);
865 	if (error)
866 		return error;
867 
868 	/* Find the btree roots. */
869 	error = xrep_find_ag_btree_roots(sc, agf_bp, fab, NULL);
870 	if (error)
871 		return error;
872 
873 	/* We must find the inobt root. */
874 	if (!xrep_check_btree_root(sc, &fab[XREP_AGI_INOBT]))
875 		return -EFSCORRUPTED;
876 
877 	/* We must find the finobt root if that feature is enabled. */
878 	if (xfs_has_finobt(mp) &&
879 	    !xrep_check_btree_root(sc, &fab[XREP_AGI_FINOBT]))
880 		return -EFSCORRUPTED;
881 
882 	return 0;
883 }
884 
885 /*
886  * Reinitialize the AGI header, making an in-core copy of the old contents so
887  * that we know which in-core state needs to be reinitialized.
888  */
889 STATIC void
890 xrep_agi_init_header(
891 	struct xrep_agi		*ragi)
892 {
893 	struct xfs_scrub	*sc = ragi->sc;
894 	struct xfs_buf		*agi_bp = ragi->agi_bp;
895 	struct xfs_agi		*old_agi = &ragi->old_agi;
896 	struct xfs_agi		*agi = agi_bp->b_addr;
897 	struct xfs_perag	*pag = sc->sa.pag;
898 	struct xfs_mount	*mp = sc->mp;
899 
900 	memcpy(old_agi, agi, sizeof(*old_agi));
901 	memset(agi, 0, BBTOB(agi_bp->b_length));
902 	agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
903 	agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
904 	agi->agi_seqno = cpu_to_be32(pag_agno(pag));
905 	agi->agi_length = cpu_to_be32(pag_group(pag)->xg_block_count);
906 	agi->agi_newino = cpu_to_be32(NULLAGINO);
907 	agi->agi_dirino = cpu_to_be32(NULLAGINO);
908 	if (xfs_has_crc(mp))
909 		uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid);
910 
911 	/* Mark the incore AGF data stale until we're done fixing things. */
912 	ASSERT(xfs_perag_initialised_agi(pag));
913 	clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
914 }
915 
916 /* Set btree root information in an AGI. */
917 STATIC void
918 xrep_agi_set_roots(
919 	struct xrep_agi			*ragi)
920 {
921 	struct xfs_scrub		*sc = ragi->sc;
922 	struct xfs_agi			*agi = ragi->agi_bp->b_addr;
923 	struct xrep_find_ag_btree	*fab = ragi->fab;
924 
925 	agi->agi_root = cpu_to_be32(fab[XREP_AGI_INOBT].root);
926 	agi->agi_level = cpu_to_be32(fab[XREP_AGI_INOBT].height);
927 
928 	if (xfs_has_finobt(sc->mp)) {
929 		agi->agi_free_root = cpu_to_be32(fab[XREP_AGI_FINOBT].root);
930 		agi->agi_free_level = cpu_to_be32(fab[XREP_AGI_FINOBT].height);
931 	}
932 }
933 
934 /* Update the AGI counters. */
935 STATIC int
936 xrep_agi_calc_from_btrees(
937 	struct xrep_agi		*ragi)
938 {
939 	struct xfs_scrub	*sc = ragi->sc;
940 	struct xfs_buf		*agi_bp = ragi->agi_bp;
941 	struct xfs_btree_cur	*cur;
942 	struct xfs_agi		*agi = agi_bp->b_addr;
943 	struct xfs_mount	*mp = sc->mp;
944 	xfs_agino_t		count;
945 	xfs_agino_t		freecount;
946 	int			error;
947 
948 	cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp);
949 	error = xfs_ialloc_count_inodes(cur, &count, &freecount);
950 	if (error)
951 		goto err;
952 	if (xfs_has_inobtcounts(mp)) {
953 		xfs_filblks_t	blocks;
954 
955 		error = xfs_btree_count_blocks(cur, &blocks);
956 		if (error)
957 			goto err;
958 		agi->agi_iblocks = cpu_to_be32(blocks);
959 	}
960 	xfs_btree_del_cursor(cur, error);
961 
962 	agi->agi_count = cpu_to_be32(count);
963 	agi->agi_freecount = cpu_to_be32(freecount);
964 
965 	if (xfs_has_finobt(mp) && xfs_has_inobtcounts(mp)) {
966 		xfs_filblks_t	blocks;
967 
968 		cur = xfs_finobt_init_cursor(sc->sa.pag, sc->tp, agi_bp);
969 		error = xfs_btree_count_blocks(cur, &blocks);
970 		if (error)
971 			goto err;
972 		xfs_btree_del_cursor(cur, error);
973 		agi->agi_fblocks = cpu_to_be32(blocks);
974 	}
975 
976 	return 0;
977 err:
978 	xfs_btree_del_cursor(cur, error);
979 	return error;
980 }
981 
982 /*
983  * Record a forwards unlinked chain pointer from agino -> next_agino in our
984  * staging information.
985  */
986 static inline int
987 xrep_iunlink_store_next(
988 	struct xrep_agi		*ragi,
989 	xfs_agino_t		agino,
990 	xfs_agino_t		next_agino)
991 {
992 	ASSERT(next_agino != 0);
993 
994 	return xfarray_store(ragi->iunlink_next, agino, &next_agino);
995 }
996 
997 /*
998  * Record a backwards unlinked chain pointer from prev_ino <- agino in our
999  * staging information.
1000  */
1001 static inline int
1002 xrep_iunlink_store_prev(
1003 	struct xrep_agi		*ragi,
1004 	xfs_agino_t		agino,
1005 	xfs_agino_t		prev_agino)
1006 {
1007 	ASSERT(prev_agino != 0);
1008 
1009 	return xfarray_store(ragi->iunlink_prev, agino, &prev_agino);
1010 }
1011 
1012 /*
1013  * Given an @agino, look up the next inode in the iunlink bucket.  Returns
1014  * NULLAGINO if we're at the end of the chain, 0 if @agino is not in memory
1015  * like it should be, or a per-AG inode number.
1016  */
1017 static inline xfs_agino_t
1018 xrep_iunlink_next(
1019 	struct xfs_scrub	*sc,
1020 	xfs_agino_t		agino)
1021 {
1022 	struct xfs_inode	*ip;
1023 
1024 	ip = xfs_iunlink_lookup(sc->sa.pag, agino);
1025 	if (!ip)
1026 		return 0;
1027 
1028 	return ip->i_next_unlinked;
1029 }
1030 
1031 /*
1032  * Load the inode @agino into memory, set its i_prev_unlinked, and drop the
1033  * inode so it can be inactivated.  Returns NULLAGINO if we're at the end of
1034  * the chain or if we should stop walking the chain due to corruption; or a
1035  * per-AG inode number.
1036  */
1037 STATIC xfs_agino_t
1038 xrep_iunlink_reload_next(
1039 	struct xrep_agi		*ragi,
1040 	xfs_agino_t		prev_agino,
1041 	xfs_agino_t		agino)
1042 {
1043 	struct xfs_scrub	*sc = ragi->sc;
1044 	struct xfs_inode	*ip;
1045 	xfs_agino_t		ret = NULLAGINO;
1046 	int			error;
1047 
1048 	error = xchk_iget(ragi->sc, xfs_agino_to_ino(sc->sa.pag, agino), &ip);
1049 	if (error)
1050 		return ret;
1051 
1052 	trace_xrep_iunlink_reload_next(ip, prev_agino);
1053 
1054 	/* If this is a linked inode, stop processing the chain. */
1055 	if (VFS_I(ip)->i_nlink != 0) {
1056 		xrep_iunlink_store_next(ragi, agino, NULLAGINO);
1057 		goto rele;
1058 	}
1059 
1060 	ip->i_prev_unlinked = prev_agino;
1061 	ret = ip->i_next_unlinked;
1062 
1063 	/*
1064 	 * Drop the inode reference that we just took.  We hold the AGI, so
1065 	 * this inode cannot move off the unlinked list and hence cannot be
1066 	 * reclaimed.
1067 	 */
1068 rele:
1069 	xchk_irele(sc, ip);
1070 	return ret;
1071 }
1072 
1073 /*
1074  * Walk an AGI unlinked bucket's list to load incore any unlinked inodes that
1075  * still existed at mount time.  This can happen if iunlink processing fails
1076  * during log recovery.
1077  */
1078 STATIC int
1079 xrep_iunlink_walk_ondisk_bucket(
1080 	struct xrep_agi		*ragi,
1081 	unsigned int		bucket)
1082 {
1083 	struct xfs_scrub	*sc = ragi->sc;
1084 	struct xfs_agi		*agi = sc->sa.agi_bp->b_addr;
1085 	xfs_agino_t		prev_agino = NULLAGINO;
1086 	xfs_agino_t		next_agino;
1087 	int			error = 0;
1088 
1089 	next_agino = be32_to_cpu(agi->agi_unlinked[bucket]);
1090 	while (next_agino != NULLAGINO) {
1091 		xfs_agino_t	agino = next_agino;
1092 
1093 		if (xchk_should_terminate(ragi->sc, &error))
1094 			return error;
1095 
1096 		trace_xrep_iunlink_walk_ondisk_bucket(sc->sa.pag, bucket,
1097 				prev_agino, agino);
1098 
1099 		if (bucket != agino % XFS_AGI_UNLINKED_BUCKETS)
1100 			break;
1101 
1102 		next_agino = xrep_iunlink_next(sc, agino);
1103 		if (!next_agino)
1104 			next_agino = xrep_iunlink_reload_next(ragi, prev_agino,
1105 					agino);
1106 
1107 		prev_agino = agino;
1108 	}
1109 
1110 	return 0;
1111 }
1112 
1113 /* Decide if this is an unlinked inode in this AG. */
1114 STATIC bool
1115 xrep_iunlink_igrab(
1116 	struct xfs_perag	*pag,
1117 	struct xfs_inode	*ip)
1118 {
1119 	struct xfs_mount	*mp = pag_mount(pag);
1120 
1121 	if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag_agno(pag))
1122 		return false;
1123 
1124 	if (!xfs_inode_on_unlinked_list(ip))
1125 		return false;
1126 
1127 	return true;
1128 }
1129 
1130 /*
1131  * Mark the given inode in the lookup batch in our unlinked inode bitmap, and
1132  * remember if this inode is the start of the unlinked chain.
1133  */
1134 STATIC int
1135 xrep_iunlink_visit(
1136 	struct xrep_agi		*ragi,
1137 	unsigned int		batch_idx)
1138 {
1139 	struct xfs_mount	*mp = ragi->sc->mp;
1140 	struct xfs_inode	*ip = ragi->lookup_batch[batch_idx];
1141 	xfs_agino_t		agino;
1142 	unsigned int		bucket;
1143 	int			error;
1144 
1145 	ASSERT(XFS_INO_TO_AGNO(mp, ip->i_ino) == pag_agno(ragi->sc->sa.pag));
1146 	ASSERT(xfs_inode_on_unlinked_list(ip));
1147 
1148 	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1149 	bucket = agino % XFS_AGI_UNLINKED_BUCKETS;
1150 
1151 	trace_xrep_iunlink_visit(ragi->sc->sa.pag, bucket,
1152 			ragi->iunlink_heads[bucket], ip);
1153 
1154 	error = xagino_bitmap_set(&ragi->iunlink_bmp, agino, 1);
1155 	if (error)
1156 		return error;
1157 
1158 	if (ip->i_prev_unlinked == NULLAGINO) {
1159 		if (ragi->iunlink_heads[bucket] == NULLAGINO)
1160 			ragi->iunlink_heads[bucket] = agino;
1161 	}
1162 
1163 	return 0;
1164 }
1165 
1166 /*
1167  * Find all incore unlinked inodes so that we can rebuild the unlinked buckets.
1168  * We hold the AGI so there should not be any modifications to the unlinked
1169  * list.
1170  */
1171 STATIC int
1172 xrep_iunlink_mark_incore(
1173 	struct xrep_agi		*ragi)
1174 {
1175 	struct xfs_perag	*pag = ragi->sc->sa.pag;
1176 	struct xfs_mount	*mp = pag_mount(pag);
1177 	uint32_t		first_index = 0;
1178 	bool			done = false;
1179 	unsigned int		nr_found = 0;
1180 
1181 	do {
1182 		unsigned int	i;
1183 		int		error = 0;
1184 
1185 		if (xchk_should_terminate(ragi->sc, &error))
1186 			return error;
1187 
1188 		rcu_read_lock();
1189 
1190 		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
1191 				(void **)&ragi->lookup_batch, first_index,
1192 				XREP_AGI_LOOKUP_BATCH);
1193 		if (!nr_found) {
1194 			rcu_read_unlock();
1195 			return 0;
1196 		}
1197 
1198 		for (i = 0; i < nr_found; i++) {
1199 			struct xfs_inode *ip = ragi->lookup_batch[i];
1200 
1201 			if (done || !xrep_iunlink_igrab(pag, ip))
1202 				ragi->lookup_batch[i] = NULL;
1203 
1204 			/*
1205 			 * Update the index for the next lookup. Catch
1206 			 * overflows into the next AG range which can occur if
1207 			 * we have inodes in the last block of the AG and we
1208 			 * are currently pointing to the last inode.
1209 			 *
1210 			 * Because we may see inodes that are from the wrong AG
1211 			 * due to RCU freeing and reallocation, only update the
1212 			 * index if it lies in this AG. It was a race that lead
1213 			 * us to see this inode, so another lookup from the
1214 			 * same index will not find it again.
1215 			 */
1216 			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag_agno(pag))
1217 				continue;
1218 			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
1219 			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
1220 				done = true;
1221 		}
1222 
1223 		/* unlock now we've grabbed the inodes. */
1224 		rcu_read_unlock();
1225 
1226 		for (i = 0; i < nr_found; i++) {
1227 			if (!ragi->lookup_batch[i])
1228 				continue;
1229 			error = xrep_iunlink_visit(ragi, i);
1230 			if (error)
1231 				return error;
1232 		}
1233 	} while (!done);
1234 
1235 	return 0;
1236 }
1237 
1238 /* Mark all the unlinked ondisk inodes in this inobt record in iunlink_bmp. */
1239 STATIC int
1240 xrep_iunlink_mark_ondisk_rec(
1241 	struct xfs_btree_cur		*cur,
1242 	const union xfs_btree_rec	*rec,
1243 	void				*priv)
1244 {
1245 	struct xfs_inobt_rec_incore	irec;
1246 	struct xrep_agi			*ragi = priv;
1247 	struct xfs_scrub		*sc = ragi->sc;
1248 	struct xfs_mount		*mp = cur->bc_mp;
1249 	xfs_agino_t			agino;
1250 	unsigned int			i;
1251 	int				error = 0;
1252 
1253 	xfs_inobt_btrec_to_irec(mp, rec, &irec);
1254 
1255 	for (i = 0, agino = irec.ir_startino;
1256 	     i < XFS_INODES_PER_CHUNK;
1257 	     i++, agino++) {
1258 		struct xfs_inode	*ip;
1259 		unsigned int		len = 1;
1260 
1261 		/* Skip free inodes */
1262 		if (XFS_INOBT_MASK(i) & irec.ir_free)
1263 			continue;
1264 		/* Skip inodes we've seen before */
1265 		if (xagino_bitmap_test(&ragi->iunlink_bmp, agino, &len))
1266 			continue;
1267 
1268 		/*
1269 		 * Skip incore inodes; these were already picked up by
1270 		 * the _mark_incore step.
1271 		 */
1272 		rcu_read_lock();
1273 		ip = radix_tree_lookup(&sc->sa.pag->pag_ici_root, agino);
1274 		rcu_read_unlock();
1275 		if (ip)
1276 			continue;
1277 
1278 		/*
1279 		 * Try to look up this inode.  If we can't get it, just move
1280 		 * on because we haven't actually scrubbed the inobt or the
1281 		 * inodes yet.
1282 		 */
1283 		error = xchk_iget(ragi->sc, xfs_agino_to_ino(sc->sa.pag, agino),
1284 				&ip);
1285 		if (error)
1286 			continue;
1287 
1288 		trace_xrep_iunlink_reload_ondisk(ip);
1289 
1290 		if (VFS_I(ip)->i_nlink == 0)
1291 			error = xagino_bitmap_set(&ragi->iunlink_bmp, agino, 1);
1292 		xchk_irele(sc, ip);
1293 		if (error)
1294 			break;
1295 	}
1296 
1297 	return error;
1298 }
1299 
1300 /*
1301  * Find ondisk inodes that are unlinked and not in cache, and mark them in
1302  * iunlink_bmp.   We haven't checked the inobt yet, so we don't error out if
1303  * the btree is corrupt.
1304  */
1305 STATIC void
1306 xrep_iunlink_mark_ondisk(
1307 	struct xrep_agi		*ragi)
1308 {
1309 	struct xfs_scrub	*sc = ragi->sc;
1310 	struct xfs_buf		*agi_bp = ragi->agi_bp;
1311 	struct xfs_btree_cur	*cur;
1312 	int			error;
1313 
1314 	cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp);
1315 	error = xfs_btree_query_all(cur, xrep_iunlink_mark_ondisk_rec, ragi);
1316 	xfs_btree_del_cursor(cur, error);
1317 }
1318 
1319 /*
1320  * Walk an iunlink bucket's inode list.  For each inode that should be on this
1321  * chain, clear its entry in in iunlink_bmp because it's ok and we don't need
1322  * to touch it further.
1323  */
1324 STATIC int
1325 xrep_iunlink_resolve_bucket(
1326 	struct xrep_agi		*ragi,
1327 	unsigned int		bucket)
1328 {
1329 	struct xfs_scrub	*sc = ragi->sc;
1330 	struct xfs_inode	*ip;
1331 	xfs_agino_t		prev_agino = NULLAGINO;
1332 	xfs_agino_t		next_agino = ragi->iunlink_heads[bucket];
1333 	int			error = 0;
1334 
1335 	while (next_agino != NULLAGINO) {
1336 		if (xchk_should_terminate(ragi->sc, &error))
1337 			return error;
1338 
1339 		/* Find the next inode in the chain. */
1340 		ip = xfs_iunlink_lookup(sc->sa.pag, next_agino);
1341 		if (!ip) {
1342 			/* Inode not incore?  Terminate the chain. */
1343 			trace_xrep_iunlink_resolve_uncached(sc->sa.pag,
1344 					bucket, prev_agino, next_agino);
1345 
1346 			next_agino = NULLAGINO;
1347 			break;
1348 		}
1349 
1350 		if (next_agino % XFS_AGI_UNLINKED_BUCKETS != bucket) {
1351 			/*
1352 			 * Inode is in the wrong bucket.  Advance the list,
1353 			 * but pretend we didn't see this inode.
1354 			 */
1355 			trace_xrep_iunlink_resolve_wronglist(sc->sa.pag,
1356 					bucket, prev_agino, next_agino);
1357 
1358 			next_agino = ip->i_next_unlinked;
1359 			continue;
1360 		}
1361 
1362 		if (!xfs_inode_on_unlinked_list(ip)) {
1363 			/*
1364 			 * Incore inode doesn't think this inode is on an
1365 			 * unlinked list.  This is probably because we reloaded
1366 			 * it from disk.  Advance the list, but pretend we
1367 			 * didn't see this inode; we'll fix that later.
1368 			 */
1369 			trace_xrep_iunlink_resolve_nolist(sc->sa.pag,
1370 					bucket, prev_agino, next_agino);
1371 			next_agino = ip->i_next_unlinked;
1372 			continue;
1373 		}
1374 
1375 		trace_xrep_iunlink_resolve_ok(sc->sa.pag, bucket, prev_agino,
1376 				next_agino);
1377 
1378 		/*
1379 		 * Otherwise, this inode's unlinked pointers are ok.  Clear it
1380 		 * from the unlinked bitmap since we're done with it, and make
1381 		 * sure the chain is still correct.
1382 		 */
1383 		error = xagino_bitmap_clear(&ragi->iunlink_bmp, next_agino, 1);
1384 		if (error)
1385 			return error;
1386 
1387 		/* Remember the previous inode's next pointer. */
1388 		if (prev_agino != NULLAGINO) {
1389 			error = xrep_iunlink_store_next(ragi, prev_agino,
1390 					next_agino);
1391 			if (error)
1392 				return error;
1393 		}
1394 
1395 		/* Remember this inode's previous pointer. */
1396 		error = xrep_iunlink_store_prev(ragi, next_agino, prev_agino);
1397 		if (error)
1398 			return error;
1399 
1400 		/* Advance the list and remember this inode. */
1401 		prev_agino = next_agino;
1402 		next_agino = ip->i_next_unlinked;
1403 	}
1404 
1405 	/* Update the previous inode's next pointer. */
1406 	if (prev_agino != NULLAGINO) {
1407 		error = xrep_iunlink_store_next(ragi, prev_agino, next_agino);
1408 		if (error)
1409 			return error;
1410 	}
1411 
1412 	return 0;
1413 }
1414 
1415 /* Reinsert this unlinked inode into the head of the staged bucket list. */
1416 STATIC int
1417 xrep_iunlink_add_to_bucket(
1418 	struct xrep_agi		*ragi,
1419 	xfs_agino_t		agino)
1420 {
1421 	xfs_agino_t		current_head;
1422 	unsigned int		bucket;
1423 	int			error;
1424 
1425 	bucket = agino % XFS_AGI_UNLINKED_BUCKETS;
1426 
1427 	/* Point this inode at the current head of the bucket list. */
1428 	current_head = ragi->iunlink_heads[bucket];
1429 
1430 	trace_xrep_iunlink_add_to_bucket(ragi->sc->sa.pag, bucket, agino,
1431 			current_head);
1432 
1433 	error = xrep_iunlink_store_next(ragi, agino, current_head);
1434 	if (error)
1435 		return error;
1436 
1437 	/* Remember the head inode's previous pointer. */
1438 	if (current_head != NULLAGINO) {
1439 		error = xrep_iunlink_store_prev(ragi, current_head, agino);
1440 		if (error)
1441 			return error;
1442 	}
1443 
1444 	ragi->iunlink_heads[bucket] = agino;
1445 	return 0;
1446 }
1447 
1448 /* Reinsert unlinked inodes into the staged iunlink buckets. */
1449 STATIC int
1450 xrep_iunlink_add_lost_inodes(
1451 	uint32_t		start,
1452 	uint32_t		len,
1453 	void			*priv)
1454 {
1455 	struct xrep_agi		*ragi = priv;
1456 	int			error;
1457 
1458 	for (; len > 0; start++, len--) {
1459 		error = xrep_iunlink_add_to_bucket(ragi, start);
1460 		if (error)
1461 			return error;
1462 	}
1463 
1464 	return 0;
1465 }
1466 
1467 /*
1468  * Figure out the iunlink bucket values and find inodes that need to be
1469  * reinserted into the list.
1470  */
1471 STATIC int
1472 xrep_iunlink_rebuild_buckets(
1473 	struct xrep_agi		*ragi)
1474 {
1475 	unsigned int		i;
1476 	int			error;
1477 
1478 	/*
1479 	 * Walk the ondisk AGI unlinked list to find inodes that are on the
1480 	 * list but aren't in memory.  This can happen if a past log recovery
1481 	 * tried to clear the iunlinked list but failed.  Our scan rebuilds the
1482 	 * unlinked list using incore inodes, so we must load and link them
1483 	 * properly.
1484 	 */
1485 	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
1486 		error = xrep_iunlink_walk_ondisk_bucket(ragi, i);
1487 		if (error)
1488 			return error;
1489 	}
1490 
1491 	/*
1492 	 * Record all the incore unlinked inodes in iunlink_bmp that we didn't
1493 	 * find by walking the ondisk iunlink buckets.  This shouldn't happen,
1494 	 * but we can't risk forgetting an inode somewhere.
1495 	 */
1496 	error = xrep_iunlink_mark_incore(ragi);
1497 	if (error)
1498 		return error;
1499 
1500 	/*
1501 	 * If there are ondisk inodes that are unlinked and are not been loaded
1502 	 * into cache, record them in iunlink_bmp.
1503 	 */
1504 	xrep_iunlink_mark_ondisk(ragi);
1505 
1506 	/*
1507 	 * Walk each iunlink bucket to (re)construct as much of the incore list
1508 	 * as would be correct.  For each inode that survives this step, mark
1509 	 * it clear in iunlink_bmp; we're done with those inodes.
1510 	 */
1511 	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
1512 		error = xrep_iunlink_resolve_bucket(ragi, i);
1513 		if (error)
1514 			return error;
1515 	}
1516 
1517 	/*
1518 	 * Any unlinked inodes that we didn't find through the bucket list
1519 	 * walk (or was ignored by the walk) must be inserted into the bucket
1520 	 * list.  Stage this in memory for now.
1521 	 */
1522 	return xagino_bitmap_walk(&ragi->iunlink_bmp,
1523 			xrep_iunlink_add_lost_inodes, ragi);
1524 }
1525 
1526 /* Update i_next_iunlinked for the inode @agino. */
1527 STATIC int
1528 xrep_iunlink_relink_next(
1529 	struct xrep_agi		*ragi,
1530 	xfarray_idx_t		idx,
1531 	xfs_agino_t		next_agino)
1532 {
1533 	struct xfs_scrub	*sc = ragi->sc;
1534 	struct xfs_perag	*pag = sc->sa.pag;
1535 	struct xfs_inode	*ip;
1536 	xfarray_idx_t		agino = idx - 1;
1537 	bool			want_rele = false;
1538 	int			error = 0;
1539 
1540 	ip = xfs_iunlink_lookup(pag, agino);
1541 	if (!ip) {
1542 		xfs_agino_t	prev_agino;
1543 
1544 		/*
1545 		 * No inode exists in cache.  Load it off the disk so that we
1546 		 * can reinsert it into the incore unlinked list.
1547 		 */
1548 		error = xchk_iget(sc, xfs_agino_to_ino(pag, agino), &ip);
1549 		if (error)
1550 			return -EFSCORRUPTED;
1551 
1552 		want_rele = true;
1553 
1554 		/* Set the backward pointer since this just came off disk. */
1555 		error = xfarray_load(ragi->iunlink_prev, agino, &prev_agino);
1556 		if (error)
1557 			goto out_rele;
1558 
1559 		trace_xrep_iunlink_relink_prev(ip, prev_agino);
1560 		ip->i_prev_unlinked = prev_agino;
1561 	}
1562 
1563 	/* Update the forward pointer. */
1564 	if (ip->i_next_unlinked != next_agino) {
1565 		error = xfs_iunlink_log_inode(sc->tp, ip, pag, next_agino);
1566 		if (error)
1567 			goto out_rele;
1568 
1569 		trace_xrep_iunlink_relink_next(ip, next_agino);
1570 		ip->i_next_unlinked = next_agino;
1571 	}
1572 
1573 out_rele:
1574 	/*
1575 	 * The iunlink lookup doesn't igrab because we hold the AGI buffer lock
1576 	 * and the inode cannot be reclaimed.  However, if we used iget to load
1577 	 * a missing inode, we must irele it here.
1578 	 */
1579 	if (want_rele)
1580 		xchk_irele(sc, ip);
1581 	return error;
1582 }
1583 
1584 /* Update i_prev_iunlinked for the inode @agino. */
1585 STATIC int
1586 xrep_iunlink_relink_prev(
1587 	struct xrep_agi		*ragi,
1588 	xfarray_idx_t		idx,
1589 	xfs_agino_t		prev_agino)
1590 {
1591 	struct xfs_scrub	*sc = ragi->sc;
1592 	struct xfs_perag	*pag = sc->sa.pag;
1593 	struct xfs_inode	*ip;
1594 	xfarray_idx_t		agino = idx - 1;
1595 	bool			want_rele = false;
1596 	int			error = 0;
1597 
1598 	ASSERT(prev_agino != 0);
1599 
1600 	ip = xfs_iunlink_lookup(pag, agino);
1601 	if (!ip) {
1602 		xfs_agino_t	next_agino;
1603 
1604 		/*
1605 		 * No inode exists in cache.  Load it off the disk so that we
1606 		 * can reinsert it into the incore unlinked list.
1607 		 */
1608 		error = xchk_iget(sc, xfs_agino_to_ino(pag, agino), &ip);
1609 		if (error)
1610 			return -EFSCORRUPTED;
1611 
1612 		want_rele = true;
1613 
1614 		/* Set the forward pointer since this just came off disk. */
1615 		error = xfarray_load(ragi->iunlink_prev, agino, &next_agino);
1616 		if (error)
1617 			goto out_rele;
1618 
1619 		error = xfs_iunlink_log_inode(sc->tp, ip, pag, next_agino);
1620 		if (error)
1621 			goto out_rele;
1622 
1623 		trace_xrep_iunlink_relink_next(ip, next_agino);
1624 		ip->i_next_unlinked = next_agino;
1625 	}
1626 
1627 	/* Update the backward pointer. */
1628 	if (ip->i_prev_unlinked != prev_agino) {
1629 		trace_xrep_iunlink_relink_prev(ip, prev_agino);
1630 		ip->i_prev_unlinked = prev_agino;
1631 	}
1632 
1633 out_rele:
1634 	/*
1635 	 * The iunlink lookup doesn't igrab because we hold the AGI buffer lock
1636 	 * and the inode cannot be reclaimed.  However, if we used iget to load
1637 	 * a missing inode, we must irele it here.
1638 	 */
1639 	if (want_rele)
1640 		xchk_irele(sc, ip);
1641 	return error;
1642 }
1643 
1644 /* Log all the iunlink updates we need to finish regenerating the AGI. */
1645 STATIC int
1646 xrep_iunlink_commit(
1647 	struct xrep_agi		*ragi)
1648 {
1649 	struct xfs_agi		*agi = ragi->agi_bp->b_addr;
1650 	xfarray_idx_t		idx = XFARRAY_CURSOR_INIT;
1651 	xfs_agino_t		agino;
1652 	unsigned int		i;
1653 	int			error;
1654 
1655 	/* Fix all the forward links */
1656 	while ((error = xfarray_iter(ragi->iunlink_next, &idx, &agino)) == 1) {
1657 		error = xrep_iunlink_relink_next(ragi, idx, agino);
1658 		if (error)
1659 			return error;
1660 	}
1661 
1662 	/* Fix all the back links */
1663 	idx = XFARRAY_CURSOR_INIT;
1664 	while ((error = xfarray_iter(ragi->iunlink_prev, &idx, &agino)) == 1) {
1665 		error = xrep_iunlink_relink_prev(ragi, idx, agino);
1666 		if (error)
1667 			return error;
1668 	}
1669 
1670 	/* Copy the staged iunlink buckets to the new AGI. */
1671 	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
1672 		trace_xrep_iunlink_commit_bucket(ragi->sc->sa.pag, i,
1673 				be32_to_cpu(ragi->old_agi.agi_unlinked[i]),
1674 				ragi->iunlink_heads[i]);
1675 
1676 		agi->agi_unlinked[i] = cpu_to_be32(ragi->iunlink_heads[i]);
1677 	}
1678 
1679 	return 0;
1680 }
1681 
1682 /* Trigger reinitialization of the in-core data. */
1683 STATIC int
1684 xrep_agi_commit_new(
1685 	struct xrep_agi		*ragi)
1686 {
1687 	struct xfs_scrub	*sc = ragi->sc;
1688 	struct xfs_buf		*agi_bp = ragi->agi_bp;
1689 	struct xfs_perag	*pag;
1690 	struct xfs_agi		*agi = agi_bp->b_addr;
1691 
1692 	/* Trigger inode count recalculation */
1693 	xfs_force_summary_recalc(sc->mp);
1694 
1695 	/* Write this to disk. */
1696 	xfs_trans_buf_set_type(sc->tp, agi_bp, XFS_BLFT_AGI_BUF);
1697 	xfs_trans_log_buf(sc->tp, agi_bp, 0, BBTOB(agi_bp->b_length) - 1);
1698 
1699 	/* Now reinitialize the in-core counters if necessary. */
1700 	pag = sc->sa.pag;
1701 	pag->pagi_count = be32_to_cpu(agi->agi_count);
1702 	pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
1703 	set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
1704 
1705 	return xrep_roll_ag_trans(sc);
1706 }
1707 
1708 /* Repair the AGI. */
1709 int
1710 xrep_agi(
1711 	struct xfs_scrub	*sc)
1712 {
1713 	struct xrep_agi		*ragi;
1714 	struct xfs_mount	*mp = sc->mp;
1715 	unsigned int		i;
1716 	int			error;
1717 
1718 	/* We require the rmapbt to rebuild anything. */
1719 	if (!xfs_has_rmapbt(mp))
1720 		return -EOPNOTSUPP;
1721 
1722 	sc->buf = kzalloc(sizeof(struct xrep_agi), XCHK_GFP_FLAGS);
1723 	if (!sc->buf)
1724 		return -ENOMEM;
1725 	ragi = sc->buf;
1726 	ragi->sc = sc;
1727 
1728 	ragi->fab[XREP_AGI_INOBT] = (struct xrep_find_ag_btree){
1729 		.rmap_owner	= XFS_RMAP_OWN_INOBT,
1730 		.buf_ops	= &xfs_inobt_buf_ops,
1731 		.maxlevels	= M_IGEO(sc->mp)->inobt_maxlevels,
1732 	};
1733 	ragi->fab[XREP_AGI_FINOBT] = (struct xrep_find_ag_btree){
1734 		.rmap_owner	= XFS_RMAP_OWN_INOBT,
1735 		.buf_ops	= &xfs_finobt_buf_ops,
1736 		.maxlevels	= M_IGEO(sc->mp)->inobt_maxlevels,
1737 	};
1738 	ragi->fab[XREP_AGI_END] = (struct xrep_find_ag_btree){
1739 		.buf_ops	= NULL,
1740 	};
1741 
1742 	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++)
1743 		ragi->iunlink_heads[i] = NULLAGINO;
1744 
1745 	xagino_bitmap_init(&ragi->iunlink_bmp);
1746 	sc->buf_cleanup = xrep_agi_buf_cleanup;
1747 
1748 	error = xfarray_create("iunlinked next pointers", 0,
1749 			sizeof(xfs_agino_t), &ragi->iunlink_next);
1750 	if (error)
1751 		return error;
1752 
1753 	error = xfarray_create("iunlinked prev pointers", 0,
1754 			sizeof(xfs_agino_t), &ragi->iunlink_prev);
1755 	if (error)
1756 		return error;
1757 
1758 	/*
1759 	 * Make sure we have the AGI buffer, as scrub might have decided it
1760 	 * was corrupt after xfs_ialloc_read_agi failed with -EFSCORRUPTED.
1761 	 */
1762 	error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
1763 			XFS_AG_DADDR(mp, pag_agno(sc->sa.pag),
1764 						XFS_AGI_DADDR(mp)),
1765 			XFS_FSS_TO_BB(mp, 1), 0, &ragi->agi_bp, NULL);
1766 	if (error)
1767 		return error;
1768 	ragi->agi_bp->b_ops = &xfs_agi_buf_ops;
1769 
1770 	/* Find the AGI btree roots. */
1771 	error = xrep_agi_find_btrees(ragi);
1772 	if (error)
1773 		return error;
1774 
1775 	error = xrep_iunlink_rebuild_buckets(ragi);
1776 	if (error)
1777 		return error;
1778 
1779 	/* Last chance to abort before we start committing fixes. */
1780 	if (xchk_should_terminate(sc, &error))
1781 		return error;
1782 
1783 	/* Start rewriting the header and implant the btrees we found. */
1784 	xrep_agi_init_header(ragi);
1785 	xrep_agi_set_roots(ragi);
1786 	error = xrep_agi_calc_from_btrees(ragi);
1787 	if (error)
1788 		goto out_revert;
1789 	error = xrep_iunlink_commit(ragi);
1790 	if (error)
1791 		goto out_revert;
1792 
1793 	/* Reinitialize in-core state. */
1794 	return xrep_agi_commit_new(ragi);
1795 
1796 out_revert:
1797 	/* Mark the incore AGI state stale and revert the AGI. */
1798 	clear_bit(XFS_AGSTATE_AGI_INIT, &sc->sa.pag->pag_opstate);
1799 	memcpy(ragi->agi_bp->b_addr, &ragi->old_agi, sizeof(struct xfs_agi));
1800 	return error;
1801 }
1802