xref: /linux/fs/xfs/scrub/agheader_repair.c (revision e18655cf35a5958fbf4ae9ca3ebf28871a3a1801)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_btree.h"
13 #include "xfs_log_format.h"
14 #include "xfs_trans.h"
15 #include "xfs_sb.h"
16 #include "xfs_alloc.h"
17 #include "xfs_alloc_btree.h"
18 #include "xfs_ialloc.h"
19 #include "xfs_ialloc_btree.h"
20 #include "xfs_rmap.h"
21 #include "xfs_rmap_btree.h"
22 #include "xfs_refcount_btree.h"
23 #include "xfs_ag.h"
24 #include "xfs_inode.h"
25 #include "xfs_iunlink_item.h"
26 #include "scrub/scrub.h"
27 #include "scrub/common.h"
28 #include "scrub/trace.h"
29 #include "scrub/repair.h"
30 #include "scrub/bitmap.h"
31 #include "scrub/agb_bitmap.h"
32 #include "scrub/agino_bitmap.h"
33 #include "scrub/reap.h"
34 #include "scrub/xfile.h"
35 #include "scrub/xfarray.h"
36 
37 /* Superblock */
38 
39 /* Repair the superblock. */
40 int
41 xrep_superblock(
42 	struct xfs_scrub	*sc)
43 {
44 	struct xfs_mount	*mp = sc->mp;
45 	struct xfs_buf		*bp;
46 	xfs_agnumber_t		agno;
47 	int			error;
48 
49 	/* Don't try to repair AG 0's sb; let xfs_repair deal with it. */
50 	agno = sc->sm->sm_agno;
51 	if (agno == 0)
52 		return -EOPNOTSUPP;
53 
54 	error = xfs_sb_get_secondary(mp, sc->tp, agno, &bp);
55 	if (error)
56 		return error;
57 
58 	/* Last chance to abort before we start committing fixes. */
59 	if (xchk_should_terminate(sc, &error))
60 		return error;
61 
62 	/* Copy AG 0's superblock to this one. */
63 	xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
64 	xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
65 
66 	/*
67 	 * Don't write out a secondary super with NEEDSREPAIR or log incompat
68 	 * features set, since both are ignored when set on a secondary.
69 	 */
70 	if (xfs_has_crc(mp)) {
71 		struct xfs_dsb		*sb = bp->b_addr;
72 
73 		sb->sb_features_incompat &=
74 				~cpu_to_be32(XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR);
75 		sb->sb_features_log_incompat = 0;
76 	}
77 
78 	/* Write this to disk. */
79 	xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF);
80 	xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1);
81 	return 0;
82 }
83 
84 /* AGF */
85 
86 struct xrep_agf_allocbt {
87 	struct xfs_scrub	*sc;
88 	xfs_agblock_t		freeblks;
89 	xfs_agblock_t		longest;
90 };
91 
92 /* Record free space shape information. */
93 STATIC int
94 xrep_agf_walk_allocbt(
95 	struct xfs_btree_cur		*cur,
96 	const struct xfs_alloc_rec_incore *rec,
97 	void				*priv)
98 {
99 	struct xrep_agf_allocbt		*raa = priv;
100 	int				error = 0;
101 
102 	if (xchk_should_terminate(raa->sc, &error))
103 		return error;
104 
105 	raa->freeblks += rec->ar_blockcount;
106 	if (rec->ar_blockcount > raa->longest)
107 		raa->longest = rec->ar_blockcount;
108 	return error;
109 }
110 
111 /* Does this AGFL block look sane? */
112 STATIC int
113 xrep_agf_check_agfl_block(
114 	struct xfs_mount	*mp,
115 	xfs_agblock_t		agbno,
116 	void			*priv)
117 {
118 	struct xfs_scrub	*sc = priv;
119 
120 	if (!xfs_verify_agbno(sc->sa.pag, agbno))
121 		return -EFSCORRUPTED;
122 	return 0;
123 }
124 
125 /*
126  * Offset within the xrep_find_ag_btree array for each btree type.  Avoid the
127  * XFS_BTNUM_ names here to avoid creating a sparse array.
128  */
129 enum {
130 	XREP_AGF_BNOBT = 0,
131 	XREP_AGF_CNTBT,
132 	XREP_AGF_RMAPBT,
133 	XREP_AGF_REFCOUNTBT,
134 	XREP_AGF_END,
135 	XREP_AGF_MAX
136 };
137 
138 /* Check a btree root candidate. */
139 static inline bool
140 xrep_check_btree_root(
141 	struct xfs_scrub		*sc,
142 	struct xrep_find_ag_btree	*fab)
143 {
144 	return xfs_verify_agbno(sc->sa.pag, fab->root) &&
145 	       fab->height <= fab->maxlevels;
146 }
147 
148 /*
149  * Given the btree roots described by *fab, find the roots, check them for
150  * sanity, and pass the root data back out via *fab.
151  *
152  * This is /also/ a chicken and egg problem because we have to use the rmapbt
153  * (rooted in the AGF) to find the btrees rooted in the AGF.  We also have no
154  * idea if the btrees make any sense.  If we hit obvious corruptions in those
155  * btrees we'll bail out.
156  */
157 STATIC int
158 xrep_agf_find_btrees(
159 	struct xfs_scrub		*sc,
160 	struct xfs_buf			*agf_bp,
161 	struct xrep_find_ag_btree	*fab,
162 	struct xfs_buf			*agfl_bp)
163 {
164 	struct xfs_agf			*old_agf = agf_bp->b_addr;
165 	int				error;
166 
167 	/* Go find the root data. */
168 	error = xrep_find_ag_btree_roots(sc, agf_bp, fab, agfl_bp);
169 	if (error)
170 		return error;
171 
172 	/* We must find the bnobt, cntbt, and rmapbt roots. */
173 	if (!xrep_check_btree_root(sc, &fab[XREP_AGF_BNOBT]) ||
174 	    !xrep_check_btree_root(sc, &fab[XREP_AGF_CNTBT]) ||
175 	    !xrep_check_btree_root(sc, &fab[XREP_AGF_RMAPBT]))
176 		return -EFSCORRUPTED;
177 
178 	/*
179 	 * We relied on the rmapbt to reconstruct the AGF.  If we get a
180 	 * different root then something's seriously wrong.
181 	 */
182 	if (fab[XREP_AGF_RMAPBT].root != be32_to_cpu(old_agf->agf_rmap_root))
183 		return -EFSCORRUPTED;
184 
185 	/* We must find the refcountbt root if that feature is enabled. */
186 	if (xfs_has_reflink(sc->mp) &&
187 	    !xrep_check_btree_root(sc, &fab[XREP_AGF_REFCOUNTBT]))
188 		return -EFSCORRUPTED;
189 
190 	return 0;
191 }
192 
193 /*
194  * Reinitialize the AGF header, making an in-core copy of the old contents so
195  * that we know which in-core state needs to be reinitialized.
196  */
197 STATIC void
198 xrep_agf_init_header(
199 	struct xfs_scrub	*sc,
200 	struct xfs_buf		*agf_bp,
201 	struct xfs_agf		*old_agf)
202 {
203 	struct xfs_mount	*mp = sc->mp;
204 	struct xfs_perag	*pag = sc->sa.pag;
205 	struct xfs_agf		*agf = agf_bp->b_addr;
206 
207 	memcpy(old_agf, agf, sizeof(*old_agf));
208 	memset(agf, 0, BBTOB(agf_bp->b_length));
209 	agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
210 	agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
211 	agf->agf_seqno = cpu_to_be32(pag_agno(pag));
212 	agf->agf_length = cpu_to_be32(pag_group(pag)->xg_block_count);
213 	agf->agf_flfirst = old_agf->agf_flfirst;
214 	agf->agf_fllast = old_agf->agf_fllast;
215 	agf->agf_flcount = old_agf->agf_flcount;
216 	if (xfs_has_crc(mp))
217 		uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid);
218 
219 	/* Mark the incore AGF data stale until we're done fixing things. */
220 	ASSERT(xfs_perag_initialised_agf(pag));
221 	clear_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
222 }
223 
224 /* Set btree root information in an AGF. */
225 STATIC void
226 xrep_agf_set_roots(
227 	struct xfs_scrub		*sc,
228 	struct xfs_agf			*agf,
229 	struct xrep_find_ag_btree	*fab)
230 {
231 	agf->agf_bno_root = cpu_to_be32(fab[XREP_AGF_BNOBT].root);
232 	agf->agf_bno_level = cpu_to_be32(fab[XREP_AGF_BNOBT].height);
233 
234 	agf->agf_cnt_root = cpu_to_be32(fab[XREP_AGF_CNTBT].root);
235 	agf->agf_cnt_level = cpu_to_be32(fab[XREP_AGF_CNTBT].height);
236 
237 	agf->agf_rmap_root = cpu_to_be32(fab[XREP_AGF_RMAPBT].root);
238 	agf->agf_rmap_level = cpu_to_be32(fab[XREP_AGF_RMAPBT].height);
239 
240 	if (xfs_has_reflink(sc->mp)) {
241 		agf->agf_refcount_root =
242 				cpu_to_be32(fab[XREP_AGF_REFCOUNTBT].root);
243 		agf->agf_refcount_level =
244 				cpu_to_be32(fab[XREP_AGF_REFCOUNTBT].height);
245 	}
246 }
247 
248 /* Update all AGF fields which derive from btree contents. */
249 STATIC int
250 xrep_agf_calc_from_btrees(
251 	struct xfs_scrub	*sc,
252 	struct xfs_buf		*agf_bp)
253 {
254 	struct xrep_agf_allocbt	raa = { .sc = sc };
255 	struct xfs_btree_cur	*cur = NULL;
256 	struct xfs_agf		*agf = agf_bp->b_addr;
257 	struct xfs_mount	*mp = sc->mp;
258 	xfs_agblock_t		btreeblks;
259 	xfs_agblock_t		blocks;
260 	int			error;
261 
262 	/* Update the AGF counters from the bnobt. */
263 	cur = xfs_bnobt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
264 	error = xfs_alloc_query_all(cur, xrep_agf_walk_allocbt, &raa);
265 	if (error)
266 		goto err;
267 	error = xfs_btree_count_blocks(cur, &blocks);
268 	if (error)
269 		goto err;
270 	xfs_btree_del_cursor(cur, error);
271 	btreeblks = blocks - 1;
272 	agf->agf_freeblks = cpu_to_be32(raa.freeblks);
273 	agf->agf_longest = cpu_to_be32(raa.longest);
274 
275 	/* Update the AGF counters from the cntbt. */
276 	cur = xfs_cntbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
277 	error = xfs_btree_count_blocks(cur, &blocks);
278 	if (error)
279 		goto err;
280 	xfs_btree_del_cursor(cur, error);
281 	btreeblks += blocks - 1;
282 
283 	/* Update the AGF counters from the rmapbt. */
284 	cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
285 	error = xfs_btree_count_blocks(cur, &blocks);
286 	if (error)
287 		goto err;
288 	xfs_btree_del_cursor(cur, error);
289 	agf->agf_rmap_blocks = cpu_to_be32(blocks);
290 	btreeblks += blocks - 1;
291 
292 	agf->agf_btreeblks = cpu_to_be32(btreeblks);
293 
294 	/* Update the AGF counters from the refcountbt. */
295 	if (xfs_has_reflink(mp)) {
296 		cur = xfs_refcountbt_init_cursor(mp, sc->tp, agf_bp,
297 				sc->sa.pag);
298 		error = xfs_btree_count_blocks(cur, &blocks);
299 		if (error)
300 			goto err;
301 		xfs_btree_del_cursor(cur, error);
302 		agf->agf_refcount_blocks = cpu_to_be32(blocks);
303 	}
304 
305 	return 0;
306 err:
307 	xfs_btree_del_cursor(cur, error);
308 	return error;
309 }
310 
311 /* Commit the new AGF and reinitialize the incore state. */
312 STATIC int
313 xrep_agf_commit_new(
314 	struct xfs_scrub	*sc,
315 	struct xfs_buf		*agf_bp)
316 {
317 	struct xfs_perag	*pag;
318 	struct xfs_agf		*agf = agf_bp->b_addr;
319 
320 	/* Trigger fdblocks recalculation */
321 	xfs_force_summary_recalc(sc->mp);
322 
323 	/* Write this to disk. */
324 	xfs_trans_buf_set_type(sc->tp, agf_bp, XFS_BLFT_AGF_BUF);
325 	xfs_trans_log_buf(sc->tp, agf_bp, 0, BBTOB(agf_bp->b_length) - 1);
326 
327 	/* Now reinitialize the in-core counters we changed. */
328 	pag = sc->sa.pag;
329 	pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
330 	pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
331 	pag->pagf_longest = be32_to_cpu(agf->agf_longest);
332 	pag->pagf_bno_level = be32_to_cpu(agf->agf_bno_level);
333 	pag->pagf_cnt_level = be32_to_cpu(agf->agf_cnt_level);
334 	pag->pagf_rmap_level = be32_to_cpu(agf->agf_rmap_level);
335 	pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
336 	set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
337 
338 	return xrep_roll_ag_trans(sc);
339 }
340 
341 /* Repair the AGF. v5 filesystems only. */
342 int
343 xrep_agf(
344 	struct xfs_scrub		*sc)
345 {
346 	struct xrep_find_ag_btree	fab[XREP_AGF_MAX] = {
347 		[XREP_AGF_BNOBT] = {
348 			.rmap_owner = XFS_RMAP_OWN_AG,
349 			.buf_ops = &xfs_bnobt_buf_ops,
350 			.maxlevels = sc->mp->m_alloc_maxlevels,
351 		},
352 		[XREP_AGF_CNTBT] = {
353 			.rmap_owner = XFS_RMAP_OWN_AG,
354 			.buf_ops = &xfs_cntbt_buf_ops,
355 			.maxlevels = sc->mp->m_alloc_maxlevels,
356 		},
357 		[XREP_AGF_RMAPBT] = {
358 			.rmap_owner = XFS_RMAP_OWN_AG,
359 			.buf_ops = &xfs_rmapbt_buf_ops,
360 			.maxlevels = sc->mp->m_rmap_maxlevels,
361 		},
362 		[XREP_AGF_REFCOUNTBT] = {
363 			.rmap_owner = XFS_RMAP_OWN_REFC,
364 			.buf_ops = &xfs_refcountbt_buf_ops,
365 			.maxlevels = sc->mp->m_refc_maxlevels,
366 		},
367 		[XREP_AGF_END] = {
368 			.buf_ops = NULL,
369 		},
370 	};
371 	struct xfs_agf			old_agf;
372 	struct xfs_mount		*mp = sc->mp;
373 	struct xfs_buf			*agf_bp;
374 	struct xfs_buf			*agfl_bp;
375 	struct xfs_agf			*agf;
376 	int				error;
377 
378 	/* We require the rmapbt to rebuild anything. */
379 	if (!xfs_has_rmapbt(mp))
380 		return -EOPNOTSUPP;
381 
382 	/*
383 	 * Make sure we have the AGF buffer, as scrub might have decided it
384 	 * was corrupt after xfs_alloc_read_agf failed with -EFSCORRUPTED.
385 	 */
386 	error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
387 			XFS_AG_DADDR(mp, pag_agno(sc->sa.pag),
388 						XFS_AGF_DADDR(mp)),
389 			XFS_FSS_TO_BB(mp, 1), 0, &agf_bp, NULL);
390 	if (error)
391 		return error;
392 	agf_bp->b_ops = &xfs_agf_buf_ops;
393 	agf = agf_bp->b_addr;
394 
395 	/*
396 	 * Load the AGFL so that we can screen out OWN_AG blocks that are on
397 	 * the AGFL now; these blocks might have once been part of the
398 	 * bno/cnt/rmap btrees but are not now.  This is a chicken and egg
399 	 * problem: the AGF is corrupt, so we have to trust the AGFL contents
400 	 * because we can't do any serious cross-referencing with any of the
401 	 * btrees rooted in the AGF.  If the AGFL contents are obviously bad
402 	 * then we'll bail out.
403 	 */
404 	error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
405 	if (error)
406 		return error;
407 
408 	/*
409 	 * Spot-check the AGFL blocks; if they're obviously corrupt then
410 	 * there's nothing we can do but bail out.
411 	 */
412 	error = xfs_agfl_walk(sc->mp, agf_bp->b_addr, agfl_bp,
413 			xrep_agf_check_agfl_block, sc);
414 	if (error)
415 		return error;
416 
417 	/*
418 	 * Find the AGF btree roots.  This is also a chicken-and-egg situation;
419 	 * see the function for more details.
420 	 */
421 	error = xrep_agf_find_btrees(sc, agf_bp, fab, agfl_bp);
422 	if (error)
423 		return error;
424 
425 	/* Last chance to abort before we start committing fixes. */
426 	if (xchk_should_terminate(sc, &error))
427 		return error;
428 
429 	/* Start rewriting the header and implant the btrees we found. */
430 	xrep_agf_init_header(sc, agf_bp, &old_agf);
431 	xrep_agf_set_roots(sc, agf, fab);
432 	error = xrep_agf_calc_from_btrees(sc, agf_bp);
433 	if (error)
434 		goto out_revert;
435 
436 	/* Commit the changes and reinitialize incore state. */
437 	return xrep_agf_commit_new(sc, agf_bp);
438 
439 out_revert:
440 	/* Mark the incore AGF state stale and revert the AGF. */
441 	clear_bit(XFS_AGSTATE_AGF_INIT, &sc->sa.pag->pag_opstate);
442 	memcpy(agf, &old_agf, sizeof(old_agf));
443 	return error;
444 }
445 
446 /* AGFL */
447 
448 struct xrep_agfl {
449 	/* Bitmap of alleged AGFL blocks that we're not going to add. */
450 	struct xagb_bitmap	crossed;
451 
452 	/* Bitmap of other OWN_AG metadata blocks. */
453 	struct xagb_bitmap	agmetablocks;
454 
455 	/* Bitmap of free space. */
456 	struct xagb_bitmap	*freesp;
457 
458 	/* rmapbt cursor for finding crosslinked blocks */
459 	struct xfs_btree_cur	*rmap_cur;
460 
461 	struct xfs_scrub	*sc;
462 };
463 
464 /* Record all OWN_AG (free space btree) information from the rmap data. */
465 STATIC int
466 xrep_agfl_walk_rmap(
467 	struct xfs_btree_cur	*cur,
468 	const struct xfs_rmap_irec *rec,
469 	void			*priv)
470 {
471 	struct xrep_agfl	*ra = priv;
472 	int			error = 0;
473 
474 	if (xchk_should_terminate(ra->sc, &error))
475 		return error;
476 
477 	/* Record all the OWN_AG blocks. */
478 	if (rec->rm_owner == XFS_RMAP_OWN_AG) {
479 		error = xagb_bitmap_set(ra->freesp, rec->rm_startblock,
480 				rec->rm_blockcount);
481 		if (error)
482 			return error;
483 	}
484 
485 	return xagb_bitmap_set_btcur_path(&ra->agmetablocks, cur);
486 }
487 
488 /* Strike out the blocks that are cross-linked according to the rmapbt. */
489 STATIC int
490 xrep_agfl_check_extent(
491 	uint32_t		agbno,
492 	uint32_t		len,
493 	void			*priv)
494 {
495 	struct xrep_agfl	*ra = priv;
496 	xfs_agblock_t		last_agbno = agbno + len - 1;
497 	int			error;
498 
499 	while (agbno <= last_agbno) {
500 		bool		other_owners;
501 
502 		error = xfs_rmap_has_other_keys(ra->rmap_cur, agbno, 1,
503 				&XFS_RMAP_OINFO_AG, &other_owners);
504 		if (error)
505 			return error;
506 
507 		if (other_owners) {
508 			error = xagb_bitmap_set(&ra->crossed, agbno, 1);
509 			if (error)
510 				return error;
511 		}
512 
513 		if (xchk_should_terminate(ra->sc, &error))
514 			return error;
515 		agbno++;
516 	}
517 
518 	return 0;
519 }
520 
521 /*
522  * Map out all the non-AGFL OWN_AG space in this AG so that we can deduce
523  * which blocks belong to the AGFL.
524  *
525  * Compute the set of old AGFL blocks by subtracting from the list of OWN_AG
526  * blocks the list of blocks owned by all other OWN_AG metadata (bnobt, cntbt,
527  * rmapbt).  These are the old AGFL blocks, so return that list and the number
528  * of blocks we're actually going to put back on the AGFL.
529  */
530 STATIC int
531 xrep_agfl_collect_blocks(
532 	struct xfs_scrub	*sc,
533 	struct xfs_buf		*agf_bp,
534 	struct xagb_bitmap	*agfl_extents,
535 	xfs_agblock_t		*flcount)
536 {
537 	struct xrep_agfl	ra;
538 	struct xfs_mount	*mp = sc->mp;
539 	struct xfs_btree_cur	*cur;
540 	int			error;
541 
542 	ra.sc = sc;
543 	ra.freesp = agfl_extents;
544 	xagb_bitmap_init(&ra.agmetablocks);
545 	xagb_bitmap_init(&ra.crossed);
546 
547 	/* Find all space used by the free space btrees & rmapbt. */
548 	cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
549 	error = xfs_rmap_query_all(cur, xrep_agfl_walk_rmap, &ra);
550 	xfs_btree_del_cursor(cur, error);
551 	if (error)
552 		goto out_bmp;
553 
554 	/* Find all blocks currently being used by the bnobt. */
555 	cur = xfs_bnobt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
556 	error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur);
557 	xfs_btree_del_cursor(cur, error);
558 	if (error)
559 		goto out_bmp;
560 
561 	/* Find all blocks currently being used by the cntbt. */
562 	cur = xfs_cntbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
563 	error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur);
564 	xfs_btree_del_cursor(cur, error);
565 	if (error)
566 		goto out_bmp;
567 
568 	/*
569 	 * Drop the freesp meta blocks that are in use by btrees.
570 	 * The remaining blocks /should/ be AGFL blocks.
571 	 */
572 	error = xagb_bitmap_disunion(agfl_extents, &ra.agmetablocks);
573 	if (error)
574 		goto out_bmp;
575 
576 	/* Strike out the blocks that are cross-linked. */
577 	ra.rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
578 	error = xagb_bitmap_walk(agfl_extents, xrep_agfl_check_extent, &ra);
579 	xfs_btree_del_cursor(ra.rmap_cur, error);
580 	if (error)
581 		goto out_bmp;
582 	error = xagb_bitmap_disunion(agfl_extents, &ra.crossed);
583 	if (error)
584 		goto out_bmp;
585 
586 	/*
587 	 * Calculate the new AGFL size.  If we found more blocks than fit in
588 	 * the AGFL we'll free them later.
589 	 */
590 	*flcount = min_t(uint64_t, xagb_bitmap_hweight(agfl_extents),
591 			 xfs_agfl_size(mp));
592 
593 out_bmp:
594 	xagb_bitmap_destroy(&ra.crossed);
595 	xagb_bitmap_destroy(&ra.agmetablocks);
596 	return error;
597 }
598 
599 /* Update the AGF and reset the in-core state. */
600 STATIC void
601 xrep_agfl_update_agf(
602 	struct xfs_scrub	*sc,
603 	struct xfs_buf		*agf_bp,
604 	xfs_agblock_t		flcount)
605 {
606 	struct xfs_agf		*agf = agf_bp->b_addr;
607 
608 	ASSERT(flcount <= xfs_agfl_size(sc->mp));
609 
610 	/* Trigger fdblocks recalculation */
611 	xfs_force_summary_recalc(sc->mp);
612 
613 	/* Update the AGF counters. */
614 	if (xfs_perag_initialised_agf(sc->sa.pag)) {
615 		sc->sa.pag->pagf_flcount = flcount;
616 		clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET,
617 				&sc->sa.pag->pag_opstate);
618 	}
619 	agf->agf_flfirst = cpu_to_be32(0);
620 	agf->agf_flcount = cpu_to_be32(flcount);
621 	if (flcount)
622 		agf->agf_fllast = cpu_to_be32(flcount - 1);
623 	else
624 		agf->agf_fllast = cpu_to_be32(xfs_agfl_size(sc->mp) - 1);
625 
626 	xfs_alloc_log_agf(sc->tp, agf_bp,
627 			XFS_AGF_FLFIRST | XFS_AGF_FLLAST | XFS_AGF_FLCOUNT);
628 }
629 
630 struct xrep_agfl_fill {
631 	struct xagb_bitmap	used_extents;
632 	struct xfs_scrub	*sc;
633 	__be32			*agfl_bno;
634 	xfs_agblock_t		flcount;
635 	unsigned int		fl_off;
636 };
637 
638 /* Fill the AGFL with whatever blocks are in this extent. */
639 static int
640 xrep_agfl_fill(
641 	uint32_t		start,
642 	uint32_t		len,
643 	void			*priv)
644 {
645 	struct xrep_agfl_fill	*af = priv;
646 	struct xfs_scrub	*sc = af->sc;
647 	xfs_agblock_t		agbno = start;
648 	int			error;
649 
650 	trace_xrep_agfl_insert(sc->sa.pag, agbno, len);
651 
652 	while (agbno < start + len && af->fl_off < af->flcount)
653 		af->agfl_bno[af->fl_off++] = cpu_to_be32(agbno++);
654 
655 	error = xagb_bitmap_set(&af->used_extents, start, agbno - 1);
656 	if (error)
657 		return error;
658 
659 	if (af->fl_off == af->flcount)
660 		return -ECANCELED;
661 
662 	return 0;
663 }
664 
665 /* Write out a totally new AGFL. */
666 STATIC int
667 xrep_agfl_init_header(
668 	struct xfs_scrub	*sc,
669 	struct xfs_buf		*agfl_bp,
670 	struct xagb_bitmap	*agfl_extents,
671 	xfs_agblock_t		flcount)
672 {
673 	struct xrep_agfl_fill	af = {
674 		.sc		= sc,
675 		.flcount	= flcount,
676 	};
677 	struct xfs_mount	*mp = sc->mp;
678 	struct xfs_agfl		*agfl;
679 	int			error;
680 
681 	ASSERT(flcount <= xfs_agfl_size(mp));
682 
683 	/*
684 	 * Start rewriting the header by setting the bno[] array to
685 	 * NULLAGBLOCK, then setting AGFL header fields.
686 	 */
687 	agfl = XFS_BUF_TO_AGFL(agfl_bp);
688 	memset(agfl, 0xFF, BBTOB(agfl_bp->b_length));
689 	agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
690 	agfl->agfl_seqno = cpu_to_be32(pag_agno(sc->sa.pag));
691 	uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid);
692 
693 	/*
694 	 * Fill the AGFL with the remaining blocks.  If agfl_extents has more
695 	 * blocks than fit in the AGFL, they will be freed in a subsequent
696 	 * step.
697 	 */
698 	xagb_bitmap_init(&af.used_extents);
699 	af.agfl_bno = xfs_buf_to_agfl_bno(agfl_bp);
700 	xagb_bitmap_walk(agfl_extents, xrep_agfl_fill, &af);
701 	error = xagb_bitmap_disunion(agfl_extents, &af.used_extents);
702 	if (error)
703 		return error;
704 
705 	/* Write new AGFL to disk. */
706 	xfs_trans_buf_set_type(sc->tp, agfl_bp, XFS_BLFT_AGFL_BUF);
707 	xfs_trans_log_buf(sc->tp, agfl_bp, 0, BBTOB(agfl_bp->b_length) - 1);
708 	xagb_bitmap_destroy(&af.used_extents);
709 	return 0;
710 }
711 
712 /* Repair the AGFL. */
713 int
714 xrep_agfl(
715 	struct xfs_scrub	*sc)
716 {
717 	struct xagb_bitmap	agfl_extents;
718 	struct xfs_mount	*mp = sc->mp;
719 	struct xfs_buf		*agf_bp;
720 	struct xfs_buf		*agfl_bp;
721 	xfs_agblock_t		flcount;
722 	int			error;
723 
724 	/* We require the rmapbt to rebuild anything. */
725 	if (!xfs_has_rmapbt(mp))
726 		return -EOPNOTSUPP;
727 
728 	xagb_bitmap_init(&agfl_extents);
729 
730 	/*
731 	 * Read the AGF so that we can query the rmapbt.  We hope that there's
732 	 * nothing wrong with the AGF, but all the AG header repair functions
733 	 * have this chicken-and-egg problem.
734 	 */
735 	error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp);
736 	if (error)
737 		return error;
738 
739 	/*
740 	 * Make sure we have the AGFL buffer, as scrub might have decided it
741 	 * was corrupt after xfs_alloc_read_agfl failed with -EFSCORRUPTED.
742 	 */
743 	error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
744 			XFS_AG_DADDR(mp, pag_agno(sc->sa.pag),
745 						XFS_AGFL_DADDR(mp)),
746 			XFS_FSS_TO_BB(mp, 1), 0, &agfl_bp, NULL);
747 	if (error)
748 		return error;
749 	agfl_bp->b_ops = &xfs_agfl_buf_ops;
750 
751 	/* Gather all the extents we're going to put on the new AGFL. */
752 	error = xrep_agfl_collect_blocks(sc, agf_bp, &agfl_extents, &flcount);
753 	if (error)
754 		goto err;
755 
756 	/* Last chance to abort before we start committing fixes. */
757 	if (xchk_should_terminate(sc, &error))
758 		goto err;
759 
760 	/*
761 	 * Update AGF and AGFL.  We reset the global free block counter when
762 	 * we adjust the AGF flcount (which can fail) so avoid updating any
763 	 * buffers until we know that part works.
764 	 */
765 	xrep_agfl_update_agf(sc, agf_bp, flcount);
766 	error = xrep_agfl_init_header(sc, agfl_bp, &agfl_extents, flcount);
767 	if (error)
768 		goto err;
769 
770 	/*
771 	 * Ok, the AGFL should be ready to go now.  Roll the transaction to
772 	 * make the new AGFL permanent before we start using it to return
773 	 * freespace overflow to the freespace btrees.
774 	 */
775 	sc->sa.agf_bp = agf_bp;
776 	error = xrep_roll_ag_trans(sc);
777 	if (error)
778 		goto err;
779 
780 	/* Dump any AGFL overflow. */
781 	error = xrep_reap_agblocks(sc, &agfl_extents, &XFS_RMAP_OINFO_AG,
782 			XFS_AG_RESV_AGFL);
783 	if (error)
784 		goto err;
785 
786 err:
787 	xagb_bitmap_destroy(&agfl_extents);
788 	return error;
789 }
790 
791 /* AGI */
792 
793 /*
794  * Offset within the xrep_find_ag_btree array for each btree type.  Avoid the
795  * XFS_BTNUM_ names here to avoid creating a sparse array.
796  */
797 enum {
798 	XREP_AGI_INOBT = 0,
799 	XREP_AGI_FINOBT,
800 	XREP_AGI_END,
801 	XREP_AGI_MAX
802 };
803 
804 #define XREP_AGI_LOOKUP_BATCH		32
805 
806 struct xrep_agi {
807 	struct xfs_scrub		*sc;
808 
809 	/* AGI buffer, tracked separately */
810 	struct xfs_buf			*agi_bp;
811 
812 	/* context for finding btree roots */
813 	struct xrep_find_ag_btree	fab[XREP_AGI_MAX];
814 
815 	/* old AGI contents in case we have to revert */
816 	struct xfs_agi			old_agi;
817 
818 	/* bitmap of which inodes are unlinked */
819 	struct xagino_bitmap		iunlink_bmp;
820 
821 	/* heads of the unlinked inode bucket lists */
822 	xfs_agino_t			iunlink_heads[XFS_AGI_UNLINKED_BUCKETS];
823 
824 	/* scratchpad for batched lookups of the radix tree */
825 	struct xfs_inode		*lookup_batch[XREP_AGI_LOOKUP_BATCH];
826 
827 	/* Map of ino -> next_ino for unlinked inode processing. */
828 	struct xfarray			*iunlink_next;
829 
830 	/* Map of ino -> prev_ino for unlinked inode processing. */
831 	struct xfarray			*iunlink_prev;
832 };
833 
834 static void
835 xrep_agi_buf_cleanup(
836 	void		*buf)
837 {
838 	struct xrep_agi	*ragi = buf;
839 
840 	xfarray_destroy(ragi->iunlink_prev);
841 	xfarray_destroy(ragi->iunlink_next);
842 	xagino_bitmap_destroy(&ragi->iunlink_bmp);
843 }
844 
845 /*
846  * Given the inode btree roots described by *fab, find the roots, check them
847  * for sanity, and pass the root data back out via *fab.
848  */
849 STATIC int
850 xrep_agi_find_btrees(
851 	struct xrep_agi			*ragi)
852 {
853 	struct xfs_scrub		*sc = ragi->sc;
854 	struct xrep_find_ag_btree	*fab = ragi->fab;
855 	struct xfs_buf			*agf_bp;
856 	struct xfs_mount		*mp = sc->mp;
857 	int				error;
858 
859 	/* Read the AGF. */
860 	error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp);
861 	if (error)
862 		return error;
863 
864 	/* Find the btree roots. */
865 	error = xrep_find_ag_btree_roots(sc, agf_bp, fab, NULL);
866 	if (error)
867 		return error;
868 
869 	/* We must find the inobt root. */
870 	if (!xrep_check_btree_root(sc, &fab[XREP_AGI_INOBT]))
871 		return -EFSCORRUPTED;
872 
873 	/* We must find the finobt root if that feature is enabled. */
874 	if (xfs_has_finobt(mp) &&
875 	    !xrep_check_btree_root(sc, &fab[XREP_AGI_FINOBT]))
876 		return -EFSCORRUPTED;
877 
878 	return 0;
879 }
880 
881 /*
882  * Reinitialize the AGI header, making an in-core copy of the old contents so
883  * that we know which in-core state needs to be reinitialized.
884  */
885 STATIC void
886 xrep_agi_init_header(
887 	struct xrep_agi		*ragi)
888 {
889 	struct xfs_scrub	*sc = ragi->sc;
890 	struct xfs_buf		*agi_bp = ragi->agi_bp;
891 	struct xfs_agi		*old_agi = &ragi->old_agi;
892 	struct xfs_agi		*agi = agi_bp->b_addr;
893 	struct xfs_perag	*pag = sc->sa.pag;
894 	struct xfs_mount	*mp = sc->mp;
895 
896 	memcpy(old_agi, agi, sizeof(*old_agi));
897 	memset(agi, 0, BBTOB(agi_bp->b_length));
898 	agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
899 	agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
900 	agi->agi_seqno = cpu_to_be32(pag_agno(pag));
901 	agi->agi_length = cpu_to_be32(pag_group(pag)->xg_block_count);
902 	agi->agi_newino = cpu_to_be32(NULLAGINO);
903 	agi->agi_dirino = cpu_to_be32(NULLAGINO);
904 	if (xfs_has_crc(mp))
905 		uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid);
906 
907 	/* Mark the incore AGF data stale until we're done fixing things. */
908 	ASSERT(xfs_perag_initialised_agi(pag));
909 	clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
910 }
911 
912 /* Set btree root information in an AGI. */
913 STATIC void
914 xrep_agi_set_roots(
915 	struct xrep_agi			*ragi)
916 {
917 	struct xfs_scrub		*sc = ragi->sc;
918 	struct xfs_agi			*agi = ragi->agi_bp->b_addr;
919 	struct xrep_find_ag_btree	*fab = ragi->fab;
920 
921 	agi->agi_root = cpu_to_be32(fab[XREP_AGI_INOBT].root);
922 	agi->agi_level = cpu_to_be32(fab[XREP_AGI_INOBT].height);
923 
924 	if (xfs_has_finobt(sc->mp)) {
925 		agi->agi_free_root = cpu_to_be32(fab[XREP_AGI_FINOBT].root);
926 		agi->agi_free_level = cpu_to_be32(fab[XREP_AGI_FINOBT].height);
927 	}
928 }
929 
930 /* Update the AGI counters. */
931 STATIC int
932 xrep_agi_calc_from_btrees(
933 	struct xrep_agi		*ragi)
934 {
935 	struct xfs_scrub	*sc = ragi->sc;
936 	struct xfs_buf		*agi_bp = ragi->agi_bp;
937 	struct xfs_btree_cur	*cur;
938 	struct xfs_agi		*agi = agi_bp->b_addr;
939 	struct xfs_mount	*mp = sc->mp;
940 	xfs_agino_t		count;
941 	xfs_agino_t		freecount;
942 	int			error;
943 
944 	cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp);
945 	error = xfs_ialloc_count_inodes(cur, &count, &freecount);
946 	if (error)
947 		goto err;
948 	if (xfs_has_inobtcounts(mp)) {
949 		xfs_agblock_t	blocks;
950 
951 		error = xfs_btree_count_blocks(cur, &blocks);
952 		if (error)
953 			goto err;
954 		agi->agi_iblocks = cpu_to_be32(blocks);
955 	}
956 	xfs_btree_del_cursor(cur, error);
957 
958 	agi->agi_count = cpu_to_be32(count);
959 	agi->agi_freecount = cpu_to_be32(freecount);
960 
961 	if (xfs_has_finobt(mp) && xfs_has_inobtcounts(mp)) {
962 		xfs_agblock_t	blocks;
963 
964 		cur = xfs_finobt_init_cursor(sc->sa.pag, sc->tp, agi_bp);
965 		error = xfs_btree_count_blocks(cur, &blocks);
966 		if (error)
967 			goto err;
968 		xfs_btree_del_cursor(cur, error);
969 		agi->agi_fblocks = cpu_to_be32(blocks);
970 	}
971 
972 	return 0;
973 err:
974 	xfs_btree_del_cursor(cur, error);
975 	return error;
976 }
977 
978 /*
979  * Record a forwards unlinked chain pointer from agino -> next_agino in our
980  * staging information.
981  */
982 static inline int
983 xrep_iunlink_store_next(
984 	struct xrep_agi		*ragi,
985 	xfs_agino_t		agino,
986 	xfs_agino_t		next_agino)
987 {
988 	ASSERT(next_agino != 0);
989 
990 	return xfarray_store(ragi->iunlink_next, agino, &next_agino);
991 }
992 
993 /*
994  * Record a backwards unlinked chain pointer from prev_ino <- agino in our
995  * staging information.
996  */
997 static inline int
998 xrep_iunlink_store_prev(
999 	struct xrep_agi		*ragi,
1000 	xfs_agino_t		agino,
1001 	xfs_agino_t		prev_agino)
1002 {
1003 	ASSERT(prev_agino != 0);
1004 
1005 	return xfarray_store(ragi->iunlink_prev, agino, &prev_agino);
1006 }
1007 
1008 /*
1009  * Given an @agino, look up the next inode in the iunlink bucket.  Returns
1010  * NULLAGINO if we're at the end of the chain, 0 if @agino is not in memory
1011  * like it should be, or a per-AG inode number.
1012  */
1013 static inline xfs_agino_t
1014 xrep_iunlink_next(
1015 	struct xfs_scrub	*sc,
1016 	xfs_agino_t		agino)
1017 {
1018 	struct xfs_inode	*ip;
1019 
1020 	ip = xfs_iunlink_lookup(sc->sa.pag, agino);
1021 	if (!ip)
1022 		return 0;
1023 
1024 	return ip->i_next_unlinked;
1025 }
1026 
1027 /*
1028  * Load the inode @agino into memory, set its i_prev_unlinked, and drop the
1029  * inode so it can be inactivated.  Returns NULLAGINO if we're at the end of
1030  * the chain or if we should stop walking the chain due to corruption; or a
1031  * per-AG inode number.
1032  */
1033 STATIC xfs_agino_t
1034 xrep_iunlink_reload_next(
1035 	struct xrep_agi		*ragi,
1036 	xfs_agino_t		prev_agino,
1037 	xfs_agino_t		agino)
1038 {
1039 	struct xfs_scrub	*sc = ragi->sc;
1040 	struct xfs_inode	*ip;
1041 	xfs_agino_t		ret = NULLAGINO;
1042 	int			error;
1043 
1044 	error = xchk_iget(ragi->sc, xfs_agino_to_ino(sc->sa.pag, agino), &ip);
1045 	if (error)
1046 		return ret;
1047 
1048 	trace_xrep_iunlink_reload_next(ip, prev_agino);
1049 
1050 	/* If this is a linked inode, stop processing the chain. */
1051 	if (VFS_I(ip)->i_nlink != 0) {
1052 		xrep_iunlink_store_next(ragi, agino, NULLAGINO);
1053 		goto rele;
1054 	}
1055 
1056 	ip->i_prev_unlinked = prev_agino;
1057 	ret = ip->i_next_unlinked;
1058 
1059 	/*
1060 	 * Drop the inode reference that we just took.  We hold the AGI, so
1061 	 * this inode cannot move off the unlinked list and hence cannot be
1062 	 * reclaimed.
1063 	 */
1064 rele:
1065 	xchk_irele(sc, ip);
1066 	return ret;
1067 }
1068 
1069 /*
1070  * Walk an AGI unlinked bucket's list to load incore any unlinked inodes that
1071  * still existed at mount time.  This can happen if iunlink processing fails
1072  * during log recovery.
1073  */
1074 STATIC int
1075 xrep_iunlink_walk_ondisk_bucket(
1076 	struct xrep_agi		*ragi,
1077 	unsigned int		bucket)
1078 {
1079 	struct xfs_scrub	*sc = ragi->sc;
1080 	struct xfs_agi		*agi = sc->sa.agi_bp->b_addr;
1081 	xfs_agino_t		prev_agino = NULLAGINO;
1082 	xfs_agino_t		next_agino;
1083 	int			error = 0;
1084 
1085 	next_agino = be32_to_cpu(agi->agi_unlinked[bucket]);
1086 	while (next_agino != NULLAGINO) {
1087 		xfs_agino_t	agino = next_agino;
1088 
1089 		if (xchk_should_terminate(ragi->sc, &error))
1090 			return error;
1091 
1092 		trace_xrep_iunlink_walk_ondisk_bucket(sc->sa.pag, bucket,
1093 				prev_agino, agino);
1094 
1095 		if (bucket != agino % XFS_AGI_UNLINKED_BUCKETS)
1096 			break;
1097 
1098 		next_agino = xrep_iunlink_next(sc, agino);
1099 		if (!next_agino)
1100 			next_agino = xrep_iunlink_reload_next(ragi, prev_agino,
1101 					agino);
1102 
1103 		prev_agino = agino;
1104 	}
1105 
1106 	return 0;
1107 }
1108 
1109 /* Decide if this is an unlinked inode in this AG. */
1110 STATIC bool
1111 xrep_iunlink_igrab(
1112 	struct xfs_perag	*pag,
1113 	struct xfs_inode	*ip)
1114 {
1115 	struct xfs_mount	*mp = pag_mount(pag);
1116 
1117 	if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag_agno(pag))
1118 		return false;
1119 
1120 	if (!xfs_inode_on_unlinked_list(ip))
1121 		return false;
1122 
1123 	return true;
1124 }
1125 
1126 /*
1127  * Mark the given inode in the lookup batch in our unlinked inode bitmap, and
1128  * remember if this inode is the start of the unlinked chain.
1129  */
1130 STATIC int
1131 xrep_iunlink_visit(
1132 	struct xrep_agi		*ragi,
1133 	unsigned int		batch_idx)
1134 {
1135 	struct xfs_mount	*mp = ragi->sc->mp;
1136 	struct xfs_inode	*ip = ragi->lookup_batch[batch_idx];
1137 	xfs_agino_t		agino;
1138 	unsigned int		bucket;
1139 	int			error;
1140 
1141 	ASSERT(XFS_INO_TO_AGNO(mp, ip->i_ino) == pag_agno(ragi->sc->sa.pag));
1142 	ASSERT(xfs_inode_on_unlinked_list(ip));
1143 
1144 	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1145 	bucket = agino % XFS_AGI_UNLINKED_BUCKETS;
1146 
1147 	trace_xrep_iunlink_visit(ragi->sc->sa.pag, bucket,
1148 			ragi->iunlink_heads[bucket], ip);
1149 
1150 	error = xagino_bitmap_set(&ragi->iunlink_bmp, agino, 1);
1151 	if (error)
1152 		return error;
1153 
1154 	if (ip->i_prev_unlinked == NULLAGINO) {
1155 		if (ragi->iunlink_heads[bucket] == NULLAGINO)
1156 			ragi->iunlink_heads[bucket] = agino;
1157 	}
1158 
1159 	return 0;
1160 }
1161 
1162 /*
1163  * Find all incore unlinked inodes so that we can rebuild the unlinked buckets.
1164  * We hold the AGI so there should not be any modifications to the unlinked
1165  * list.
1166  */
1167 STATIC int
1168 xrep_iunlink_mark_incore(
1169 	struct xrep_agi		*ragi)
1170 {
1171 	struct xfs_perag	*pag = ragi->sc->sa.pag;
1172 	struct xfs_mount	*mp = pag_mount(pag);
1173 	uint32_t		first_index = 0;
1174 	bool			done = false;
1175 	unsigned int		nr_found = 0;
1176 
1177 	do {
1178 		unsigned int	i;
1179 		int		error = 0;
1180 
1181 		if (xchk_should_terminate(ragi->sc, &error))
1182 			return error;
1183 
1184 		rcu_read_lock();
1185 
1186 		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
1187 				(void **)&ragi->lookup_batch, first_index,
1188 				XREP_AGI_LOOKUP_BATCH);
1189 		if (!nr_found) {
1190 			rcu_read_unlock();
1191 			return 0;
1192 		}
1193 
1194 		for (i = 0; i < nr_found; i++) {
1195 			struct xfs_inode *ip = ragi->lookup_batch[i];
1196 
1197 			if (done || !xrep_iunlink_igrab(pag, ip))
1198 				ragi->lookup_batch[i] = NULL;
1199 
1200 			/*
1201 			 * Update the index for the next lookup. Catch
1202 			 * overflows into the next AG range which can occur if
1203 			 * we have inodes in the last block of the AG and we
1204 			 * are currently pointing to the last inode.
1205 			 *
1206 			 * Because we may see inodes that are from the wrong AG
1207 			 * due to RCU freeing and reallocation, only update the
1208 			 * index if it lies in this AG. It was a race that lead
1209 			 * us to see this inode, so another lookup from the
1210 			 * same index will not find it again.
1211 			 */
1212 			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag_agno(pag))
1213 				continue;
1214 			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
1215 			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
1216 				done = true;
1217 		}
1218 
1219 		/* unlock now we've grabbed the inodes. */
1220 		rcu_read_unlock();
1221 
1222 		for (i = 0; i < nr_found; i++) {
1223 			if (!ragi->lookup_batch[i])
1224 				continue;
1225 			error = xrep_iunlink_visit(ragi, i);
1226 			if (error)
1227 				return error;
1228 		}
1229 	} while (!done);
1230 
1231 	return 0;
1232 }
1233 
1234 /* Mark all the unlinked ondisk inodes in this inobt record in iunlink_bmp. */
1235 STATIC int
1236 xrep_iunlink_mark_ondisk_rec(
1237 	struct xfs_btree_cur		*cur,
1238 	const union xfs_btree_rec	*rec,
1239 	void				*priv)
1240 {
1241 	struct xfs_inobt_rec_incore	irec;
1242 	struct xrep_agi			*ragi = priv;
1243 	struct xfs_scrub		*sc = ragi->sc;
1244 	struct xfs_mount		*mp = cur->bc_mp;
1245 	xfs_agino_t			agino;
1246 	unsigned int			i;
1247 	int				error = 0;
1248 
1249 	xfs_inobt_btrec_to_irec(mp, rec, &irec);
1250 
1251 	for (i = 0, agino = irec.ir_startino;
1252 	     i < XFS_INODES_PER_CHUNK;
1253 	     i++, agino++) {
1254 		struct xfs_inode	*ip;
1255 		unsigned int		len = 1;
1256 
1257 		/* Skip free inodes */
1258 		if (XFS_INOBT_MASK(i) & irec.ir_free)
1259 			continue;
1260 		/* Skip inodes we've seen before */
1261 		if (xagino_bitmap_test(&ragi->iunlink_bmp, agino, &len))
1262 			continue;
1263 
1264 		/*
1265 		 * Skip incore inodes; these were already picked up by
1266 		 * the _mark_incore step.
1267 		 */
1268 		rcu_read_lock();
1269 		ip = radix_tree_lookup(&sc->sa.pag->pag_ici_root, agino);
1270 		rcu_read_unlock();
1271 		if (ip)
1272 			continue;
1273 
1274 		/*
1275 		 * Try to look up this inode.  If we can't get it, just move
1276 		 * on because we haven't actually scrubbed the inobt or the
1277 		 * inodes yet.
1278 		 */
1279 		error = xchk_iget(ragi->sc, xfs_agino_to_ino(sc->sa.pag, agino),
1280 				&ip);
1281 		if (error)
1282 			continue;
1283 
1284 		trace_xrep_iunlink_reload_ondisk(ip);
1285 
1286 		if (VFS_I(ip)->i_nlink == 0)
1287 			error = xagino_bitmap_set(&ragi->iunlink_bmp, agino, 1);
1288 		xchk_irele(sc, ip);
1289 		if (error)
1290 			break;
1291 	}
1292 
1293 	return error;
1294 }
1295 
1296 /*
1297  * Find ondisk inodes that are unlinked and not in cache, and mark them in
1298  * iunlink_bmp.   We haven't checked the inobt yet, so we don't error out if
1299  * the btree is corrupt.
1300  */
1301 STATIC void
1302 xrep_iunlink_mark_ondisk(
1303 	struct xrep_agi		*ragi)
1304 {
1305 	struct xfs_scrub	*sc = ragi->sc;
1306 	struct xfs_buf		*agi_bp = ragi->agi_bp;
1307 	struct xfs_btree_cur	*cur;
1308 	int			error;
1309 
1310 	cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp);
1311 	error = xfs_btree_query_all(cur, xrep_iunlink_mark_ondisk_rec, ragi);
1312 	xfs_btree_del_cursor(cur, error);
1313 }
1314 
1315 /*
1316  * Walk an iunlink bucket's inode list.  For each inode that should be on this
1317  * chain, clear its entry in in iunlink_bmp because it's ok and we don't need
1318  * to touch it further.
1319  */
1320 STATIC int
1321 xrep_iunlink_resolve_bucket(
1322 	struct xrep_agi		*ragi,
1323 	unsigned int		bucket)
1324 {
1325 	struct xfs_scrub	*sc = ragi->sc;
1326 	struct xfs_inode	*ip;
1327 	xfs_agino_t		prev_agino = NULLAGINO;
1328 	xfs_agino_t		next_agino = ragi->iunlink_heads[bucket];
1329 	int			error = 0;
1330 
1331 	while (next_agino != NULLAGINO) {
1332 		if (xchk_should_terminate(ragi->sc, &error))
1333 			return error;
1334 
1335 		/* Find the next inode in the chain. */
1336 		ip = xfs_iunlink_lookup(sc->sa.pag, next_agino);
1337 		if (!ip) {
1338 			/* Inode not incore?  Terminate the chain. */
1339 			trace_xrep_iunlink_resolve_uncached(sc->sa.pag,
1340 					bucket, prev_agino, next_agino);
1341 
1342 			next_agino = NULLAGINO;
1343 			break;
1344 		}
1345 
1346 		if (next_agino % XFS_AGI_UNLINKED_BUCKETS != bucket) {
1347 			/*
1348 			 * Inode is in the wrong bucket.  Advance the list,
1349 			 * but pretend we didn't see this inode.
1350 			 */
1351 			trace_xrep_iunlink_resolve_wronglist(sc->sa.pag,
1352 					bucket, prev_agino, next_agino);
1353 
1354 			next_agino = ip->i_next_unlinked;
1355 			continue;
1356 		}
1357 
1358 		if (!xfs_inode_on_unlinked_list(ip)) {
1359 			/*
1360 			 * Incore inode doesn't think this inode is on an
1361 			 * unlinked list.  This is probably because we reloaded
1362 			 * it from disk.  Advance the list, but pretend we
1363 			 * didn't see this inode; we'll fix that later.
1364 			 */
1365 			trace_xrep_iunlink_resolve_nolist(sc->sa.pag,
1366 					bucket, prev_agino, next_agino);
1367 			next_agino = ip->i_next_unlinked;
1368 			continue;
1369 		}
1370 
1371 		trace_xrep_iunlink_resolve_ok(sc->sa.pag, bucket, prev_agino,
1372 				next_agino);
1373 
1374 		/*
1375 		 * Otherwise, this inode's unlinked pointers are ok.  Clear it
1376 		 * from the unlinked bitmap since we're done with it, and make
1377 		 * sure the chain is still correct.
1378 		 */
1379 		error = xagino_bitmap_clear(&ragi->iunlink_bmp, next_agino, 1);
1380 		if (error)
1381 			return error;
1382 
1383 		/* Remember the previous inode's next pointer. */
1384 		if (prev_agino != NULLAGINO) {
1385 			error = xrep_iunlink_store_next(ragi, prev_agino,
1386 					next_agino);
1387 			if (error)
1388 				return error;
1389 		}
1390 
1391 		/* Remember this inode's previous pointer. */
1392 		error = xrep_iunlink_store_prev(ragi, next_agino, prev_agino);
1393 		if (error)
1394 			return error;
1395 
1396 		/* Advance the list and remember this inode. */
1397 		prev_agino = next_agino;
1398 		next_agino = ip->i_next_unlinked;
1399 	}
1400 
1401 	/* Update the previous inode's next pointer. */
1402 	if (prev_agino != NULLAGINO) {
1403 		error = xrep_iunlink_store_next(ragi, prev_agino, next_agino);
1404 		if (error)
1405 			return error;
1406 	}
1407 
1408 	return 0;
1409 }
1410 
1411 /* Reinsert this unlinked inode into the head of the staged bucket list. */
1412 STATIC int
1413 xrep_iunlink_add_to_bucket(
1414 	struct xrep_agi		*ragi,
1415 	xfs_agino_t		agino)
1416 {
1417 	xfs_agino_t		current_head;
1418 	unsigned int		bucket;
1419 	int			error;
1420 
1421 	bucket = agino % XFS_AGI_UNLINKED_BUCKETS;
1422 
1423 	/* Point this inode at the current head of the bucket list. */
1424 	current_head = ragi->iunlink_heads[bucket];
1425 
1426 	trace_xrep_iunlink_add_to_bucket(ragi->sc->sa.pag, bucket, agino,
1427 			current_head);
1428 
1429 	error = xrep_iunlink_store_next(ragi, agino, current_head);
1430 	if (error)
1431 		return error;
1432 
1433 	/* Remember the head inode's previous pointer. */
1434 	if (current_head != NULLAGINO) {
1435 		error = xrep_iunlink_store_prev(ragi, current_head, agino);
1436 		if (error)
1437 			return error;
1438 	}
1439 
1440 	ragi->iunlink_heads[bucket] = agino;
1441 	return 0;
1442 }
1443 
1444 /* Reinsert unlinked inodes into the staged iunlink buckets. */
1445 STATIC int
1446 xrep_iunlink_add_lost_inodes(
1447 	uint32_t		start,
1448 	uint32_t		len,
1449 	void			*priv)
1450 {
1451 	struct xrep_agi		*ragi = priv;
1452 	int			error;
1453 
1454 	for (; len > 0; start++, len--) {
1455 		error = xrep_iunlink_add_to_bucket(ragi, start);
1456 		if (error)
1457 			return error;
1458 	}
1459 
1460 	return 0;
1461 }
1462 
1463 /*
1464  * Figure out the iunlink bucket values and find inodes that need to be
1465  * reinserted into the list.
1466  */
1467 STATIC int
1468 xrep_iunlink_rebuild_buckets(
1469 	struct xrep_agi		*ragi)
1470 {
1471 	unsigned int		i;
1472 	int			error;
1473 
1474 	/*
1475 	 * Walk the ondisk AGI unlinked list to find inodes that are on the
1476 	 * list but aren't in memory.  This can happen if a past log recovery
1477 	 * tried to clear the iunlinked list but failed.  Our scan rebuilds the
1478 	 * unlinked list using incore inodes, so we must load and link them
1479 	 * properly.
1480 	 */
1481 	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
1482 		error = xrep_iunlink_walk_ondisk_bucket(ragi, i);
1483 		if (error)
1484 			return error;
1485 	}
1486 
1487 	/*
1488 	 * Record all the incore unlinked inodes in iunlink_bmp that we didn't
1489 	 * find by walking the ondisk iunlink buckets.  This shouldn't happen,
1490 	 * but we can't risk forgetting an inode somewhere.
1491 	 */
1492 	error = xrep_iunlink_mark_incore(ragi);
1493 	if (error)
1494 		return error;
1495 
1496 	/*
1497 	 * If there are ondisk inodes that are unlinked and are not been loaded
1498 	 * into cache, record them in iunlink_bmp.
1499 	 */
1500 	xrep_iunlink_mark_ondisk(ragi);
1501 
1502 	/*
1503 	 * Walk each iunlink bucket to (re)construct as much of the incore list
1504 	 * as would be correct.  For each inode that survives this step, mark
1505 	 * it clear in iunlink_bmp; we're done with those inodes.
1506 	 */
1507 	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
1508 		error = xrep_iunlink_resolve_bucket(ragi, i);
1509 		if (error)
1510 			return error;
1511 	}
1512 
1513 	/*
1514 	 * Any unlinked inodes that we didn't find through the bucket list
1515 	 * walk (or was ignored by the walk) must be inserted into the bucket
1516 	 * list.  Stage this in memory for now.
1517 	 */
1518 	return xagino_bitmap_walk(&ragi->iunlink_bmp,
1519 			xrep_iunlink_add_lost_inodes, ragi);
1520 }
1521 
1522 /* Update i_next_iunlinked for the inode @agino. */
1523 STATIC int
1524 xrep_iunlink_relink_next(
1525 	struct xrep_agi		*ragi,
1526 	xfarray_idx_t		idx,
1527 	xfs_agino_t		next_agino)
1528 {
1529 	struct xfs_scrub	*sc = ragi->sc;
1530 	struct xfs_perag	*pag = sc->sa.pag;
1531 	struct xfs_inode	*ip;
1532 	xfarray_idx_t		agino = idx - 1;
1533 	bool			want_rele = false;
1534 	int			error = 0;
1535 
1536 	ip = xfs_iunlink_lookup(pag, agino);
1537 	if (!ip) {
1538 		xfs_agino_t	prev_agino;
1539 
1540 		/*
1541 		 * No inode exists in cache.  Load it off the disk so that we
1542 		 * can reinsert it into the incore unlinked list.
1543 		 */
1544 		error = xchk_iget(sc, xfs_agino_to_ino(pag, agino), &ip);
1545 		if (error)
1546 			return -EFSCORRUPTED;
1547 
1548 		want_rele = true;
1549 
1550 		/* Set the backward pointer since this just came off disk. */
1551 		error = xfarray_load(ragi->iunlink_prev, agino, &prev_agino);
1552 		if (error)
1553 			goto out_rele;
1554 
1555 		trace_xrep_iunlink_relink_prev(ip, prev_agino);
1556 		ip->i_prev_unlinked = prev_agino;
1557 	}
1558 
1559 	/* Update the forward pointer. */
1560 	if (ip->i_next_unlinked != next_agino) {
1561 		error = xfs_iunlink_log_inode(sc->tp, ip, pag, next_agino);
1562 		if (error)
1563 			goto out_rele;
1564 
1565 		trace_xrep_iunlink_relink_next(ip, next_agino);
1566 		ip->i_next_unlinked = next_agino;
1567 	}
1568 
1569 out_rele:
1570 	/*
1571 	 * The iunlink lookup doesn't igrab because we hold the AGI buffer lock
1572 	 * and the inode cannot be reclaimed.  However, if we used iget to load
1573 	 * a missing inode, we must irele it here.
1574 	 */
1575 	if (want_rele)
1576 		xchk_irele(sc, ip);
1577 	return error;
1578 }
1579 
1580 /* Update i_prev_iunlinked for the inode @agino. */
1581 STATIC int
1582 xrep_iunlink_relink_prev(
1583 	struct xrep_agi		*ragi,
1584 	xfarray_idx_t		idx,
1585 	xfs_agino_t		prev_agino)
1586 {
1587 	struct xfs_scrub	*sc = ragi->sc;
1588 	struct xfs_perag	*pag = sc->sa.pag;
1589 	struct xfs_inode	*ip;
1590 	xfarray_idx_t		agino = idx - 1;
1591 	bool			want_rele = false;
1592 	int			error = 0;
1593 
1594 	ASSERT(prev_agino != 0);
1595 
1596 	ip = xfs_iunlink_lookup(pag, agino);
1597 	if (!ip) {
1598 		xfs_agino_t	next_agino;
1599 
1600 		/*
1601 		 * No inode exists in cache.  Load it off the disk so that we
1602 		 * can reinsert it into the incore unlinked list.
1603 		 */
1604 		error = xchk_iget(sc, xfs_agino_to_ino(pag, agino), &ip);
1605 		if (error)
1606 			return -EFSCORRUPTED;
1607 
1608 		want_rele = true;
1609 
1610 		/* Set the forward pointer since this just came off disk. */
1611 		error = xfarray_load(ragi->iunlink_prev, agino, &next_agino);
1612 		if (error)
1613 			goto out_rele;
1614 
1615 		error = xfs_iunlink_log_inode(sc->tp, ip, pag, next_agino);
1616 		if (error)
1617 			goto out_rele;
1618 
1619 		trace_xrep_iunlink_relink_next(ip, next_agino);
1620 		ip->i_next_unlinked = next_agino;
1621 	}
1622 
1623 	/* Update the backward pointer. */
1624 	if (ip->i_prev_unlinked != prev_agino) {
1625 		trace_xrep_iunlink_relink_prev(ip, prev_agino);
1626 		ip->i_prev_unlinked = prev_agino;
1627 	}
1628 
1629 out_rele:
1630 	/*
1631 	 * The iunlink lookup doesn't igrab because we hold the AGI buffer lock
1632 	 * and the inode cannot be reclaimed.  However, if we used iget to load
1633 	 * a missing inode, we must irele it here.
1634 	 */
1635 	if (want_rele)
1636 		xchk_irele(sc, ip);
1637 	return error;
1638 }
1639 
1640 /* Log all the iunlink updates we need to finish regenerating the AGI. */
1641 STATIC int
1642 xrep_iunlink_commit(
1643 	struct xrep_agi		*ragi)
1644 {
1645 	struct xfs_agi		*agi = ragi->agi_bp->b_addr;
1646 	xfarray_idx_t		idx = XFARRAY_CURSOR_INIT;
1647 	xfs_agino_t		agino;
1648 	unsigned int		i;
1649 	int			error;
1650 
1651 	/* Fix all the forward links */
1652 	while ((error = xfarray_iter(ragi->iunlink_next, &idx, &agino)) == 1) {
1653 		error = xrep_iunlink_relink_next(ragi, idx, agino);
1654 		if (error)
1655 			return error;
1656 	}
1657 
1658 	/* Fix all the back links */
1659 	idx = XFARRAY_CURSOR_INIT;
1660 	while ((error = xfarray_iter(ragi->iunlink_prev, &idx, &agino)) == 1) {
1661 		error = xrep_iunlink_relink_prev(ragi, idx, agino);
1662 		if (error)
1663 			return error;
1664 	}
1665 
1666 	/* Copy the staged iunlink buckets to the new AGI. */
1667 	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
1668 		trace_xrep_iunlink_commit_bucket(ragi->sc->sa.pag, i,
1669 				be32_to_cpu(ragi->old_agi.agi_unlinked[i]),
1670 				ragi->iunlink_heads[i]);
1671 
1672 		agi->agi_unlinked[i] = cpu_to_be32(ragi->iunlink_heads[i]);
1673 	}
1674 
1675 	return 0;
1676 }
1677 
1678 /* Trigger reinitialization of the in-core data. */
1679 STATIC int
1680 xrep_agi_commit_new(
1681 	struct xrep_agi		*ragi)
1682 {
1683 	struct xfs_scrub	*sc = ragi->sc;
1684 	struct xfs_buf		*agi_bp = ragi->agi_bp;
1685 	struct xfs_perag	*pag;
1686 	struct xfs_agi		*agi = agi_bp->b_addr;
1687 
1688 	/* Trigger inode count recalculation */
1689 	xfs_force_summary_recalc(sc->mp);
1690 
1691 	/* Write this to disk. */
1692 	xfs_trans_buf_set_type(sc->tp, agi_bp, XFS_BLFT_AGI_BUF);
1693 	xfs_trans_log_buf(sc->tp, agi_bp, 0, BBTOB(agi_bp->b_length) - 1);
1694 
1695 	/* Now reinitialize the in-core counters if necessary. */
1696 	pag = sc->sa.pag;
1697 	pag->pagi_count = be32_to_cpu(agi->agi_count);
1698 	pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
1699 	set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
1700 
1701 	return xrep_roll_ag_trans(sc);
1702 }
1703 
1704 /* Repair the AGI. */
1705 int
1706 xrep_agi(
1707 	struct xfs_scrub	*sc)
1708 {
1709 	struct xrep_agi		*ragi;
1710 	struct xfs_mount	*mp = sc->mp;
1711 	char			*descr;
1712 	unsigned int		i;
1713 	int			error;
1714 
1715 	/* We require the rmapbt to rebuild anything. */
1716 	if (!xfs_has_rmapbt(mp))
1717 		return -EOPNOTSUPP;
1718 
1719 	sc->buf = kzalloc(sizeof(struct xrep_agi), XCHK_GFP_FLAGS);
1720 	if (!sc->buf)
1721 		return -ENOMEM;
1722 	ragi = sc->buf;
1723 	ragi->sc = sc;
1724 
1725 	ragi->fab[XREP_AGI_INOBT] = (struct xrep_find_ag_btree){
1726 		.rmap_owner	= XFS_RMAP_OWN_INOBT,
1727 		.buf_ops	= &xfs_inobt_buf_ops,
1728 		.maxlevels	= M_IGEO(sc->mp)->inobt_maxlevels,
1729 	};
1730 	ragi->fab[XREP_AGI_FINOBT] = (struct xrep_find_ag_btree){
1731 		.rmap_owner	= XFS_RMAP_OWN_INOBT,
1732 		.buf_ops	= &xfs_finobt_buf_ops,
1733 		.maxlevels	= M_IGEO(sc->mp)->inobt_maxlevels,
1734 	};
1735 	ragi->fab[XREP_AGI_END] = (struct xrep_find_ag_btree){
1736 		.buf_ops	= NULL,
1737 	};
1738 
1739 	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++)
1740 		ragi->iunlink_heads[i] = NULLAGINO;
1741 
1742 	xagino_bitmap_init(&ragi->iunlink_bmp);
1743 	sc->buf_cleanup = xrep_agi_buf_cleanup;
1744 
1745 	descr = xchk_xfile_ag_descr(sc, "iunlinked next pointers");
1746 	error = xfarray_create(descr, 0, sizeof(xfs_agino_t),
1747 			&ragi->iunlink_next);
1748 	kfree(descr);
1749 	if (error)
1750 		return error;
1751 
1752 	descr = xchk_xfile_ag_descr(sc, "iunlinked prev pointers");
1753 	error = xfarray_create(descr, 0, sizeof(xfs_agino_t),
1754 			&ragi->iunlink_prev);
1755 	kfree(descr);
1756 	if (error)
1757 		return error;
1758 
1759 	/*
1760 	 * Make sure we have the AGI buffer, as scrub might have decided it
1761 	 * was corrupt after xfs_ialloc_read_agi failed with -EFSCORRUPTED.
1762 	 */
1763 	error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
1764 			XFS_AG_DADDR(mp, pag_agno(sc->sa.pag),
1765 						XFS_AGI_DADDR(mp)),
1766 			XFS_FSS_TO_BB(mp, 1), 0, &ragi->agi_bp, NULL);
1767 	if (error)
1768 		return error;
1769 	ragi->agi_bp->b_ops = &xfs_agi_buf_ops;
1770 
1771 	/* Find the AGI btree roots. */
1772 	error = xrep_agi_find_btrees(ragi);
1773 	if (error)
1774 		return error;
1775 
1776 	error = xrep_iunlink_rebuild_buckets(ragi);
1777 	if (error)
1778 		return error;
1779 
1780 	/* Last chance to abort before we start committing fixes. */
1781 	if (xchk_should_terminate(sc, &error))
1782 		return error;
1783 
1784 	/* Start rewriting the header and implant the btrees we found. */
1785 	xrep_agi_init_header(ragi);
1786 	xrep_agi_set_roots(ragi);
1787 	error = xrep_agi_calc_from_btrees(ragi);
1788 	if (error)
1789 		goto out_revert;
1790 	error = xrep_iunlink_commit(ragi);
1791 	if (error)
1792 		goto out_revert;
1793 
1794 	/* Reinitialize in-core state. */
1795 	return xrep_agi_commit_new(ragi);
1796 
1797 out_revert:
1798 	/* Mark the incore AGI state stale and revert the AGI. */
1799 	clear_bit(XFS_AGSTATE_AGI_INIT, &sc->sa.pag->pag_opstate);
1800 	memcpy(ragi->agi_bp->b_addr, &ragi->old_agi, sizeof(struct xfs_agi));
1801 	return error;
1802 }
1803