xref: /linux/fs/xfs/scrub/ialloc_repair.c (revision 221013afb459e5deb8bd08e29b37050af5586d1c)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_defer.h"
13 #include "xfs_btree.h"
14 #include "xfs_btree_staging.h"
15 #include "xfs_bit.h"
16 #include "xfs_log_format.h"
17 #include "xfs_trans.h"
18 #include "xfs_sb.h"
19 #include "xfs_inode.h"
20 #include "xfs_alloc.h"
21 #include "xfs_ialloc.h"
22 #include "xfs_ialloc_btree.h"
23 #include "xfs_icache.h"
24 #include "xfs_rmap.h"
25 #include "xfs_rmap_btree.h"
26 #include "xfs_log.h"
27 #include "xfs_trans_priv.h"
28 #include "xfs_error.h"
29 #include "xfs_health.h"
30 #include "xfs_ag.h"
31 #include "scrub/xfs_scrub.h"
32 #include "scrub/scrub.h"
33 #include "scrub/common.h"
34 #include "scrub/btree.h"
35 #include "scrub/trace.h"
36 #include "scrub/repair.h"
37 #include "scrub/bitmap.h"
38 #include "scrub/agb_bitmap.h"
39 #include "scrub/xfile.h"
40 #include "scrub/xfarray.h"
41 #include "scrub/newbt.h"
42 #include "scrub/reap.h"
43 
44 /*
45  * Inode Btree Repair
46  * ==================
47  *
48  * A quick refresher of inode btrees on a v5 filesystem:
49  *
50  * - Inode records are read into memory in units of 'inode clusters'.  However
51  *   many inodes fit in a cluster buffer is the smallest number of inodes that
52  *   can be allocated or freed.  Clusters are never smaller than one fs block
53  *   though they can span multiple blocks.  The size (in fs blocks) is
54  *   computed with xfs_icluster_size_fsb().  The fs block alignment of a
55  *   cluster is computed with xfs_ialloc_cluster_alignment().
56  *
57  * - Each inode btree record can describe a single 'inode chunk'.  The chunk
58  *   size is defined to be 64 inodes.  If sparse inodes are enabled, every
59  *   inobt record must be aligned to the chunk size; if not, every record must
60  *   be aligned to the start of a cluster.  It is possible to construct an XFS
61  *   geometry where one inobt record maps to multiple inode clusters; it is
62  *   also possible to construct a geometry where multiple inobt records map to
63  *   different parts of one inode cluster.
64  *
65  * - If sparse inodes are not enabled, the smallest unit of allocation for
66  *   inode records is enough to contain one inode chunk's worth of inodes.
67  *
68  * - If sparse inodes are enabled, the holemask field will be active.  Each
69  *   bit of the holemask represents 4 potential inodes; if set, the
70  *   corresponding space does *not* contain inodes and must be left alone.
71  *   Clusters cannot be smaller than 4 inodes.  The smallest unit of allocation
72  *   of inode records is one inode cluster.
73  *
74  * So what's the rebuild algorithm?
75  *
76  * Iterate the reverse mapping records looking for OWN_INODES and OWN_INOBT
77  * records.  The OWN_INOBT records are the old inode btree blocks and will be
78  * cleared out after we've rebuilt the tree.  Each possible inode cluster
79  * within an OWN_INODES record will be read in; for each possible inobt record
80  * associated with that cluster, compute the freemask calculated from the
81  * i_mode data in the inode chunk.  For sparse inodes the holemask will be
82  * calculated by creating the properly aligned inobt record and punching out
83  * any chunk that's missing.  Inode allocations and frees grab the AGI first,
84  * so repair protects itself from concurrent access by locking the AGI.
85  *
86  * Once we've reconstructed all the inode records, we can create new inode
87  * btree roots and reload the btrees.  We rebuild both inode trees at the same
88  * time because they have the same rmap owner and it would be more complex to
89  * figure out if the other tree isn't in need of a rebuild and which OWN_INOBT
90  * blocks it owns.  We have all the data we need to build both, so dump
91  * everything and start over.
92  *
93  * We use the prefix 'xrep_ibt' because we rebuild both inode btrees at once.
94  */
95 
96 struct xrep_ibt {
97 	/* Record under construction. */
98 	struct xfs_inobt_rec_incore	rie;
99 
100 	/* new inobt information */
101 	struct xrep_newbt	new_inobt;
102 
103 	/* new finobt information */
104 	struct xrep_newbt	new_finobt;
105 
106 	/* Old inode btree blocks we found in the rmap. */
107 	struct xagb_bitmap	old_iallocbt_blocks;
108 
109 	/* Reconstructed inode records. */
110 	struct xfarray		*inode_records;
111 
112 	struct xfs_scrub	*sc;
113 
114 	/* Number of inodes assigned disk space. */
115 	unsigned int		icount;
116 
117 	/* Number of inodes in use. */
118 	unsigned int		iused;
119 
120 	/* Number of finobt records needed. */
121 	unsigned int		finobt_recs;
122 
123 	/* get_records()'s position in the inode record array. */
124 	xfarray_idx_t		array_cur;
125 };
126 
127 /*
128  * Is this inode in use?  If the inode is in memory we can tell from i_mode,
129  * otherwise we have to check di_mode in the on-disk buffer.  We only care
130  * that the high (i.e. non-permission) bits of _mode are zero.  This should be
131  * safe because repair keeps all AG headers locked until the end, and process
132  * trying to perform an inode allocation/free must lock the AGI.
133  *
134  * @cluster_ag_base is the inode offset of the cluster within the AG.
135  * @cluster_bp is the cluster buffer.
136  * @cluster_index is the inode offset within the inode cluster.
137  */
138 STATIC int
139 xrep_ibt_check_ifree(
140 	struct xrep_ibt		*ri,
141 	xfs_agino_t		cluster_ag_base,
142 	struct xfs_buf		*cluster_bp,
143 	unsigned int		cluster_index,
144 	bool			*inuse)
145 {
146 	struct xfs_scrub	*sc = ri->sc;
147 	struct xfs_mount	*mp = sc->mp;
148 	struct xfs_dinode	*dip;
149 	xfs_ino_t		fsino;
150 	xfs_agino_t		agino;
151 	xfs_agnumber_t		agno = ri->sc->sa.pag->pag_agno;
152 	unsigned int		cluster_buf_base;
153 	unsigned int		offset;
154 	int			error;
155 
156 	agino = cluster_ag_base + cluster_index;
157 	fsino = XFS_AGINO_TO_INO(mp, agno, agino);
158 
159 	/* Inode uncached or half assembled, read disk buffer */
160 	cluster_buf_base = XFS_INO_TO_OFFSET(mp, cluster_ag_base);
161 	offset = (cluster_buf_base + cluster_index) * mp->m_sb.sb_inodesize;
162 	if (offset >= BBTOB(cluster_bp->b_length))
163 		return -EFSCORRUPTED;
164 	dip = xfs_buf_offset(cluster_bp, offset);
165 	if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)
166 		return -EFSCORRUPTED;
167 
168 	if (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino)
169 		return -EFSCORRUPTED;
170 
171 	/* Will the in-core inode tell us if it's in use? */
172 	error = xchk_inode_is_allocated(sc, agino, inuse);
173 	if (!error)
174 		return 0;
175 
176 	*inuse = dip->di_mode != 0;
177 	return 0;
178 }
179 
180 /* Stash the accumulated inobt record for rebuilding. */
181 STATIC int
182 xrep_ibt_stash(
183 	struct xrep_ibt		*ri)
184 {
185 	int			error = 0;
186 
187 	if (xchk_should_terminate(ri->sc, &error))
188 		return error;
189 
190 	ri->rie.ir_freecount = xfs_inobt_rec_freecount(&ri->rie);
191 	if (xfs_inobt_check_irec(ri->sc->sa.pag, &ri->rie) != NULL)
192 		return -EFSCORRUPTED;
193 
194 	if (ri->rie.ir_freecount > 0)
195 		ri->finobt_recs++;
196 
197 	trace_xrep_ibt_found(ri->sc->mp, ri->sc->sa.pag->pag_agno, &ri->rie);
198 
199 	error = xfarray_append(ri->inode_records, &ri->rie);
200 	if (error)
201 		return error;
202 
203 	ri->rie.ir_startino = NULLAGINO;
204 	return 0;
205 }
206 
207 /*
208  * Given an extent of inodes and an inode cluster buffer, calculate the
209  * location of the corresponding inobt record (creating it if necessary),
210  * then update the parts of the holemask and freemask of that record that
211  * correspond to the inode extent we were given.
212  *
213  * @cluster_ir_startino is the AG inode number of an inobt record that we're
214  * proposing to create for this inode cluster.  If sparse inodes are enabled,
215  * we must round down to a chunk boundary to find the actual sparse record.
216  * @cluster_bp is the buffer of the inode cluster.
217  * @nr_inodes is the number of inodes to check from the cluster.
218  */
219 STATIC int
220 xrep_ibt_cluster_record(
221 	struct xrep_ibt		*ri,
222 	xfs_agino_t		cluster_ir_startino,
223 	struct xfs_buf		*cluster_bp,
224 	unsigned int		nr_inodes)
225 {
226 	struct xfs_scrub	*sc = ri->sc;
227 	struct xfs_mount	*mp = sc->mp;
228 	xfs_agino_t		ir_startino;
229 	unsigned int		cluster_base;
230 	unsigned int		cluster_index;
231 	int			error = 0;
232 
233 	ir_startino = cluster_ir_startino;
234 	if (xfs_has_sparseinodes(mp))
235 		ir_startino = rounddown(ir_startino, XFS_INODES_PER_CHUNK);
236 	cluster_base = cluster_ir_startino - ir_startino;
237 
238 	/*
239 	 * If the accumulated inobt record doesn't map this cluster, add it to
240 	 * the list and reset it.
241 	 */
242 	if (ri->rie.ir_startino != NULLAGINO &&
243 	    ri->rie.ir_startino + XFS_INODES_PER_CHUNK <= ir_startino) {
244 		error = xrep_ibt_stash(ri);
245 		if (error)
246 			return error;
247 	}
248 
249 	if (ri->rie.ir_startino == NULLAGINO) {
250 		ri->rie.ir_startino = ir_startino;
251 		ri->rie.ir_free = XFS_INOBT_ALL_FREE;
252 		ri->rie.ir_holemask = 0xFFFF;
253 		ri->rie.ir_count = 0;
254 	}
255 
256 	/* Record the whole cluster. */
257 	ri->icount += nr_inodes;
258 	ri->rie.ir_count += nr_inodes;
259 	ri->rie.ir_holemask &= ~xfs_inobt_maskn(
260 				cluster_base / XFS_INODES_PER_HOLEMASK_BIT,
261 				nr_inodes / XFS_INODES_PER_HOLEMASK_BIT);
262 
263 	/* Which inodes within this cluster are free? */
264 	for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) {
265 		bool		inuse = false;
266 
267 		error = xrep_ibt_check_ifree(ri, cluster_ir_startino,
268 				cluster_bp, cluster_index, &inuse);
269 		if (error)
270 			return error;
271 		if (!inuse)
272 			continue;
273 		ri->iused++;
274 		ri->rie.ir_free &= ~XFS_INOBT_MASK(cluster_base +
275 						   cluster_index);
276 	}
277 	return 0;
278 }
279 
280 /*
281  * For each inode cluster covering the physical extent recorded by the rmapbt,
282  * we must calculate the properly aligned startino of that cluster, then
283  * iterate each cluster to fill in used and filled masks appropriately.  We
284  * then use the (startino, used, filled) information to construct the
285  * appropriate inode records.
286  */
287 STATIC int
288 xrep_ibt_process_cluster(
289 	struct xrep_ibt		*ri,
290 	xfs_agblock_t		cluster_bno)
291 {
292 	struct xfs_imap		imap;
293 	struct xfs_buf		*cluster_bp;
294 	struct xfs_scrub	*sc = ri->sc;
295 	struct xfs_mount	*mp = sc->mp;
296 	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
297 	xfs_agino_t		cluster_ag_base;
298 	xfs_agino_t		irec_index;
299 	unsigned int		nr_inodes;
300 	int			error;
301 
302 	nr_inodes = min_t(unsigned int, igeo->inodes_per_cluster,
303 			XFS_INODES_PER_CHUNK);
304 
305 	/*
306 	 * Grab the inode cluster buffer.  This is safe to do with a broken
307 	 * inobt because imap_to_bp directly maps the buffer without touching
308 	 * either inode btree.
309 	 */
310 	imap.im_blkno = XFS_AGB_TO_DADDR(mp, sc->sa.pag->pag_agno, cluster_bno);
311 	imap.im_len = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster);
312 	imap.im_boffset = 0;
313 	error = xfs_imap_to_bp(mp, sc->tp, &imap, &cluster_bp);
314 	if (error)
315 		return error;
316 
317 	/*
318 	 * Record the contents of each possible inobt record mapping this
319 	 * cluster.
320 	 */
321 	cluster_ag_base = XFS_AGB_TO_AGINO(mp, cluster_bno);
322 	for (irec_index = 0;
323 	     irec_index < igeo->inodes_per_cluster;
324 	     irec_index += XFS_INODES_PER_CHUNK) {
325 		error = xrep_ibt_cluster_record(ri,
326 				cluster_ag_base + irec_index, cluster_bp,
327 				nr_inodes);
328 		if (error)
329 			break;
330 
331 	}
332 
333 	xfs_trans_brelse(sc->tp, cluster_bp);
334 	return error;
335 }
336 
337 /* Check for any obvious conflicts in the inode chunk extent. */
338 STATIC int
339 xrep_ibt_check_inode_ext(
340 	struct xfs_scrub	*sc,
341 	xfs_agblock_t		agbno,
342 	xfs_extlen_t		len)
343 {
344 	struct xfs_mount	*mp = sc->mp;
345 	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
346 	xfs_agino_t		agino;
347 	enum xbtree_recpacking	outcome;
348 	int			error;
349 
350 	/* Inode records must be within the AG. */
351 	if (!xfs_verify_agbext(sc->sa.pag, agbno, len))
352 		return -EFSCORRUPTED;
353 
354 	/* The entire record must align to the inode cluster size. */
355 	if (!IS_ALIGNED(agbno, igeo->blocks_per_cluster) ||
356 	    !IS_ALIGNED(agbno + len, igeo->blocks_per_cluster))
357 		return -EFSCORRUPTED;
358 
359 	/*
360 	 * The entire record must also adhere to the inode cluster alignment
361 	 * size if sparse inodes are not enabled.
362 	 */
363 	if (!xfs_has_sparseinodes(mp) &&
364 	    (!IS_ALIGNED(agbno, igeo->cluster_align) ||
365 	     !IS_ALIGNED(agbno + len, igeo->cluster_align)))
366 		return -EFSCORRUPTED;
367 
368 	/*
369 	 * On a sparse inode fs, this cluster could be part of a sparse chunk.
370 	 * Sparse clusters must be aligned to sparse chunk alignment.
371 	 */
372 	if (xfs_has_sparseinodes(mp) && mp->m_sb.sb_spino_align &&
373 	    (!IS_ALIGNED(agbno, mp->m_sb.sb_spino_align) ||
374 	     !IS_ALIGNED(agbno + len, mp->m_sb.sb_spino_align)))
375 		return -EFSCORRUPTED;
376 
377 	/* Make sure the entire range of blocks are valid AG inodes. */
378 	agino = XFS_AGB_TO_AGINO(mp, agbno);
379 	if (!xfs_verify_agino(sc->sa.pag, agino))
380 		return -EFSCORRUPTED;
381 
382 	agino = XFS_AGB_TO_AGINO(mp, agbno + len) - 1;
383 	if (!xfs_verify_agino(sc->sa.pag, agino))
384 		return -EFSCORRUPTED;
385 
386 	/* Make sure this isn't free space. */
387 	error = xfs_alloc_has_records(sc->sa.bno_cur, agbno, len, &outcome);
388 	if (error)
389 		return error;
390 	if (outcome != XBTREE_RECPACKING_EMPTY)
391 		return -EFSCORRUPTED;
392 
393 	return 0;
394 }
395 
396 /* Found a fragment of the old inode btrees; dispose of them later. */
397 STATIC int
398 xrep_ibt_record_old_btree_blocks(
399 	struct xrep_ibt			*ri,
400 	const struct xfs_rmap_irec	*rec)
401 {
402 	if (!xfs_verify_agbext(ri->sc->sa.pag, rec->rm_startblock,
403 				rec->rm_blockcount))
404 		return -EFSCORRUPTED;
405 
406 	return xagb_bitmap_set(&ri->old_iallocbt_blocks, rec->rm_startblock,
407 			rec->rm_blockcount);
408 }
409 
410 /* Record extents that belong to inode cluster blocks. */
411 STATIC int
412 xrep_ibt_record_inode_blocks(
413 	struct xrep_ibt			*ri,
414 	const struct xfs_rmap_irec	*rec)
415 {
416 	struct xfs_mount		*mp = ri->sc->mp;
417 	struct xfs_ino_geometry		*igeo = M_IGEO(mp);
418 	xfs_agblock_t			cluster_base;
419 	int				error;
420 
421 	error = xrep_ibt_check_inode_ext(ri->sc, rec->rm_startblock,
422 			rec->rm_blockcount);
423 	if (error)
424 		return error;
425 
426 	trace_xrep_ibt_walk_rmap(mp, ri->sc->sa.pag->pag_agno,
427 			rec->rm_startblock, rec->rm_blockcount, rec->rm_owner,
428 			rec->rm_offset, rec->rm_flags);
429 
430 	/*
431 	 * Record the free/hole masks for each inode cluster that could be
432 	 * mapped by this rmap record.
433 	 */
434 	for (cluster_base = 0;
435 	     cluster_base < rec->rm_blockcount;
436 	     cluster_base += igeo->blocks_per_cluster) {
437 		error = xrep_ibt_process_cluster(ri,
438 				rec->rm_startblock + cluster_base);
439 		if (error)
440 			return error;
441 	}
442 
443 	return 0;
444 }
445 
446 STATIC int
447 xrep_ibt_walk_rmap(
448 	struct xfs_btree_cur		*cur,
449 	const struct xfs_rmap_irec	*rec,
450 	void				*priv)
451 {
452 	struct xrep_ibt			*ri = priv;
453 	int				error = 0;
454 
455 	if (xchk_should_terminate(ri->sc, &error))
456 		return error;
457 
458 	switch (rec->rm_owner) {
459 	case XFS_RMAP_OWN_INOBT:
460 		return xrep_ibt_record_old_btree_blocks(ri, rec);
461 	case XFS_RMAP_OWN_INODES:
462 		return xrep_ibt_record_inode_blocks(ri, rec);
463 	}
464 	return 0;
465 }
466 
467 /*
468  * Iterate all reverse mappings to find the inodes (OWN_INODES) and the inode
469  * btrees (OWN_INOBT).  Figure out if we have enough free space to reconstruct
470  * the inode btrees.  The caller must clean up the lists if anything goes
471  * wrong.
472  */
473 STATIC int
474 xrep_ibt_find_inodes(
475 	struct xrep_ibt		*ri)
476 {
477 	struct xfs_scrub	*sc = ri->sc;
478 	int			error;
479 
480 	ri->rie.ir_startino = NULLAGINO;
481 
482 	/* Collect all reverse mappings for inode blocks. */
483 	xrep_ag_btcur_init(sc, &sc->sa);
484 	error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_ibt_walk_rmap, ri);
485 	xchk_ag_btcur_free(&sc->sa);
486 	if (error)
487 		return error;
488 
489 	/* If we have a record ready to go, add it to the array. */
490 	if (ri->rie.ir_startino != NULLAGINO)
491 		return xrep_ibt_stash(ri);
492 
493 	return 0;
494 }
495 
496 /* Update the AGI counters. */
497 STATIC int
498 xrep_ibt_reset_counters(
499 	struct xrep_ibt		*ri)
500 {
501 	struct xfs_scrub	*sc = ri->sc;
502 	struct xfs_agi		*agi = sc->sa.agi_bp->b_addr;
503 	unsigned int		freecount = ri->icount - ri->iused;
504 
505 	/* Trigger inode count recalculation */
506 	xfs_force_summary_recalc(sc->mp);
507 
508 	/*
509 	 * The AGI header contains extra information related to the inode
510 	 * btrees, so we must update those fields here.
511 	 */
512 	agi->agi_count = cpu_to_be32(ri->icount);
513 	agi->agi_freecount = cpu_to_be32(freecount);
514 	xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp,
515 			   XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
516 
517 	/* Reinitialize with the values we just logged. */
518 	return xrep_reinit_pagi(sc);
519 }
520 
521 /* Retrieve finobt data for bulk load. */
522 STATIC int
523 xrep_fibt_get_records(
524 	struct xfs_btree_cur		*cur,
525 	unsigned int			idx,
526 	struct xfs_btree_block		*block,
527 	unsigned int			nr_wanted,
528 	void				*priv)
529 {
530 	struct xfs_inobt_rec_incore	*irec = &cur->bc_rec.i;
531 	struct xrep_ibt			*ri = priv;
532 	union xfs_btree_rec		*block_rec;
533 	unsigned int			loaded;
534 	int				error;
535 
536 	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
537 		do {
538 			error = xfarray_load(ri->inode_records,
539 					ri->array_cur++, irec);
540 		} while (error == 0 && xfs_inobt_rec_freecount(irec) == 0);
541 		if (error)
542 			return error;
543 
544 		block_rec = xfs_btree_rec_addr(cur, idx, block);
545 		cur->bc_ops->init_rec_from_cur(cur, block_rec);
546 	}
547 
548 	return loaded;
549 }
550 
551 /* Retrieve inobt data for bulk load. */
552 STATIC int
553 xrep_ibt_get_records(
554 	struct xfs_btree_cur		*cur,
555 	unsigned int			idx,
556 	struct xfs_btree_block		*block,
557 	unsigned int			nr_wanted,
558 	void				*priv)
559 {
560 	struct xfs_inobt_rec_incore	*irec = &cur->bc_rec.i;
561 	struct xrep_ibt			*ri = priv;
562 	union xfs_btree_rec		*block_rec;
563 	unsigned int			loaded;
564 	int				error;
565 
566 	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
567 		error = xfarray_load(ri->inode_records, ri->array_cur++, irec);
568 		if (error)
569 			return error;
570 
571 		block_rec = xfs_btree_rec_addr(cur, idx, block);
572 		cur->bc_ops->init_rec_from_cur(cur, block_rec);
573 	}
574 
575 	return loaded;
576 }
577 
578 /* Feed one of the new inobt blocks to the bulk loader. */
579 STATIC int
580 xrep_ibt_claim_block(
581 	struct xfs_btree_cur	*cur,
582 	union xfs_btree_ptr	*ptr,
583 	void			*priv)
584 {
585 	struct xrep_ibt		*ri = priv;
586 
587 	return xrep_newbt_claim_block(cur, &ri->new_inobt, ptr);
588 }
589 
590 /* Feed one of the new finobt blocks to the bulk loader. */
591 STATIC int
592 xrep_fibt_claim_block(
593 	struct xfs_btree_cur	*cur,
594 	union xfs_btree_ptr	*ptr,
595 	void			*priv)
596 {
597 	struct xrep_ibt		*ri = priv;
598 
599 	return xrep_newbt_claim_block(cur, &ri->new_finobt, ptr);
600 }
601 
602 /* Make sure the records do not overlap in inumber address space. */
603 STATIC int
604 xrep_ibt_check_overlap(
605 	struct xrep_ibt			*ri)
606 {
607 	struct xfs_inobt_rec_incore	irec;
608 	xfarray_idx_t			cur;
609 	xfs_agino_t			next_agino = 0;
610 	int				error = 0;
611 
612 	foreach_xfarray_idx(ri->inode_records, cur) {
613 		if (xchk_should_terminate(ri->sc, &error))
614 			return error;
615 
616 		error = xfarray_load(ri->inode_records, cur, &irec);
617 		if (error)
618 			return error;
619 
620 		if (irec.ir_startino < next_agino)
621 			return -EFSCORRUPTED;
622 
623 		next_agino = irec.ir_startino + XFS_INODES_PER_CHUNK;
624 	}
625 
626 	return error;
627 }
628 
629 /* Build new inode btrees and dispose of the old one. */
630 STATIC int
631 xrep_ibt_build_new_trees(
632 	struct xrep_ibt		*ri)
633 {
634 	struct xfs_scrub	*sc = ri->sc;
635 	struct xfs_btree_cur	*ino_cur;
636 	struct xfs_btree_cur	*fino_cur = NULL;
637 	xfs_fsblock_t		fsbno;
638 	bool			need_finobt;
639 	int			error;
640 
641 	need_finobt = xfs_has_finobt(sc->mp);
642 
643 	/*
644 	 * Create new btrees for staging all the inobt records we collected
645 	 * earlier.  The records were collected in order of increasing agino,
646 	 * so we do not have to sort them.  Ensure there are no overlapping
647 	 * records.
648 	 */
649 	error = xrep_ibt_check_overlap(ri);
650 	if (error)
651 		return error;
652 
653 	/*
654 	 * The new inode btrees will not be rooted in the AGI until we've
655 	 * successfully rebuilt the tree.
656 	 *
657 	 * Start by setting up the inobt staging cursor.
658 	 */
659 	fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
660 			XFS_IBT_BLOCK(sc->mp)),
661 	xrep_newbt_init_ag(&ri->new_inobt, sc, &XFS_RMAP_OINFO_INOBT, fsbno,
662 			XFS_AG_RESV_NONE);
663 	ri->new_inobt.bload.claim_block = xrep_ibt_claim_block;
664 	ri->new_inobt.bload.get_records = xrep_ibt_get_records;
665 
666 	ino_cur = xfs_inobt_init_cursor(sc->sa.pag, NULL, NULL);
667 	xfs_btree_stage_afakeroot(ino_cur, &ri->new_inobt.afake);
668 	error = xfs_btree_bload_compute_geometry(ino_cur, &ri->new_inobt.bload,
669 			xfarray_length(ri->inode_records));
670 	if (error)
671 		goto err_inocur;
672 
673 	/* Set up finobt staging cursor. */
674 	if (need_finobt) {
675 		enum xfs_ag_resv_type	resv = XFS_AG_RESV_METADATA;
676 
677 		if (sc->mp->m_finobt_nores)
678 			resv = XFS_AG_RESV_NONE;
679 
680 		fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
681 				XFS_FIBT_BLOCK(sc->mp)),
682 		xrep_newbt_init_ag(&ri->new_finobt, sc, &XFS_RMAP_OINFO_INOBT,
683 				fsbno, resv);
684 		ri->new_finobt.bload.claim_block = xrep_fibt_claim_block;
685 		ri->new_finobt.bload.get_records = xrep_fibt_get_records;
686 
687 		fino_cur = xfs_finobt_init_cursor(sc->sa.pag, NULL, NULL);
688 		xfs_btree_stage_afakeroot(fino_cur, &ri->new_finobt.afake);
689 		error = xfs_btree_bload_compute_geometry(fino_cur,
690 				&ri->new_finobt.bload, ri->finobt_recs);
691 		if (error)
692 			goto err_finocur;
693 	}
694 
695 	/* Last chance to abort before we start committing fixes. */
696 	if (xchk_should_terminate(sc, &error))
697 		goto err_finocur;
698 
699 	/* Reserve all the space we need to build the new btrees. */
700 	error = xrep_newbt_alloc_blocks(&ri->new_inobt,
701 			ri->new_inobt.bload.nr_blocks);
702 	if (error)
703 		goto err_finocur;
704 
705 	if (need_finobt) {
706 		error = xrep_newbt_alloc_blocks(&ri->new_finobt,
707 				ri->new_finobt.bload.nr_blocks);
708 		if (error)
709 			goto err_finocur;
710 	}
711 
712 	/* Add all inobt records. */
713 	ri->array_cur = XFARRAY_CURSOR_INIT;
714 	error = xfs_btree_bload(ino_cur, &ri->new_inobt.bload, ri);
715 	if (error)
716 		goto err_finocur;
717 
718 	/* Add all finobt records. */
719 	if (need_finobt) {
720 		ri->array_cur = XFARRAY_CURSOR_INIT;
721 		error = xfs_btree_bload(fino_cur, &ri->new_finobt.bload, ri);
722 		if (error)
723 			goto err_finocur;
724 	}
725 
726 	/*
727 	 * Install the new btrees in the AG header.  After this point the old
728 	 * btrees are no longer accessible and the new trees are live.
729 	 */
730 	xfs_inobt_commit_staged_btree(ino_cur, sc->tp, sc->sa.agi_bp);
731 	xfs_btree_del_cursor(ino_cur, 0);
732 
733 	if (fino_cur) {
734 		xfs_inobt_commit_staged_btree(fino_cur, sc->tp, sc->sa.agi_bp);
735 		xfs_btree_del_cursor(fino_cur, 0);
736 	}
737 
738 	/* Reset the AGI counters now that we've changed the inode roots. */
739 	error = xrep_ibt_reset_counters(ri);
740 	if (error)
741 		goto err_finobt;
742 
743 	/* Free unused blocks and bitmap. */
744 	if (need_finobt) {
745 		error = xrep_newbt_commit(&ri->new_finobt);
746 		if (error)
747 			goto err_inobt;
748 	}
749 	error = xrep_newbt_commit(&ri->new_inobt);
750 	if (error)
751 		return error;
752 
753 	return xrep_roll_ag_trans(sc);
754 
755 err_finocur:
756 	if (need_finobt)
757 		xfs_btree_del_cursor(fino_cur, error);
758 err_inocur:
759 	xfs_btree_del_cursor(ino_cur, error);
760 err_finobt:
761 	if (need_finobt)
762 		xrep_newbt_cancel(&ri->new_finobt);
763 err_inobt:
764 	xrep_newbt_cancel(&ri->new_inobt);
765 	return error;
766 }
767 
768 /*
769  * Now that we've logged the roots of the new btrees, invalidate all of the
770  * old blocks and free them.
771  */
772 STATIC int
773 xrep_ibt_remove_old_trees(
774 	struct xrep_ibt		*ri)
775 {
776 	struct xfs_scrub	*sc = ri->sc;
777 	int			error;
778 
779 	/*
780 	 * Free the old inode btree blocks if they're not in use.  It's ok to
781 	 * reap with XFS_AG_RESV_NONE even if the finobt had a per-AG
782 	 * reservation because we reset the reservation before releasing the
783 	 * AGI and AGF header buffer locks.
784 	 */
785 	error = xrep_reap_agblocks(sc, &ri->old_iallocbt_blocks,
786 			&XFS_RMAP_OINFO_INOBT, XFS_AG_RESV_NONE);
787 	if (error)
788 		return error;
789 
790 	/*
791 	 * If the finobt is enabled and has a per-AG reservation, make sure we
792 	 * reinitialize the per-AG reservations.
793 	 */
794 	if (xfs_has_finobt(sc->mp) && !sc->mp->m_finobt_nores)
795 		sc->flags |= XREP_RESET_PERAG_RESV;
796 
797 	return 0;
798 }
799 
800 /* Repair both inode btrees. */
801 int
802 xrep_iallocbt(
803 	struct xfs_scrub	*sc)
804 {
805 	struct xrep_ibt		*ri;
806 	struct xfs_mount	*mp = sc->mp;
807 	char			*descr;
808 	xfs_agino_t		first_agino, last_agino;
809 	int			error = 0;
810 
811 	/* We require the rmapbt to rebuild anything. */
812 	if (!xfs_has_rmapbt(mp))
813 		return -EOPNOTSUPP;
814 
815 	ri = kzalloc(sizeof(struct xrep_ibt), XCHK_GFP_FLAGS);
816 	if (!ri)
817 		return -ENOMEM;
818 	ri->sc = sc;
819 
820 	/* We rebuild both inode btrees. */
821 	sc->sick_mask = XFS_SICK_AG_INOBT | XFS_SICK_AG_FINOBT;
822 
823 	/* Set up enough storage to handle an AG with nothing but inodes. */
824 	xfs_agino_range(mp, sc->sa.pag->pag_agno, &first_agino, &last_agino);
825 	last_agino /= XFS_INODES_PER_CHUNK;
826 	descr = xchk_xfile_ag_descr(sc, "inode index records");
827 	error = xfarray_create(descr, last_agino,
828 			sizeof(struct xfs_inobt_rec_incore),
829 			&ri->inode_records);
830 	kfree(descr);
831 	if (error)
832 		goto out_ri;
833 
834 	/* Collect the inode data and find the old btree blocks. */
835 	xagb_bitmap_init(&ri->old_iallocbt_blocks);
836 	error = xrep_ibt_find_inodes(ri);
837 	if (error)
838 		goto out_bitmap;
839 
840 	/* Rebuild the inode indexes. */
841 	error = xrep_ibt_build_new_trees(ri);
842 	if (error)
843 		goto out_bitmap;
844 
845 	/* Kill the old tree. */
846 	error = xrep_ibt_remove_old_trees(ri);
847 	if (error)
848 		goto out_bitmap;
849 
850 out_bitmap:
851 	xagb_bitmap_destroy(&ri->old_iallocbt_blocks);
852 	xfarray_destroy(ri->inode_records);
853 out_ri:
854 	kfree(ri);
855 	return error;
856 }
857 
858 /* Make sure both btrees are ok after we've rebuilt them. */
859 int
860 xrep_revalidate_iallocbt(
861 	struct xfs_scrub	*sc)
862 {
863 	__u32			old_type = sc->sm->sm_type;
864 	int			error;
865 
866 	/*
867 	 * We must update sm_type temporarily so that the tree-to-tree cross
868 	 * reference checks will work in the correct direction, and also so
869 	 * that tracing will report correctly if there are more errors.
870 	 */
871 	sc->sm->sm_type = XFS_SCRUB_TYPE_INOBT;
872 	error = xchk_iallocbt(sc);
873 	if (error)
874 		goto out;
875 
876 	if (xfs_has_finobt(sc->mp)) {
877 		sc->sm->sm_type = XFS_SCRUB_TYPE_FINOBT;
878 		error = xchk_iallocbt(sc);
879 	}
880 
881 out:
882 	sc->sm->sm_type = old_type;
883 	return error;
884 }
885