xref: /linux/fs/xfs/scrub/ialloc_repair.c (revision 9b64ca202f364a6bf8e19bdd20953bc2d776c67f)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs_platform.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_defer.h"
13 #include "xfs_btree.h"
14 #include "xfs_btree_staging.h"
15 #include "xfs_bit.h"
16 #include "xfs_log_format.h"
17 #include "xfs_trans.h"
18 #include "xfs_sb.h"
19 #include "xfs_inode.h"
20 #include "xfs_alloc.h"
21 #include "xfs_ialloc.h"
22 #include "xfs_ialloc_btree.h"
23 #include "xfs_icache.h"
24 #include "xfs_rmap.h"
25 #include "xfs_rmap_btree.h"
26 #include "xfs_log.h"
27 #include "xfs_trans_priv.h"
28 #include "xfs_error.h"
29 #include "xfs_health.h"
30 #include "xfs_ag.h"
31 #include "scrub/xfs_scrub.h"
32 #include "scrub/scrub.h"
33 #include "scrub/common.h"
34 #include "scrub/btree.h"
35 #include "scrub/trace.h"
36 #include "scrub/repair.h"
37 #include "scrub/bitmap.h"
38 #include "scrub/agb_bitmap.h"
39 #include "scrub/xfile.h"
40 #include "scrub/xfarray.h"
41 #include "scrub/newbt.h"
42 #include "scrub/reap.h"
43 
44 /*
45  * Inode Btree Repair
46  * ==================
47  *
48  * A quick refresher of inode btrees on a v5 filesystem:
49  *
50  * - Inode records are read into memory in units of 'inode clusters'.  However
51  *   many inodes fit in a cluster buffer is the smallest number of inodes that
52  *   can be allocated or freed.  Clusters are never smaller than one fs block
53  *   though they can span multiple blocks.  The size (in fs blocks) is
54  *   computed with xfs_icluster_size_fsb().  The fs block alignment of a
55  *   cluster is computed with xfs_ialloc_cluster_alignment().
56  *
57  * - Each inode btree record can describe a single 'inode chunk'.  The chunk
58  *   size is defined to be 64 inodes.  If sparse inodes are enabled, every
59  *   inobt record must be aligned to the chunk size; if not, every record must
60  *   be aligned to the start of a cluster.  It is possible to construct an XFS
61  *   geometry where one inobt record maps to multiple inode clusters; it is
62  *   also possible to construct a geometry where multiple inobt records map to
63  *   different parts of one inode cluster.
64  *
65  * - If sparse inodes are not enabled, the smallest unit of allocation for
66  *   inode records is enough to contain one inode chunk's worth of inodes.
67  *
68  * - If sparse inodes are enabled, the holemask field will be active.  Each
69  *   bit of the holemask represents 4 potential inodes; if set, the
70  *   corresponding space does *not* contain inodes and must be left alone.
71  *   Clusters cannot be smaller than 4 inodes.  The smallest unit of allocation
72  *   of inode records is one inode cluster.
73  *
74  * So what's the rebuild algorithm?
75  *
76  * Iterate the reverse mapping records looking for OWN_INODES and OWN_INOBT
77  * records.  The OWN_INOBT records are the old inode btree blocks and will be
78  * cleared out after we've rebuilt the tree.  Each possible inode cluster
79  * within an OWN_INODES record will be read in; for each possible inobt record
80  * associated with that cluster, compute the freemask calculated from the
81  * i_mode data in the inode chunk.  For sparse inodes the holemask will be
82  * calculated by creating the properly aligned inobt record and punching out
83  * any chunk that's missing.  Inode allocations and frees grab the AGI first,
84  * so repair protects itself from concurrent access by locking the AGI.
85  *
86  * Once we've reconstructed all the inode records, we can create new inode
87  * btree roots and reload the btrees.  We rebuild both inode trees at the same
88  * time because they have the same rmap owner and it would be more complex to
89  * figure out if the other tree isn't in need of a rebuild and which OWN_INOBT
90  * blocks it owns.  We have all the data we need to build both, so dump
91  * everything and start over.
92  *
93  * We use the prefix 'xrep_ibt' because we rebuild both inode btrees at once.
94  */
95 
96 struct xrep_ibt {
97 	/* Record under construction. */
98 	struct xfs_inobt_rec_incore	rie;
99 
100 	/* new inobt information */
101 	struct xrep_newbt	new_inobt;
102 
103 	/* new finobt information */
104 	struct xrep_newbt	new_finobt;
105 
106 	/* Old inode btree blocks we found in the rmap. */
107 	struct xagb_bitmap	old_iallocbt_blocks;
108 
109 	/* Reconstructed inode records. */
110 	struct xfarray		*inode_records;
111 
112 	struct xfs_scrub	*sc;
113 
114 	/* Number of inodes assigned disk space. */
115 	unsigned int		icount;
116 
117 	/* Number of inodes in use. */
118 	unsigned int		iused;
119 
120 	/* Number of finobt records needed. */
121 	unsigned int		finobt_recs;
122 
123 	/* get_records()'s position in the inode record array. */
124 	xfarray_idx_t		array_cur;
125 };
126 
127 /*
128  * Is this inode in use?  If the inode is in memory we can tell from i_mode,
129  * otherwise we have to check di_mode in the on-disk buffer.  We only care
130  * that the high (i.e. non-permission) bits of _mode are zero.  This should be
131  * safe because repair keeps all AG headers locked until the end, and process
132  * trying to perform an inode allocation/free must lock the AGI.
133  *
134  * @cluster_ag_base is the inode offset of the cluster within the AG.
135  * @cluster_bp is the cluster buffer.
136  * @cluster_index is the inode offset within the inode cluster.
137  */
138 STATIC int
139 xrep_ibt_check_ifree(
140 	struct xrep_ibt		*ri,
141 	xfs_agino_t		cluster_ag_base,
142 	struct xfs_buf		*cluster_bp,
143 	unsigned int		cluster_index,
144 	bool			*inuse)
145 {
146 	struct xfs_scrub	*sc = ri->sc;
147 	struct xfs_mount	*mp = sc->mp;
148 	struct xfs_dinode	*dip;
149 	xfs_agino_t		agino;
150 	unsigned int		cluster_buf_base;
151 	unsigned int		offset;
152 	int			error;
153 
154 	agino = cluster_ag_base + cluster_index;
155 
156 	/* Inode uncached or half assembled, read disk buffer */
157 	cluster_buf_base = XFS_INO_TO_OFFSET(mp, cluster_ag_base);
158 	offset = (cluster_buf_base + cluster_index) * mp->m_sb.sb_inodesize;
159 	if (offset >= BBTOB(cluster_bp->b_length))
160 		return -EFSCORRUPTED;
161 	dip = xfs_buf_offset(cluster_bp, offset);
162 	if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)
163 		return -EFSCORRUPTED;
164 
165 	if (dip->di_version >= 3 &&
166 	    be64_to_cpu(dip->di_ino) != xfs_agino_to_ino(ri->sc->sa.pag, agino))
167 		return -EFSCORRUPTED;
168 
169 	/* Will the in-core inode tell us if it's in use? */
170 	error = xchk_inode_is_allocated(sc, agino, inuse);
171 	if (!error)
172 		return 0;
173 
174 	*inuse = dip->di_mode != 0;
175 	return 0;
176 }
177 
178 /* Stash the accumulated inobt record for rebuilding. */
179 STATIC int
180 xrep_ibt_stash(
181 	struct xrep_ibt		*ri)
182 {
183 	int			error = 0;
184 
185 	if (xchk_should_terminate(ri->sc, &error))
186 		return error;
187 
188 	ri->rie.ir_freecount = xfs_inobt_rec_freecount(&ri->rie);
189 	if (xfs_inobt_check_irec(ri->sc->sa.pag, &ri->rie) != NULL)
190 		return -EFSCORRUPTED;
191 
192 	if (ri->rie.ir_freecount > 0)
193 		ri->finobt_recs++;
194 
195 	trace_xrep_ibt_found(ri->sc->sa.pag, &ri->rie);
196 
197 	error = xfarray_append(ri->inode_records, &ri->rie);
198 	if (error)
199 		return error;
200 
201 	ri->rie.ir_startino = NULLAGINO;
202 	return 0;
203 }
204 
205 /*
206  * Given an extent of inodes and an inode cluster buffer, calculate the
207  * location of the corresponding inobt record (creating it if necessary),
208  * then update the parts of the holemask and freemask of that record that
209  * correspond to the inode extent we were given.
210  *
211  * @cluster_ir_startino is the AG inode number of an inobt record that we're
212  * proposing to create for this inode cluster.  If sparse inodes are enabled,
213  * we must round down to a chunk boundary to find the actual sparse record.
214  * @cluster_bp is the buffer of the inode cluster.
215  * @nr_inodes is the number of inodes to check from the cluster.
216  */
217 STATIC int
218 xrep_ibt_cluster_record(
219 	struct xrep_ibt		*ri,
220 	xfs_agino_t		cluster_ir_startino,
221 	struct xfs_buf		*cluster_bp,
222 	unsigned int		nr_inodes)
223 {
224 	struct xfs_scrub	*sc = ri->sc;
225 	struct xfs_mount	*mp = sc->mp;
226 	xfs_agino_t		ir_startino;
227 	unsigned int		cluster_base;
228 	unsigned int		cluster_index;
229 	int			error = 0;
230 
231 	ir_startino = cluster_ir_startino;
232 	if (xfs_has_sparseinodes(mp))
233 		ir_startino = rounddown(ir_startino, XFS_INODES_PER_CHUNK);
234 	cluster_base = cluster_ir_startino - ir_startino;
235 
236 	/*
237 	 * If the accumulated inobt record doesn't map this cluster, add it to
238 	 * the list and reset it.
239 	 */
240 	if (ri->rie.ir_startino != NULLAGINO &&
241 	    ri->rie.ir_startino + XFS_INODES_PER_CHUNK <= ir_startino) {
242 		error = xrep_ibt_stash(ri);
243 		if (error)
244 			return error;
245 	}
246 
247 	if (ri->rie.ir_startino == NULLAGINO) {
248 		ri->rie.ir_startino = ir_startino;
249 		ri->rie.ir_free = XFS_INOBT_ALL_FREE;
250 		ri->rie.ir_holemask = 0xFFFF;
251 		ri->rie.ir_count = 0;
252 	}
253 
254 	/* Record the whole cluster. */
255 	ri->icount += nr_inodes;
256 	ri->rie.ir_count += nr_inodes;
257 	ri->rie.ir_holemask &= ~xfs_inobt_maskn(
258 				cluster_base / XFS_INODES_PER_HOLEMASK_BIT,
259 				nr_inodes / XFS_INODES_PER_HOLEMASK_BIT);
260 
261 	/* Which inodes within this cluster are free? */
262 	for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) {
263 		bool		inuse = false;
264 
265 		error = xrep_ibt_check_ifree(ri, cluster_ir_startino,
266 				cluster_bp, cluster_index, &inuse);
267 		if (error)
268 			return error;
269 		if (!inuse)
270 			continue;
271 		ri->iused++;
272 		ri->rie.ir_free &= ~XFS_INOBT_MASK(cluster_base +
273 						   cluster_index);
274 	}
275 	return 0;
276 }
277 
278 /*
279  * For each inode cluster covering the physical extent recorded by the rmapbt,
280  * we must calculate the properly aligned startino of that cluster, then
281  * iterate each cluster to fill in used and filled masks appropriately.  We
282  * then use the (startino, used, filled) information to construct the
283  * appropriate inode records.
284  */
285 STATIC int
286 xrep_ibt_process_cluster(
287 	struct xrep_ibt		*ri,
288 	xfs_agblock_t		cluster_bno)
289 {
290 	struct xfs_buf		*cluster_bp;
291 	struct xfs_scrub	*sc = ri->sc;
292 	struct xfs_mount	*mp = sc->mp;
293 	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
294 	xfs_agino_t		cluster_ag_base;
295 	xfs_agino_t		irec_index;
296 	unsigned int		nr_inodes;
297 	int			error;
298 
299 	nr_inodes = min_t(unsigned int, igeo->inodes_per_cluster,
300 			XFS_INODES_PER_CHUNK);
301 
302 	/*
303 	 * Grab the inode cluster buffer.  This is safe to do with a broken
304 	 * inobt because imap_to_bp directly maps the buffer without touching
305 	 * either inode btree.
306 	 */
307 	error = xfs_read_icluster(sc->sa.pag, sc->tp, cluster_bno,
308 			&cluster_bp);
309 	if (error)
310 		return error;
311 
312 	/*
313 	 * Record the contents of each possible inobt record mapping this
314 	 * cluster.
315 	 */
316 	cluster_ag_base = XFS_AGB_TO_AGINO(mp, cluster_bno);
317 	for (irec_index = 0;
318 	     irec_index < igeo->inodes_per_cluster;
319 	     irec_index += XFS_INODES_PER_CHUNK) {
320 		error = xrep_ibt_cluster_record(ri,
321 				cluster_ag_base + irec_index, cluster_bp,
322 				nr_inodes);
323 		if (error)
324 			break;
325 
326 	}
327 
328 	xfs_trans_brelse(sc->tp, cluster_bp);
329 	return error;
330 }
331 
332 /* Check for any obvious conflicts in the inode chunk extent. */
333 STATIC int
334 xrep_ibt_check_inode_ext(
335 	struct xfs_scrub	*sc,
336 	xfs_agblock_t		agbno,
337 	xfs_extlen_t		len)
338 {
339 	struct xfs_mount	*mp = sc->mp;
340 	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
341 	xfs_agino_t		agino;
342 	enum xbtree_recpacking	outcome;
343 	int			error;
344 
345 	/* Inode records must be within the AG. */
346 	if (!xfs_verify_agbext(sc->sa.pag, agbno, len))
347 		return -EFSCORRUPTED;
348 
349 	/* The entire record must align to the inode cluster size. */
350 	if (!IS_ALIGNED(agbno, igeo->blocks_per_cluster) ||
351 	    !IS_ALIGNED(agbno + len, igeo->blocks_per_cluster))
352 		return -EFSCORRUPTED;
353 
354 	/*
355 	 * The entire record must also adhere to the inode cluster alignment
356 	 * size if sparse inodes are not enabled.
357 	 */
358 	if (!xfs_has_sparseinodes(mp) &&
359 	    (!IS_ALIGNED(agbno, igeo->cluster_align) ||
360 	     !IS_ALIGNED(agbno + len, igeo->cluster_align)))
361 		return -EFSCORRUPTED;
362 
363 	/*
364 	 * On a sparse inode fs, this cluster could be part of a sparse chunk.
365 	 * Sparse clusters must be aligned to sparse chunk alignment.
366 	 */
367 	if (xfs_has_sparseinodes(mp) && mp->m_sb.sb_spino_align &&
368 	    (!IS_ALIGNED(agbno, mp->m_sb.sb_spino_align) ||
369 	     !IS_ALIGNED(agbno + len, mp->m_sb.sb_spino_align)))
370 		return -EFSCORRUPTED;
371 
372 	/* Make sure the entire range of blocks are valid AG inodes. */
373 	agino = XFS_AGB_TO_AGINO(mp, agbno);
374 	if (!xfs_verify_agino(sc->sa.pag, agino))
375 		return -EFSCORRUPTED;
376 
377 	agino = XFS_AGB_TO_AGINO(mp, agbno + len) - 1;
378 	if (!xfs_verify_agino(sc->sa.pag, agino))
379 		return -EFSCORRUPTED;
380 
381 	/* Make sure this isn't free space. */
382 	error = xfs_alloc_has_records(sc->sa.bno_cur, agbno, len, &outcome);
383 	if (error)
384 		return error;
385 	if (outcome != XBTREE_RECPACKING_EMPTY)
386 		return -EFSCORRUPTED;
387 
388 	return 0;
389 }
390 
391 /* Found a fragment of the old inode btrees; dispose of them later. */
392 STATIC int
393 xrep_ibt_record_old_btree_blocks(
394 	struct xrep_ibt			*ri,
395 	const struct xfs_rmap_irec	*rec)
396 {
397 	if (!xfs_verify_agbext(ri->sc->sa.pag, rec->rm_startblock,
398 				rec->rm_blockcount))
399 		return -EFSCORRUPTED;
400 
401 	return xagb_bitmap_set(&ri->old_iallocbt_blocks, rec->rm_startblock,
402 			rec->rm_blockcount);
403 }
404 
405 /* Record extents that belong to inode cluster blocks. */
406 STATIC int
407 xrep_ibt_record_inode_blocks(
408 	struct xrep_ibt			*ri,
409 	const struct xfs_rmap_irec	*rec)
410 {
411 	struct xfs_mount		*mp = ri->sc->mp;
412 	struct xfs_ino_geometry		*igeo = M_IGEO(mp);
413 	xfs_agblock_t			cluster_base;
414 	int				error;
415 
416 	error = xrep_ibt_check_inode_ext(ri->sc, rec->rm_startblock,
417 			rec->rm_blockcount);
418 	if (error)
419 		return error;
420 
421 	trace_xrep_ibt_walk_rmap(ri->sc->sa.pag, rec);
422 
423 	/*
424 	 * Record the free/hole masks for each inode cluster that could be
425 	 * mapped by this rmap record.
426 	 */
427 	for (cluster_base = 0;
428 	     cluster_base < rec->rm_blockcount;
429 	     cluster_base += igeo->blocks_per_cluster) {
430 		error = xrep_ibt_process_cluster(ri,
431 				rec->rm_startblock + cluster_base);
432 		if (error)
433 			return error;
434 	}
435 
436 	return 0;
437 }
438 
439 STATIC int
440 xrep_ibt_walk_rmap(
441 	struct xfs_btree_cur		*cur,
442 	const struct xfs_rmap_irec	*rec,
443 	void				*priv)
444 {
445 	struct xrep_ibt			*ri = priv;
446 	int				error = 0;
447 
448 	if (xchk_should_terminate(ri->sc, &error))
449 		return error;
450 
451 	switch (rec->rm_owner) {
452 	case XFS_RMAP_OWN_INOBT:
453 		return xrep_ibt_record_old_btree_blocks(ri, rec);
454 	case XFS_RMAP_OWN_INODES:
455 		return xrep_ibt_record_inode_blocks(ri, rec);
456 	}
457 	return 0;
458 }
459 
460 /*
461  * Iterate all reverse mappings to find the inodes (OWN_INODES) and the inode
462  * btrees (OWN_INOBT).  Figure out if we have enough free space to reconstruct
463  * the inode btrees.  The caller must clean up the lists if anything goes
464  * wrong.
465  */
466 STATIC int
467 xrep_ibt_find_inodes(
468 	struct xrep_ibt		*ri)
469 {
470 	struct xfs_scrub	*sc = ri->sc;
471 	int			error;
472 
473 	ri->rie.ir_startino = NULLAGINO;
474 
475 	/* Collect all reverse mappings for inode blocks. */
476 	xrep_ag_btcur_init(sc, &sc->sa);
477 	error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_ibt_walk_rmap, ri);
478 	xchk_ag_btcur_free(&sc->sa);
479 	if (error)
480 		return error;
481 
482 	/* If we have a record ready to go, add it to the array. */
483 	if (ri->rie.ir_startino != NULLAGINO)
484 		return xrep_ibt_stash(ri);
485 
486 	return 0;
487 }
488 
489 /* Update the AGI counters. */
490 STATIC int
491 xrep_ibt_reset_counters(
492 	struct xrep_ibt		*ri)
493 {
494 	struct xfs_scrub	*sc = ri->sc;
495 	struct xfs_agi		*agi = sc->sa.agi_bp->b_addr;
496 	unsigned int		freecount = ri->icount - ri->iused;
497 
498 	/* Trigger inode count recalculation */
499 	xfs_force_summary_recalc(sc->mp);
500 
501 	/*
502 	 * The AGI header contains extra information related to the inode
503 	 * btrees, so we must update those fields here.
504 	 */
505 	agi->agi_count = cpu_to_be32(ri->icount);
506 	agi->agi_freecount = cpu_to_be32(freecount);
507 	xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp,
508 			   XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
509 
510 	/* Reinitialize with the values we just logged. */
511 	return xrep_reinit_pagi(sc);
512 }
513 
514 /* Retrieve finobt data for bulk load. */
515 STATIC int
516 xrep_fibt_get_records(
517 	struct xfs_btree_cur		*cur,
518 	unsigned int			idx,
519 	struct xfs_btree_block		*block,
520 	unsigned int			nr_wanted,
521 	void				*priv)
522 {
523 	struct xfs_inobt_rec_incore	*irec = &cur->bc_rec.i;
524 	struct xrep_ibt			*ri = priv;
525 	union xfs_btree_rec		*block_rec;
526 	unsigned int			loaded;
527 	int				error;
528 
529 	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
530 		do {
531 			error = xfarray_load(ri->inode_records,
532 					ri->array_cur++, irec);
533 		} while (error == 0 && xfs_inobt_rec_freecount(irec) == 0);
534 		if (error)
535 			return error;
536 
537 		block_rec = xfs_btree_rec_addr(cur, idx, block);
538 		cur->bc_ops->init_rec_from_cur(cur, block_rec);
539 	}
540 
541 	return loaded;
542 }
543 
544 /* Retrieve inobt data for bulk load. */
545 STATIC int
546 xrep_ibt_get_records(
547 	struct xfs_btree_cur		*cur,
548 	unsigned int			idx,
549 	struct xfs_btree_block		*block,
550 	unsigned int			nr_wanted,
551 	void				*priv)
552 {
553 	struct xfs_inobt_rec_incore	*irec = &cur->bc_rec.i;
554 	struct xrep_ibt			*ri = priv;
555 	union xfs_btree_rec		*block_rec;
556 	unsigned int			loaded;
557 	int				error;
558 
559 	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
560 		error = xfarray_load(ri->inode_records, ri->array_cur++, irec);
561 		if (error)
562 			return error;
563 
564 		block_rec = xfs_btree_rec_addr(cur, idx, block);
565 		cur->bc_ops->init_rec_from_cur(cur, block_rec);
566 	}
567 
568 	return loaded;
569 }
570 
571 /* Feed one of the new inobt blocks to the bulk loader. */
572 STATIC int
573 xrep_ibt_claim_block(
574 	struct xfs_btree_cur	*cur,
575 	union xfs_btree_ptr	*ptr,
576 	void			*priv)
577 {
578 	struct xrep_ibt		*ri = priv;
579 
580 	return xrep_newbt_claim_block(cur, &ri->new_inobt, ptr);
581 }
582 
583 /* Feed one of the new finobt blocks to the bulk loader. */
584 STATIC int
585 xrep_fibt_claim_block(
586 	struct xfs_btree_cur	*cur,
587 	union xfs_btree_ptr	*ptr,
588 	void			*priv)
589 {
590 	struct xrep_ibt		*ri = priv;
591 
592 	return xrep_newbt_claim_block(cur, &ri->new_finobt, ptr);
593 }
594 
595 /* Make sure the records do not overlap in inumber address space. */
596 STATIC int
597 xrep_ibt_check_overlap(
598 	struct xrep_ibt			*ri)
599 {
600 	struct xfs_inobt_rec_incore	irec;
601 	xfarray_idx_t			cur;
602 	xfs_agino_t			next_agino = 0;
603 	int				error = 0;
604 
605 	foreach_xfarray_idx(ri->inode_records, cur) {
606 		if (xchk_should_terminate(ri->sc, &error))
607 			return error;
608 
609 		error = xfarray_load(ri->inode_records, cur, &irec);
610 		if (error)
611 			return error;
612 
613 		if (irec.ir_startino < next_agino)
614 			return -EFSCORRUPTED;
615 
616 		next_agino = irec.ir_startino + XFS_INODES_PER_CHUNK;
617 	}
618 
619 	return error;
620 }
621 
622 /* Build new inode btrees and dispose of the old one. */
623 STATIC int
624 xrep_ibt_build_new_trees(
625 	struct xrep_ibt		*ri)
626 {
627 	struct xfs_scrub	*sc = ri->sc;
628 	struct xfs_btree_cur	*ino_cur;
629 	struct xfs_btree_cur	*fino_cur = NULL;
630 	bool			need_finobt;
631 	int			error;
632 
633 	need_finobt = xfs_has_finobt(sc->mp);
634 
635 	/*
636 	 * Create new btrees for staging all the inobt records we collected
637 	 * earlier.  The records were collected in order of increasing agino,
638 	 * so we do not have to sort them.  Ensure there are no overlapping
639 	 * records.
640 	 */
641 	error = xrep_ibt_check_overlap(ri);
642 	if (error)
643 		return error;
644 
645 	/*
646 	 * The new inode btrees will not be rooted in the AGI until we've
647 	 * successfully rebuilt the tree.
648 	 *
649 	 * Start by setting up the inobt staging cursor.
650 	 */
651 	xrep_newbt_init_ag(&ri->new_inobt, sc, &XFS_RMAP_OINFO_INOBT,
652 			xfs_agbno_to_fsb(sc->sa.pag, XFS_IBT_BLOCK(sc->mp)),
653 			XFS_AG_RESV_NONE);
654 	ri->new_inobt.bload.claim_block = xrep_ibt_claim_block;
655 	ri->new_inobt.bload.get_records = xrep_ibt_get_records;
656 
657 	ino_cur = xfs_inobt_init_cursor(sc->sa.pag, NULL, NULL);
658 	xfs_btree_stage_afakeroot(ino_cur, &ri->new_inobt.afake);
659 	error = xfs_btree_bload_compute_geometry(ino_cur, &ri->new_inobt.bload,
660 			xfarray_length(ri->inode_records));
661 	if (error)
662 		goto err_inocur;
663 
664 	/* Set up finobt staging cursor. */
665 	if (need_finobt) {
666 		enum xfs_ag_resv_type	resv = XFS_AG_RESV_METADATA;
667 
668 		if (sc->mp->m_finobt_nores)
669 			resv = XFS_AG_RESV_NONE;
670 
671 		xrep_newbt_init_ag(&ri->new_finobt, sc, &XFS_RMAP_OINFO_INOBT,
672 				xfs_agbno_to_fsb(sc->sa.pag, XFS_FIBT_BLOCK(sc->mp)),
673 				resv);
674 		ri->new_finobt.bload.claim_block = xrep_fibt_claim_block;
675 		ri->new_finobt.bload.get_records = xrep_fibt_get_records;
676 
677 		fino_cur = xfs_finobt_init_cursor(sc->sa.pag, NULL, NULL);
678 		xfs_btree_stage_afakeroot(fino_cur, &ri->new_finobt.afake);
679 		error = xfs_btree_bload_compute_geometry(fino_cur,
680 				&ri->new_finobt.bload, ri->finobt_recs);
681 		if (error)
682 			goto err_finocur;
683 	}
684 
685 	/* Last chance to abort before we start committing fixes. */
686 	if (xchk_should_terminate(sc, &error))
687 		goto err_finocur;
688 
689 	/* Reserve all the space we need to build the new btrees. */
690 	error = xrep_newbt_alloc_blocks(&ri->new_inobt,
691 			ri->new_inobt.bload.nr_blocks);
692 	if (error)
693 		goto err_finocur;
694 
695 	if (need_finobt) {
696 		error = xrep_newbt_alloc_blocks(&ri->new_finobt,
697 				ri->new_finobt.bload.nr_blocks);
698 		if (error)
699 			goto err_finocur;
700 	}
701 
702 	/* Add all inobt records. */
703 	ri->array_cur = XFARRAY_CURSOR_INIT;
704 	error = xfs_btree_bload(ino_cur, &ri->new_inobt.bload, ri);
705 	if (error)
706 		goto err_finocur;
707 
708 	/* Add all finobt records. */
709 	if (need_finobt) {
710 		ri->array_cur = XFARRAY_CURSOR_INIT;
711 		error = xfs_btree_bload(fino_cur, &ri->new_finobt.bload, ri);
712 		if (error)
713 			goto err_finocur;
714 	}
715 
716 	/*
717 	 * Install the new btrees in the AG header.  After this point the old
718 	 * btrees are no longer accessible and the new trees are live.
719 	 */
720 	xfs_inobt_commit_staged_btree(ino_cur, sc->tp, sc->sa.agi_bp);
721 	xfs_btree_del_cursor(ino_cur, 0);
722 
723 	if (fino_cur) {
724 		xfs_inobt_commit_staged_btree(fino_cur, sc->tp, sc->sa.agi_bp);
725 		xfs_btree_del_cursor(fino_cur, 0);
726 	}
727 
728 	/* Reset the AGI counters now that we've changed the inode roots. */
729 	error = xrep_ibt_reset_counters(ri);
730 	if (error)
731 		goto err_finobt;
732 
733 	/* Free unused blocks and bitmap. */
734 	if (need_finobt) {
735 		error = xrep_newbt_commit(&ri->new_finobt);
736 		if (error)
737 			goto err_inobt;
738 	}
739 	error = xrep_newbt_commit(&ri->new_inobt);
740 	if (error)
741 		return error;
742 
743 	return xrep_roll_ag_trans(sc);
744 
745 err_finocur:
746 	if (need_finobt)
747 		xfs_btree_del_cursor(fino_cur, error);
748 err_inocur:
749 	xfs_btree_del_cursor(ino_cur, error);
750 err_finobt:
751 	if (need_finobt)
752 		xrep_newbt_cancel(&ri->new_finobt);
753 err_inobt:
754 	xrep_newbt_cancel(&ri->new_inobt);
755 	return error;
756 }
757 
758 /*
759  * Now that we've logged the roots of the new btrees, invalidate all of the
760  * old blocks and free them.
761  */
762 STATIC int
763 xrep_ibt_remove_old_trees(
764 	struct xrep_ibt		*ri)
765 {
766 	struct xfs_scrub	*sc = ri->sc;
767 	int			error;
768 
769 	/*
770 	 * Free the old inode btree blocks if they're not in use.  It's ok to
771 	 * reap with XFS_AG_RESV_NONE even if the finobt had a per-AG
772 	 * reservation because we reset the reservation before releasing the
773 	 * AGI and AGF header buffer locks.
774 	 */
775 	error = xrep_reap_agblocks(sc, &ri->old_iallocbt_blocks,
776 			&XFS_RMAP_OINFO_INOBT, XFS_AG_RESV_NONE);
777 	if (error)
778 		return error;
779 
780 	/*
781 	 * If the finobt is enabled and has a per-AG reservation, make sure we
782 	 * reinitialize the per-AG reservations.
783 	 */
784 	if (xfs_has_finobt(sc->mp) && !sc->mp->m_finobt_nores)
785 		sc->flags |= XREP_RESET_PERAG_RESV;
786 
787 	return 0;
788 }
789 
790 /* Repair both inode btrees. */
791 int
792 xrep_iallocbt(
793 	struct xfs_scrub	*sc)
794 {
795 	struct xrep_ibt		*ri;
796 	struct xfs_mount	*mp = sc->mp;
797 	xfs_agino_t		first_agino, last_agino;
798 	int			error = 0;
799 
800 	/* We require the rmapbt to rebuild anything. */
801 	if (!xfs_has_rmapbt(mp))
802 		return -EOPNOTSUPP;
803 
804 	ri = kzalloc_obj(struct xrep_ibt, XCHK_GFP_FLAGS);
805 	if (!ri)
806 		return -ENOMEM;
807 	ri->sc = sc;
808 
809 	/* We rebuild both inode btrees. */
810 	sc->sick_mask = XFS_SICK_AG_INOBT | XFS_SICK_AG_FINOBT;
811 
812 	/* Set up enough storage to handle an AG with nothing but inodes. */
813 	xfs_agino_range(mp, pag_agno(sc->sa.pag), &first_agino, &last_agino);
814 	last_agino /= XFS_INODES_PER_CHUNK;
815 	error = xfarray_create("inode index records", last_agino,
816 			sizeof(struct xfs_inobt_rec_incore),
817 			&ri->inode_records);
818 	if (error)
819 		goto out_ri;
820 
821 	/* Collect the inode data and find the old btree blocks. */
822 	xagb_bitmap_init(&ri->old_iallocbt_blocks);
823 	error = xrep_ibt_find_inodes(ri);
824 	if (error)
825 		goto out_bitmap;
826 
827 	/* Rebuild the inode indexes. */
828 	error = xrep_ibt_build_new_trees(ri);
829 	if (error)
830 		goto out_bitmap;
831 
832 	/* Kill the old tree. */
833 	error = xrep_ibt_remove_old_trees(ri);
834 	if (error)
835 		goto out_bitmap;
836 
837 out_bitmap:
838 	xagb_bitmap_destroy(&ri->old_iallocbt_blocks);
839 	xfarray_destroy(ri->inode_records);
840 out_ri:
841 	kfree(ri);
842 	return error;
843 }
844 
845 /* Make sure both btrees are ok after we've rebuilt them. */
846 int
847 xrep_revalidate_iallocbt(
848 	struct xfs_scrub	*sc)
849 {
850 	__u32			old_type = sc->sm->sm_type;
851 	int			error;
852 
853 	/*
854 	 * We must update sm_type temporarily so that the tree-to-tree cross
855 	 * reference checks will work in the correct direction, and also so
856 	 * that tracing will report correctly if there are more errors.
857 	 */
858 	sc->sm->sm_type = XFS_SCRUB_TYPE_INOBT;
859 	error = xchk_iallocbt(sc);
860 	if (error)
861 		goto out;
862 
863 	/*
864 	 * If the inobt is still corrupt, we've failed to repair the filesystem
865 	 * and should just bail out.
866 	 *
867 	 * If the inobt fails cross-examination with the finobt, the scan will
868 	 * free the finobt cursor, so we need to mark the repair incomplete
869 	 * and avoid walking off the end of the NULL finobt cursor.
870 	 */
871 	if (!xfs_has_finobt(sc->mp) ||
872 	    (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
873 		goto out;
874 
875 	sc->sm->sm_type = XFS_SCRUB_TYPE_FINOBT;
876 	if (!sc->sa.fino_cur) {
877 		xchk_set_incomplete(sc);
878 		goto out;
879 	}
880 	error = xchk_iallocbt(sc);
881 
882 out:
883 	sc->sm->sm_type = old_type;
884 	return error;
885 }
886