xref: /linux/fs/xfs/scrub/ialloc_repair.c (revision 2330437da0994321020777c605a2a8cb0ecb7001)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_defer.h"
13 #include "xfs_btree.h"
14 #include "xfs_btree_staging.h"
15 #include "xfs_bit.h"
16 #include "xfs_log_format.h"
17 #include "xfs_trans.h"
18 #include "xfs_sb.h"
19 #include "xfs_inode.h"
20 #include "xfs_alloc.h"
21 #include "xfs_ialloc.h"
22 #include "xfs_ialloc_btree.h"
23 #include "xfs_icache.h"
24 #include "xfs_rmap.h"
25 #include "xfs_rmap_btree.h"
26 #include "xfs_log.h"
27 #include "xfs_trans_priv.h"
28 #include "xfs_error.h"
29 #include "xfs_health.h"
30 #include "xfs_ag.h"
31 #include "scrub/xfs_scrub.h"
32 #include "scrub/scrub.h"
33 #include "scrub/common.h"
34 #include "scrub/btree.h"
35 #include "scrub/trace.h"
36 #include "scrub/repair.h"
37 #include "scrub/bitmap.h"
38 #include "scrub/agb_bitmap.h"
39 #include "scrub/xfile.h"
40 #include "scrub/xfarray.h"
41 #include "scrub/newbt.h"
42 #include "scrub/reap.h"
43 
44 /*
45  * Inode Btree Repair
46  * ==================
47  *
48  * A quick refresher of inode btrees on a v5 filesystem:
49  *
50  * - Inode records are read into memory in units of 'inode clusters'.  However
51  *   many inodes fit in a cluster buffer is the smallest number of inodes that
52  *   can be allocated or freed.  Clusters are never smaller than one fs block
53  *   though they can span multiple blocks.  The size (in fs blocks) is
54  *   computed with xfs_icluster_size_fsb().  The fs block alignment of a
55  *   cluster is computed with xfs_ialloc_cluster_alignment().
56  *
57  * - Each inode btree record can describe a single 'inode chunk'.  The chunk
58  *   size is defined to be 64 inodes.  If sparse inodes are enabled, every
59  *   inobt record must be aligned to the chunk size; if not, every record must
60  *   be aligned to the start of a cluster.  It is possible to construct an XFS
61  *   geometry where one inobt record maps to multiple inode clusters; it is
62  *   also possible to construct a geometry where multiple inobt records map to
63  *   different parts of one inode cluster.
64  *
65  * - If sparse inodes are not enabled, the smallest unit of allocation for
66  *   inode records is enough to contain one inode chunk's worth of inodes.
67  *
68  * - If sparse inodes are enabled, the holemask field will be active.  Each
69  *   bit of the holemask represents 4 potential inodes; if set, the
70  *   corresponding space does *not* contain inodes and must be left alone.
71  *   Clusters cannot be smaller than 4 inodes.  The smallest unit of allocation
72  *   of inode records is one inode cluster.
73  *
74  * So what's the rebuild algorithm?
75  *
76  * Iterate the reverse mapping records looking for OWN_INODES and OWN_INOBT
77  * records.  The OWN_INOBT records are the old inode btree blocks and will be
78  * cleared out after we've rebuilt the tree.  Each possible inode cluster
79  * within an OWN_INODES record will be read in; for each possible inobt record
80  * associated with that cluster, compute the freemask calculated from the
81  * i_mode data in the inode chunk.  For sparse inodes the holemask will be
82  * calculated by creating the properly aligned inobt record and punching out
83  * any chunk that's missing.  Inode allocations and frees grab the AGI first,
84  * so repair protects itself from concurrent access by locking the AGI.
85  *
86  * Once we've reconstructed all the inode records, we can create new inode
87  * btree roots and reload the btrees.  We rebuild both inode trees at the same
88  * time because they have the same rmap owner and it would be more complex to
89  * figure out if the other tree isn't in need of a rebuild and which OWN_INOBT
90  * blocks it owns.  We have all the data we need to build both, so dump
91  * everything and start over.
92  *
93  * We use the prefix 'xrep_ibt' because we rebuild both inode btrees at once.
94  */
95 
96 struct xrep_ibt {
97 	/* Record under construction. */
98 	struct xfs_inobt_rec_incore	rie;
99 
100 	/* new inobt information */
101 	struct xrep_newbt	new_inobt;
102 
103 	/* new finobt information */
104 	struct xrep_newbt	new_finobt;
105 
106 	/* Old inode btree blocks we found in the rmap. */
107 	struct xagb_bitmap	old_iallocbt_blocks;
108 
109 	/* Reconstructed inode records. */
110 	struct xfarray		*inode_records;
111 
112 	struct xfs_scrub	*sc;
113 
114 	/* Number of inodes assigned disk space. */
115 	unsigned int		icount;
116 
117 	/* Number of inodes in use. */
118 	unsigned int		iused;
119 
120 	/* Number of finobt records needed. */
121 	unsigned int		finobt_recs;
122 
123 	/* get_records()'s position in the inode record array. */
124 	xfarray_idx_t		array_cur;
125 };
126 
127 /*
128  * Is this inode in use?  If the inode is in memory we can tell from i_mode,
129  * otherwise we have to check di_mode in the on-disk buffer.  We only care
130  * that the high (i.e. non-permission) bits of _mode are zero.  This should be
131  * safe because repair keeps all AG headers locked until the end, and process
132  * trying to perform an inode allocation/free must lock the AGI.
133  *
134  * @cluster_ag_base is the inode offset of the cluster within the AG.
135  * @cluster_bp is the cluster buffer.
136  * @cluster_index is the inode offset within the inode cluster.
137  */
138 STATIC int
139 xrep_ibt_check_ifree(
140 	struct xrep_ibt		*ri,
141 	xfs_agino_t		cluster_ag_base,
142 	struct xfs_buf		*cluster_bp,
143 	unsigned int		cluster_index,
144 	bool			*inuse)
145 {
146 	struct xfs_scrub	*sc = ri->sc;
147 	struct xfs_mount	*mp = sc->mp;
148 	struct xfs_dinode	*dip;
149 	xfs_agino_t		agino;
150 	unsigned int		cluster_buf_base;
151 	unsigned int		offset;
152 	int			error;
153 
154 	agino = cluster_ag_base + cluster_index;
155 
156 	/* Inode uncached or half assembled, read disk buffer */
157 	cluster_buf_base = XFS_INO_TO_OFFSET(mp, cluster_ag_base);
158 	offset = (cluster_buf_base + cluster_index) * mp->m_sb.sb_inodesize;
159 	if (offset >= BBTOB(cluster_bp->b_length))
160 		return -EFSCORRUPTED;
161 	dip = xfs_buf_offset(cluster_bp, offset);
162 	if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)
163 		return -EFSCORRUPTED;
164 
165 	if (dip->di_version >= 3 &&
166 	    be64_to_cpu(dip->di_ino) != xfs_agino_to_ino(ri->sc->sa.pag, agino))
167 		return -EFSCORRUPTED;
168 
169 	/* Will the in-core inode tell us if it's in use? */
170 	error = xchk_inode_is_allocated(sc, agino, inuse);
171 	if (!error)
172 		return 0;
173 
174 	*inuse = dip->di_mode != 0;
175 	return 0;
176 }
177 
178 /* Stash the accumulated inobt record for rebuilding. */
179 STATIC int
180 xrep_ibt_stash(
181 	struct xrep_ibt		*ri)
182 {
183 	int			error = 0;
184 
185 	if (xchk_should_terminate(ri->sc, &error))
186 		return error;
187 
188 	ri->rie.ir_freecount = xfs_inobt_rec_freecount(&ri->rie);
189 	if (xfs_inobt_check_irec(ri->sc->sa.pag, &ri->rie) != NULL)
190 		return -EFSCORRUPTED;
191 
192 	if (ri->rie.ir_freecount > 0)
193 		ri->finobt_recs++;
194 
195 	trace_xrep_ibt_found(ri->sc->sa.pag, &ri->rie);
196 
197 	error = xfarray_append(ri->inode_records, &ri->rie);
198 	if (error)
199 		return error;
200 
201 	ri->rie.ir_startino = NULLAGINO;
202 	return 0;
203 }
204 
205 /*
206  * Given an extent of inodes and an inode cluster buffer, calculate the
207  * location of the corresponding inobt record (creating it if necessary),
208  * then update the parts of the holemask and freemask of that record that
209  * correspond to the inode extent we were given.
210  *
211  * @cluster_ir_startino is the AG inode number of an inobt record that we're
212  * proposing to create for this inode cluster.  If sparse inodes are enabled,
213  * we must round down to a chunk boundary to find the actual sparse record.
214  * @cluster_bp is the buffer of the inode cluster.
215  * @nr_inodes is the number of inodes to check from the cluster.
216  */
217 STATIC int
218 xrep_ibt_cluster_record(
219 	struct xrep_ibt		*ri,
220 	xfs_agino_t		cluster_ir_startino,
221 	struct xfs_buf		*cluster_bp,
222 	unsigned int		nr_inodes)
223 {
224 	struct xfs_scrub	*sc = ri->sc;
225 	struct xfs_mount	*mp = sc->mp;
226 	xfs_agino_t		ir_startino;
227 	unsigned int		cluster_base;
228 	unsigned int		cluster_index;
229 	int			error = 0;
230 
231 	ir_startino = cluster_ir_startino;
232 	if (xfs_has_sparseinodes(mp))
233 		ir_startino = rounddown(ir_startino, XFS_INODES_PER_CHUNK);
234 	cluster_base = cluster_ir_startino - ir_startino;
235 
236 	/*
237 	 * If the accumulated inobt record doesn't map this cluster, add it to
238 	 * the list and reset it.
239 	 */
240 	if (ri->rie.ir_startino != NULLAGINO &&
241 	    ri->rie.ir_startino + XFS_INODES_PER_CHUNK <= ir_startino) {
242 		error = xrep_ibt_stash(ri);
243 		if (error)
244 			return error;
245 	}
246 
247 	if (ri->rie.ir_startino == NULLAGINO) {
248 		ri->rie.ir_startino = ir_startino;
249 		ri->rie.ir_free = XFS_INOBT_ALL_FREE;
250 		ri->rie.ir_holemask = 0xFFFF;
251 		ri->rie.ir_count = 0;
252 	}
253 
254 	/* Record the whole cluster. */
255 	ri->icount += nr_inodes;
256 	ri->rie.ir_count += nr_inodes;
257 	ri->rie.ir_holemask &= ~xfs_inobt_maskn(
258 				cluster_base / XFS_INODES_PER_HOLEMASK_BIT,
259 				nr_inodes / XFS_INODES_PER_HOLEMASK_BIT);
260 
261 	/* Which inodes within this cluster are free? */
262 	for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) {
263 		bool		inuse = false;
264 
265 		error = xrep_ibt_check_ifree(ri, cluster_ir_startino,
266 				cluster_bp, cluster_index, &inuse);
267 		if (error)
268 			return error;
269 		if (!inuse)
270 			continue;
271 		ri->iused++;
272 		ri->rie.ir_free &= ~XFS_INOBT_MASK(cluster_base +
273 						   cluster_index);
274 	}
275 	return 0;
276 }
277 
278 /*
279  * For each inode cluster covering the physical extent recorded by the rmapbt,
280  * we must calculate the properly aligned startino of that cluster, then
281  * iterate each cluster to fill in used and filled masks appropriately.  We
282  * then use the (startino, used, filled) information to construct the
283  * appropriate inode records.
284  */
285 STATIC int
286 xrep_ibt_process_cluster(
287 	struct xrep_ibt		*ri,
288 	xfs_agblock_t		cluster_bno)
289 {
290 	struct xfs_imap		imap;
291 	struct xfs_buf		*cluster_bp;
292 	struct xfs_scrub	*sc = ri->sc;
293 	struct xfs_mount	*mp = sc->mp;
294 	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
295 	xfs_agino_t		cluster_ag_base;
296 	xfs_agino_t		irec_index;
297 	unsigned int		nr_inodes;
298 	int			error;
299 
300 	nr_inodes = min_t(unsigned int, igeo->inodes_per_cluster,
301 			XFS_INODES_PER_CHUNK);
302 
303 	/*
304 	 * Grab the inode cluster buffer.  This is safe to do with a broken
305 	 * inobt because imap_to_bp directly maps the buffer without touching
306 	 * either inode btree.
307 	 */
308 	imap.im_blkno = xfs_agbno_to_daddr(sc->sa.pag, cluster_bno);
309 	imap.im_len = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster);
310 	imap.im_boffset = 0;
311 	error = xfs_imap_to_bp(mp, sc->tp, &imap, &cluster_bp);
312 	if (error)
313 		return error;
314 
315 	/*
316 	 * Record the contents of each possible inobt record mapping this
317 	 * cluster.
318 	 */
319 	cluster_ag_base = XFS_AGB_TO_AGINO(mp, cluster_bno);
320 	for (irec_index = 0;
321 	     irec_index < igeo->inodes_per_cluster;
322 	     irec_index += XFS_INODES_PER_CHUNK) {
323 		error = xrep_ibt_cluster_record(ri,
324 				cluster_ag_base + irec_index, cluster_bp,
325 				nr_inodes);
326 		if (error)
327 			break;
328 
329 	}
330 
331 	xfs_trans_brelse(sc->tp, cluster_bp);
332 	return error;
333 }
334 
335 /* Check for any obvious conflicts in the inode chunk extent. */
336 STATIC int
337 xrep_ibt_check_inode_ext(
338 	struct xfs_scrub	*sc,
339 	xfs_agblock_t		agbno,
340 	xfs_extlen_t		len)
341 {
342 	struct xfs_mount	*mp = sc->mp;
343 	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
344 	xfs_agino_t		agino;
345 	enum xbtree_recpacking	outcome;
346 	int			error;
347 
348 	/* Inode records must be within the AG. */
349 	if (!xfs_verify_agbext(sc->sa.pag, agbno, len))
350 		return -EFSCORRUPTED;
351 
352 	/* The entire record must align to the inode cluster size. */
353 	if (!IS_ALIGNED(agbno, igeo->blocks_per_cluster) ||
354 	    !IS_ALIGNED(agbno + len, igeo->blocks_per_cluster))
355 		return -EFSCORRUPTED;
356 
357 	/*
358 	 * The entire record must also adhere to the inode cluster alignment
359 	 * size if sparse inodes are not enabled.
360 	 */
361 	if (!xfs_has_sparseinodes(mp) &&
362 	    (!IS_ALIGNED(agbno, igeo->cluster_align) ||
363 	     !IS_ALIGNED(agbno + len, igeo->cluster_align)))
364 		return -EFSCORRUPTED;
365 
366 	/*
367 	 * On a sparse inode fs, this cluster could be part of a sparse chunk.
368 	 * Sparse clusters must be aligned to sparse chunk alignment.
369 	 */
370 	if (xfs_has_sparseinodes(mp) && mp->m_sb.sb_spino_align &&
371 	    (!IS_ALIGNED(agbno, mp->m_sb.sb_spino_align) ||
372 	     !IS_ALIGNED(agbno + len, mp->m_sb.sb_spino_align)))
373 		return -EFSCORRUPTED;
374 
375 	/* Make sure the entire range of blocks are valid AG inodes. */
376 	agino = XFS_AGB_TO_AGINO(mp, agbno);
377 	if (!xfs_verify_agino(sc->sa.pag, agino))
378 		return -EFSCORRUPTED;
379 
380 	agino = XFS_AGB_TO_AGINO(mp, agbno + len) - 1;
381 	if (!xfs_verify_agino(sc->sa.pag, agino))
382 		return -EFSCORRUPTED;
383 
384 	/* Make sure this isn't free space. */
385 	error = xfs_alloc_has_records(sc->sa.bno_cur, agbno, len, &outcome);
386 	if (error)
387 		return error;
388 	if (outcome != XBTREE_RECPACKING_EMPTY)
389 		return -EFSCORRUPTED;
390 
391 	return 0;
392 }
393 
394 /* Found a fragment of the old inode btrees; dispose of them later. */
395 STATIC int
396 xrep_ibt_record_old_btree_blocks(
397 	struct xrep_ibt			*ri,
398 	const struct xfs_rmap_irec	*rec)
399 {
400 	if (!xfs_verify_agbext(ri->sc->sa.pag, rec->rm_startblock,
401 				rec->rm_blockcount))
402 		return -EFSCORRUPTED;
403 
404 	return xagb_bitmap_set(&ri->old_iallocbt_blocks, rec->rm_startblock,
405 			rec->rm_blockcount);
406 }
407 
408 /* Record extents that belong to inode cluster blocks. */
409 STATIC int
410 xrep_ibt_record_inode_blocks(
411 	struct xrep_ibt			*ri,
412 	const struct xfs_rmap_irec	*rec)
413 {
414 	struct xfs_mount		*mp = ri->sc->mp;
415 	struct xfs_ino_geometry		*igeo = M_IGEO(mp);
416 	xfs_agblock_t			cluster_base;
417 	int				error;
418 
419 	error = xrep_ibt_check_inode_ext(ri->sc, rec->rm_startblock,
420 			rec->rm_blockcount);
421 	if (error)
422 		return error;
423 
424 	trace_xrep_ibt_walk_rmap(ri->sc->sa.pag, rec);
425 
426 	/*
427 	 * Record the free/hole masks for each inode cluster that could be
428 	 * mapped by this rmap record.
429 	 */
430 	for (cluster_base = 0;
431 	     cluster_base < rec->rm_blockcount;
432 	     cluster_base += igeo->blocks_per_cluster) {
433 		error = xrep_ibt_process_cluster(ri,
434 				rec->rm_startblock + cluster_base);
435 		if (error)
436 			return error;
437 	}
438 
439 	return 0;
440 }
441 
442 STATIC int
443 xrep_ibt_walk_rmap(
444 	struct xfs_btree_cur		*cur,
445 	const struct xfs_rmap_irec	*rec,
446 	void				*priv)
447 {
448 	struct xrep_ibt			*ri = priv;
449 	int				error = 0;
450 
451 	if (xchk_should_terminate(ri->sc, &error))
452 		return error;
453 
454 	switch (rec->rm_owner) {
455 	case XFS_RMAP_OWN_INOBT:
456 		return xrep_ibt_record_old_btree_blocks(ri, rec);
457 	case XFS_RMAP_OWN_INODES:
458 		return xrep_ibt_record_inode_blocks(ri, rec);
459 	}
460 	return 0;
461 }
462 
463 /*
464  * Iterate all reverse mappings to find the inodes (OWN_INODES) and the inode
465  * btrees (OWN_INOBT).  Figure out if we have enough free space to reconstruct
466  * the inode btrees.  The caller must clean up the lists if anything goes
467  * wrong.
468  */
469 STATIC int
470 xrep_ibt_find_inodes(
471 	struct xrep_ibt		*ri)
472 {
473 	struct xfs_scrub	*sc = ri->sc;
474 	int			error;
475 
476 	ri->rie.ir_startino = NULLAGINO;
477 
478 	/* Collect all reverse mappings for inode blocks. */
479 	xrep_ag_btcur_init(sc, &sc->sa);
480 	error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_ibt_walk_rmap, ri);
481 	xchk_ag_btcur_free(&sc->sa);
482 	if (error)
483 		return error;
484 
485 	/* If we have a record ready to go, add it to the array. */
486 	if (ri->rie.ir_startino != NULLAGINO)
487 		return xrep_ibt_stash(ri);
488 
489 	return 0;
490 }
491 
492 /* Update the AGI counters. */
493 STATIC int
494 xrep_ibt_reset_counters(
495 	struct xrep_ibt		*ri)
496 {
497 	struct xfs_scrub	*sc = ri->sc;
498 	struct xfs_agi		*agi = sc->sa.agi_bp->b_addr;
499 	unsigned int		freecount = ri->icount - ri->iused;
500 
501 	/* Trigger inode count recalculation */
502 	xfs_force_summary_recalc(sc->mp);
503 
504 	/*
505 	 * The AGI header contains extra information related to the inode
506 	 * btrees, so we must update those fields here.
507 	 */
508 	agi->agi_count = cpu_to_be32(ri->icount);
509 	agi->agi_freecount = cpu_to_be32(freecount);
510 	xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp,
511 			   XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
512 
513 	/* Reinitialize with the values we just logged. */
514 	return xrep_reinit_pagi(sc);
515 }
516 
517 /* Retrieve finobt data for bulk load. */
518 STATIC int
519 xrep_fibt_get_records(
520 	struct xfs_btree_cur		*cur,
521 	unsigned int			idx,
522 	struct xfs_btree_block		*block,
523 	unsigned int			nr_wanted,
524 	void				*priv)
525 {
526 	struct xfs_inobt_rec_incore	*irec = &cur->bc_rec.i;
527 	struct xrep_ibt			*ri = priv;
528 	union xfs_btree_rec		*block_rec;
529 	unsigned int			loaded;
530 	int				error;
531 
532 	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
533 		do {
534 			error = xfarray_load(ri->inode_records,
535 					ri->array_cur++, irec);
536 		} while (error == 0 && xfs_inobt_rec_freecount(irec) == 0);
537 		if (error)
538 			return error;
539 
540 		block_rec = xfs_btree_rec_addr(cur, idx, block);
541 		cur->bc_ops->init_rec_from_cur(cur, block_rec);
542 	}
543 
544 	return loaded;
545 }
546 
547 /* Retrieve inobt data for bulk load. */
548 STATIC int
549 xrep_ibt_get_records(
550 	struct xfs_btree_cur		*cur,
551 	unsigned int			idx,
552 	struct xfs_btree_block		*block,
553 	unsigned int			nr_wanted,
554 	void				*priv)
555 {
556 	struct xfs_inobt_rec_incore	*irec = &cur->bc_rec.i;
557 	struct xrep_ibt			*ri = priv;
558 	union xfs_btree_rec		*block_rec;
559 	unsigned int			loaded;
560 	int				error;
561 
562 	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
563 		error = xfarray_load(ri->inode_records, ri->array_cur++, irec);
564 		if (error)
565 			return error;
566 
567 		block_rec = xfs_btree_rec_addr(cur, idx, block);
568 		cur->bc_ops->init_rec_from_cur(cur, block_rec);
569 	}
570 
571 	return loaded;
572 }
573 
574 /* Feed one of the new inobt blocks to the bulk loader. */
575 STATIC int
576 xrep_ibt_claim_block(
577 	struct xfs_btree_cur	*cur,
578 	union xfs_btree_ptr	*ptr,
579 	void			*priv)
580 {
581 	struct xrep_ibt		*ri = priv;
582 
583 	return xrep_newbt_claim_block(cur, &ri->new_inobt, ptr);
584 }
585 
586 /* Feed one of the new finobt blocks to the bulk loader. */
587 STATIC int
588 xrep_fibt_claim_block(
589 	struct xfs_btree_cur	*cur,
590 	union xfs_btree_ptr	*ptr,
591 	void			*priv)
592 {
593 	struct xrep_ibt		*ri = priv;
594 
595 	return xrep_newbt_claim_block(cur, &ri->new_finobt, ptr);
596 }
597 
598 /* Make sure the records do not overlap in inumber address space. */
599 STATIC int
600 xrep_ibt_check_overlap(
601 	struct xrep_ibt			*ri)
602 {
603 	struct xfs_inobt_rec_incore	irec;
604 	xfarray_idx_t			cur;
605 	xfs_agino_t			next_agino = 0;
606 	int				error = 0;
607 
608 	foreach_xfarray_idx(ri->inode_records, cur) {
609 		if (xchk_should_terminate(ri->sc, &error))
610 			return error;
611 
612 		error = xfarray_load(ri->inode_records, cur, &irec);
613 		if (error)
614 			return error;
615 
616 		if (irec.ir_startino < next_agino)
617 			return -EFSCORRUPTED;
618 
619 		next_agino = irec.ir_startino + XFS_INODES_PER_CHUNK;
620 	}
621 
622 	return error;
623 }
624 
625 /* Build new inode btrees and dispose of the old one. */
626 STATIC int
627 xrep_ibt_build_new_trees(
628 	struct xrep_ibt		*ri)
629 {
630 	struct xfs_scrub	*sc = ri->sc;
631 	struct xfs_btree_cur	*ino_cur;
632 	struct xfs_btree_cur	*fino_cur = NULL;
633 	bool			need_finobt;
634 	int			error;
635 
636 	need_finobt = xfs_has_finobt(sc->mp);
637 
638 	/*
639 	 * Create new btrees for staging all the inobt records we collected
640 	 * earlier.  The records were collected in order of increasing agino,
641 	 * so we do not have to sort them.  Ensure there are no overlapping
642 	 * records.
643 	 */
644 	error = xrep_ibt_check_overlap(ri);
645 	if (error)
646 		return error;
647 
648 	/*
649 	 * The new inode btrees will not be rooted in the AGI until we've
650 	 * successfully rebuilt the tree.
651 	 *
652 	 * Start by setting up the inobt staging cursor.
653 	 */
654 	xrep_newbt_init_ag(&ri->new_inobt, sc, &XFS_RMAP_OINFO_INOBT,
655 			xfs_agbno_to_fsb(sc->sa.pag, XFS_IBT_BLOCK(sc->mp)),
656 			XFS_AG_RESV_NONE);
657 	ri->new_inobt.bload.claim_block = xrep_ibt_claim_block;
658 	ri->new_inobt.bload.get_records = xrep_ibt_get_records;
659 
660 	ino_cur = xfs_inobt_init_cursor(sc->sa.pag, NULL, NULL);
661 	xfs_btree_stage_afakeroot(ino_cur, &ri->new_inobt.afake);
662 	error = xfs_btree_bload_compute_geometry(ino_cur, &ri->new_inobt.bload,
663 			xfarray_length(ri->inode_records));
664 	if (error)
665 		goto err_inocur;
666 
667 	/* Set up finobt staging cursor. */
668 	if (need_finobt) {
669 		enum xfs_ag_resv_type	resv = XFS_AG_RESV_METADATA;
670 
671 		if (sc->mp->m_finobt_nores)
672 			resv = XFS_AG_RESV_NONE;
673 
674 		xrep_newbt_init_ag(&ri->new_finobt, sc, &XFS_RMAP_OINFO_INOBT,
675 				xfs_agbno_to_fsb(sc->sa.pag, XFS_FIBT_BLOCK(sc->mp)),
676 				resv);
677 		ri->new_finobt.bload.claim_block = xrep_fibt_claim_block;
678 		ri->new_finobt.bload.get_records = xrep_fibt_get_records;
679 
680 		fino_cur = xfs_finobt_init_cursor(sc->sa.pag, NULL, NULL);
681 		xfs_btree_stage_afakeroot(fino_cur, &ri->new_finobt.afake);
682 		error = xfs_btree_bload_compute_geometry(fino_cur,
683 				&ri->new_finobt.bload, ri->finobt_recs);
684 		if (error)
685 			goto err_finocur;
686 	}
687 
688 	/* Last chance to abort before we start committing fixes. */
689 	if (xchk_should_terminate(sc, &error))
690 		goto err_finocur;
691 
692 	/* Reserve all the space we need to build the new btrees. */
693 	error = xrep_newbt_alloc_blocks(&ri->new_inobt,
694 			ri->new_inobt.bload.nr_blocks);
695 	if (error)
696 		goto err_finocur;
697 
698 	if (need_finobt) {
699 		error = xrep_newbt_alloc_blocks(&ri->new_finobt,
700 				ri->new_finobt.bload.nr_blocks);
701 		if (error)
702 			goto err_finocur;
703 	}
704 
705 	/* Add all inobt records. */
706 	ri->array_cur = XFARRAY_CURSOR_INIT;
707 	error = xfs_btree_bload(ino_cur, &ri->new_inobt.bload, ri);
708 	if (error)
709 		goto err_finocur;
710 
711 	/* Add all finobt records. */
712 	if (need_finobt) {
713 		ri->array_cur = XFARRAY_CURSOR_INIT;
714 		error = xfs_btree_bload(fino_cur, &ri->new_finobt.bload, ri);
715 		if (error)
716 			goto err_finocur;
717 	}
718 
719 	/*
720 	 * Install the new btrees in the AG header.  After this point the old
721 	 * btrees are no longer accessible and the new trees are live.
722 	 */
723 	xfs_inobt_commit_staged_btree(ino_cur, sc->tp, sc->sa.agi_bp);
724 	xfs_btree_del_cursor(ino_cur, 0);
725 
726 	if (fino_cur) {
727 		xfs_inobt_commit_staged_btree(fino_cur, sc->tp, sc->sa.agi_bp);
728 		xfs_btree_del_cursor(fino_cur, 0);
729 	}
730 
731 	/* Reset the AGI counters now that we've changed the inode roots. */
732 	error = xrep_ibt_reset_counters(ri);
733 	if (error)
734 		goto err_finobt;
735 
736 	/* Free unused blocks and bitmap. */
737 	if (need_finobt) {
738 		error = xrep_newbt_commit(&ri->new_finobt);
739 		if (error)
740 			goto err_inobt;
741 	}
742 	error = xrep_newbt_commit(&ri->new_inobt);
743 	if (error)
744 		return error;
745 
746 	return xrep_roll_ag_trans(sc);
747 
748 err_finocur:
749 	if (need_finobt)
750 		xfs_btree_del_cursor(fino_cur, error);
751 err_inocur:
752 	xfs_btree_del_cursor(ino_cur, error);
753 err_finobt:
754 	if (need_finobt)
755 		xrep_newbt_cancel(&ri->new_finobt);
756 err_inobt:
757 	xrep_newbt_cancel(&ri->new_inobt);
758 	return error;
759 }
760 
761 /*
762  * Now that we've logged the roots of the new btrees, invalidate all of the
763  * old blocks and free them.
764  */
765 STATIC int
766 xrep_ibt_remove_old_trees(
767 	struct xrep_ibt		*ri)
768 {
769 	struct xfs_scrub	*sc = ri->sc;
770 	int			error;
771 
772 	/*
773 	 * Free the old inode btree blocks if they're not in use.  It's ok to
774 	 * reap with XFS_AG_RESV_NONE even if the finobt had a per-AG
775 	 * reservation because we reset the reservation before releasing the
776 	 * AGI and AGF header buffer locks.
777 	 */
778 	error = xrep_reap_agblocks(sc, &ri->old_iallocbt_blocks,
779 			&XFS_RMAP_OINFO_INOBT, XFS_AG_RESV_NONE);
780 	if (error)
781 		return error;
782 
783 	/*
784 	 * If the finobt is enabled and has a per-AG reservation, make sure we
785 	 * reinitialize the per-AG reservations.
786 	 */
787 	if (xfs_has_finobt(sc->mp) && !sc->mp->m_finobt_nores)
788 		sc->flags |= XREP_RESET_PERAG_RESV;
789 
790 	return 0;
791 }
792 
793 /* Repair both inode btrees. */
794 int
795 xrep_iallocbt(
796 	struct xfs_scrub	*sc)
797 {
798 	struct xrep_ibt		*ri;
799 	struct xfs_mount	*mp = sc->mp;
800 	char			*descr;
801 	xfs_agino_t		first_agino, last_agino;
802 	int			error = 0;
803 
804 	/* We require the rmapbt to rebuild anything. */
805 	if (!xfs_has_rmapbt(mp))
806 		return -EOPNOTSUPP;
807 
808 	ri = kzalloc(sizeof(struct xrep_ibt), XCHK_GFP_FLAGS);
809 	if (!ri)
810 		return -ENOMEM;
811 	ri->sc = sc;
812 
813 	/* We rebuild both inode btrees. */
814 	sc->sick_mask = XFS_SICK_AG_INOBT | XFS_SICK_AG_FINOBT;
815 
816 	/* Set up enough storage to handle an AG with nothing but inodes. */
817 	xfs_agino_range(mp, pag_agno(sc->sa.pag), &first_agino, &last_agino);
818 	last_agino /= XFS_INODES_PER_CHUNK;
819 	descr = xchk_xfile_ag_descr(sc, "inode index records");
820 	error = xfarray_create(descr, last_agino,
821 			sizeof(struct xfs_inobt_rec_incore),
822 			&ri->inode_records);
823 	kfree(descr);
824 	if (error)
825 		goto out_ri;
826 
827 	/* Collect the inode data and find the old btree blocks. */
828 	xagb_bitmap_init(&ri->old_iallocbt_blocks);
829 	error = xrep_ibt_find_inodes(ri);
830 	if (error)
831 		goto out_bitmap;
832 
833 	/* Rebuild the inode indexes. */
834 	error = xrep_ibt_build_new_trees(ri);
835 	if (error)
836 		goto out_bitmap;
837 
838 	/* Kill the old tree. */
839 	error = xrep_ibt_remove_old_trees(ri);
840 	if (error)
841 		goto out_bitmap;
842 
843 out_bitmap:
844 	xagb_bitmap_destroy(&ri->old_iallocbt_blocks);
845 	xfarray_destroy(ri->inode_records);
846 out_ri:
847 	kfree(ri);
848 	return error;
849 }
850 
851 /* Make sure both btrees are ok after we've rebuilt them. */
852 int
853 xrep_revalidate_iallocbt(
854 	struct xfs_scrub	*sc)
855 {
856 	__u32			old_type = sc->sm->sm_type;
857 	int			error;
858 
859 	/*
860 	 * We must update sm_type temporarily so that the tree-to-tree cross
861 	 * reference checks will work in the correct direction, and also so
862 	 * that tracing will report correctly if there are more errors.
863 	 */
864 	sc->sm->sm_type = XFS_SCRUB_TYPE_INOBT;
865 	error = xchk_iallocbt(sc);
866 	if (error)
867 		goto out;
868 
869 	if (xfs_has_finobt(sc->mp)) {
870 		sc->sm->sm_type = XFS_SCRUB_TYPE_FINOBT;
871 		error = xchk_iallocbt(sc);
872 	}
873 
874 out:
875 	sc->sm->sm_type = old_type;
876 	return error;
877 }
878