// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_inode.h"
#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_refcount.h"
#include "xfs_refcount_btree.h"
#include "xfs_error.h"
#include "xfs_ag.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/agb_bitmap.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/newbt.h"
#include "scrub/reap.h"

/*
 * Rebuilding the Reference Count Btree
 * ====================================
 *
 * This algorithm is "borrowed" from xfs_repair.  Imagine the rmap
 * entries as rectangles representing extents of physical blocks, and
 * that the rectangles can be laid down to allow them to overlap each
 * other; then we know that we must emit a refcnt btree entry wherever
 * the amount of overlap changes, i.e. the emission stimulus is
 * level-triggered:
 *
 *                 -    ---
 *       --      ----- ----   ---        ------
 * --   ----     ----------- ----     ---------
 * -------------------------------- -----------
 * ^ ^  ^^ ^^    ^ ^^ ^^^  ^^^^  ^ ^^ ^  ^     ^
 * 2 1  23 21    3 43 234  2123  1 01 2  3     0
 *
 * For our purposes, an rmap is a tuple (startblock, len, fileoff, owner).
 *
 * Note that in the actual refcnt btree we don't store the refcount < 2
 * cases because the bnobt tells us which blocks are free; single-use
 * blocks aren't recorded in the bnobt or the refcntbt.  If the rmapbt
 * supports storing multiple entries covering a given block we could
 * theoretically dispense with the refcntbt and simply count rmaps, but
 * that's inefficient in the (hot) write path, so we'll take the cost of
 * the extra tree to save time.  Also there's no guarantee that rmap
 * will be enabled.
 *
 * Given an array of rmaps sorted by physical block number, a starting
 * physical block (sp), a bag to hold rmaps that cover sp, and the next
 * physical block where the level changes (np), we can reconstruct the
 * refcount btree as follows:
 *
 * While there are still unprocessed rmaps in the array,
 *  - Set sp to the physical block (pblk) of the next unprocessed rmap.
 *  - Add to the bag all rmaps in the array where startblock == sp.
 *  - Set np to the physical block where the bag size will change.  This
 *    is the minimum of (the pblk of the next unprocessed rmap) and
 *    (startblock + len of each rmap in the bag).
 *  - Record the bag size as old_bag_size.
 *
 *  - While the bag isn't empty,
 *     - Remove from the bag all rmaps where startblock + len == np.
 *     - Add to the bag all rmaps in the array where startblock == np.
 *     - If the bag size isn't old_bag_size, store the refcount entry
 *       (sp, np - sp, old_bag_size) in the refcnt btree.
 *     - If the bag is empty, break out of the inner loop.
 *     - Set old_bag_size to the bag size.
 *     - Set sp = np.
 *     - Set np to the physical block where the bag size will change.
 *       This is the minimum of (the pblk of the next unprocessed rmap)
 *       and (startblock + len of each rmap in the bag).
 *
 * Like all the other repairers, we make a list of all the refcount
 * records we need, then reinitialize the refcount btree root and
 * insert all the records.
 *
 * An illustrative, self-contained sketch of this sweep follows this
 * comment.
 */
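
/*
 * Illustrative sketch only (not part of the repair code): the sweep described
 * above, run over a plain pre-sorted array with a small fixed-size bag
 * standing in for the xfarray bag used by the real implementation.  All of
 * the names here are hypothetical, and the sketch assumes no more than 16
 * rmaps ever overlap a single block.
 */
struct xrep_refc_example_rmap {
	unsigned int			startblock;
	unsigned int			blockcount;
};

static void
xrep_refc_example_sweep(
	const struct xrep_refc_example_rmap	*rmaps,
	unsigned int				nr,
	void					(*emit)(unsigned int start,
							unsigned int len,
							unsigned int refcount))
{
	struct xrep_refc_example_rmap	bag[16];	/* rmaps covering sp */
	unsigned int			bag_sz = 0;
	unsigned int			i = 0;

	while (i < nr) {
		unsigned int		sp = rmaps[i].startblock;
		unsigned int		old_bag_sz;

		/* Add to the bag all rmaps starting at sp. */
		while (i < nr && rmaps[i].startblock == sp)
			bag[bag_sz++] = rmaps[i++];
		old_bag_sz = bag_sz;

		while (bag_sz > 0) {
			unsigned int	np = -1U;
			unsigned int	j;

			/* np is the next block where the level can change. */
			if (i < nr)
				np = rmaps[i].startblock;
			for (j = 0; j < bag_sz; j++)
				if (bag[j].startblock + bag[j].blockcount < np)
					np = bag[j].startblock +
					     bag[j].blockcount;

			/* Drop rmaps ending at np; add rmaps starting there. */
			for (j = 0; j < bag_sz; )
				if (bag[j].startblock + bag[j].blockcount == np)
					bag[j] = bag[--bag_sz];
				else
					j++;
			while (i < nr && rmaps[i].startblock == np)
				bag[bag_sz++] = rmaps[i++];

			/*
			 * Emit [sp, np) only when the level changes at np.
			 * The real repair ignores refcounts below 2 and
			 * stashes the rest for the new btree.
			 */
			if (bag_sz != old_bag_sz) {
				emit(sp, np - sp, old_bag_sz);
				sp = np;
			}
			old_bag_sz = bag_sz;
		}
	}
}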

/* The only parts of the rmap that we care about for computing refcounts. */
struct xrep_refc_rmap {
	xfs_agblock_t		startblock;
	xfs_extlen_t		blockcount;
} __packed;

struct xrep_refc {
	/* refcount extents */
	struct xfarray		*refcount_records;

	/* new refcountbt information */
	struct xrep_newbt	new_btree;

	/* old refcountbt blocks */
	struct xagb_bitmap	old_refcountbt_blocks;

	struct xfs_scrub	*sc;

	/* get_records()'s position in the refcount record array. */
	xfarray_idx_t		array_cur;

	/* # of refcountbt blocks */
	xfs_extlen_t		btblocks;
};

/* Check for any obvious conflicts with this shared/CoW staging extent. */
STATIC int
xrep_refc_check_ext(
	struct xfs_scrub		*sc,
	const struct xfs_refcount_irec	*rec)
{
	enum xbtree_recpacking		outcome;
	int				error;

	if (xfs_refcount_check_irec(sc->sa.pag, rec) != NULL)
		return -EFSCORRUPTED;

	/* Make sure this isn't free space. */
	error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rc_startblock,
			rec->rc_blockcount, &outcome);
	if (error)
		return error;
	if (outcome != XBTREE_RECPACKING_EMPTY)
		return -EFSCORRUPTED;

	/* Must not be an inode chunk. */
	error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur,
			rec->rc_startblock, rec->rc_blockcount, &outcome);
	if (error)
		return error;
	if (outcome != XBTREE_RECPACKING_EMPTY)
		return -EFSCORRUPTED;

	return 0;
}

/* Record a reference count extent. */
STATIC int
xrep_refc_stash(
	struct xrep_refc		*rr,
	enum xfs_refc_domain		domain,
	xfs_agblock_t			agbno,
	xfs_extlen_t			len,
	uint64_t			refcount)
{
	struct xfs_refcount_irec	irec = {
		.rc_startblock		= agbno,
		.rc_blockcount		= len,
		.rc_domain		= domain,
	};
	struct xfs_scrub		*sc = rr->sc;
	int				error = 0;

	if (xchk_should_terminate(sc, &error))
		return error;

	irec.rc_refcount = min_t(uint64_t, MAXREFCOUNT, refcount);

	error = xrep_refc_check_ext(rr->sc, &irec);
	if (error)
		return error;

	trace_xrep_refc_found(sc->sa.pag, &irec);

	return xfarray_append(rr->refcount_records, &irec);
}

/* Record a CoW staging extent. */
STATIC int
xrep_refc_stash_cow(
	struct xrep_refc		*rr,
	xfs_agblock_t			agbno,
	xfs_extlen_t			len)
{
	return xrep_refc_stash(rr, XFS_REFC_DOMAIN_COW, agbno, len, 1);
}

/* Decide if an rmap could describe a shared extent. */
static inline bool
xrep_refc_rmap_shareable(
	struct xfs_mount		*mp,
	const struct xfs_rmap_irec	*rmap)
{
	/* AG metadata are never shareable */
	if (XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner))
		return false;

	/* Metadata in files are never shareable */
	if (xfs_internal_inum(mp, rmap->rm_owner))
		return false;

	/* Metadata and unwritten file blocks are not shareable. */
	if (rmap->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK |
			      XFS_RMAP_UNWRITTEN))
		return false;

	return true;
}

/*
 * Walk along the reverse mapping records until we find one that could describe
 * a shared extent.
 */
STATIC int
xrep_refc_walk_rmaps(
	struct xrep_refc	*rr,
	struct xrep_refc_rmap	*rrm,
	bool			*have_rec)
{
	struct xfs_rmap_irec	rmap;
	struct xfs_btree_cur	*cur = rr->sc->sa.rmap_cur;
	struct xfs_mount	*mp = cur->bc_mp;
	int			have_gt;
	int			error = 0;

	*have_rec = false;

	/*
	 * Loop through the remaining rmaps.  Remember CoW staging
	 * extents and the refcountbt blocks from the old tree for later
	 * disposal.  We can only share written data fork extents, so
	 * keep looping until we find an rmap for one.
	 */
	do {
		if (xchk_should_terminate(rr->sc, &error))
			return error;

		error = xfs_btree_increment(cur, 0, &have_gt);
		if (error)
			return error;
		if (!have_gt)
			return 0;

		error = xfs_rmap_get_rec(cur, &rmap, &have_gt);
		if (error)
			return error;
		if (XFS_IS_CORRUPT(mp, !have_gt))
			return -EFSCORRUPTED;

		if (rmap.rm_owner == XFS_RMAP_OWN_COW) {
			error = xrep_refc_stash_cow(rr, rmap.rm_startblock,
					rmap.rm_blockcount);
			if (error)
				return error;
		} else if (rmap.rm_owner == XFS_RMAP_OWN_REFC) {
			/* refcountbt block, dump it when we're done. */
			rr->btblocks += rmap.rm_blockcount;
			error = xagb_bitmap_set(&rr->old_refcountbt_blocks,
					rmap.rm_startblock, rmap.rm_blockcount);
			if (error)
				return error;
		}
	} while (!xrep_refc_rmap_shareable(mp, &rmap));

	rrm->startblock = rmap.rm_startblock;
	rrm->blockcount = rmap.rm_blockcount;
	*have_rec = true;
	return 0;
}

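/*
 * Compute the sort key for a refcount record the same way the ondisk btree
 * encodes it: CoW staging records carry XFS_REFC_COWFLAG in the high bit of
 * the startblock, so they sort after all of the shared-domain records.
 */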
static inline uint32_t
xrep_refc_encode_startblock(
	const struct xfs_refcount_irec	*irec)
{
	uint32_t			start;

	start = irec->rc_startblock & ~XFS_REFC_COWFLAG;
	if (irec->rc_domain == XFS_REFC_DOMAIN_COW)
		start |= XFS_REFC_COWFLAG;

	return start;
}

/* Sort in the same order as the ondisk records. */
static int
xrep_refc_extent_cmp(
	const void			*a,
	const void			*b)
{
	const struct xfs_refcount_irec	*ap = a;
	const struct xfs_refcount_irec	*bp = b;
	uint32_t			sa, sb;

	sa = xrep_refc_encode_startblock(ap);
	sb = xrep_refc_encode_startblock(bp);

	if (sa > sb)
		return 1;
	if (sa < sb)
		return -1;
	return 0;
}

/*
 * Sort the refcount extents by startblock or else the btree records will be in
 * the wrong order.  Make sure the records do not overlap in physical space.
 */
STATIC int
xrep_refc_sort_records(
	struct xrep_refc		*rr)
{
	struct xfs_refcount_irec	irec;
	xfarray_idx_t			cur;
	enum xfs_refc_domain		dom = XFS_REFC_DOMAIN_SHARED;
	xfs_agblock_t			next_agbno = 0;
	int				error;

	error = xfarray_sort(rr->refcount_records, xrep_refc_extent_cmp,
			XFARRAY_SORT_KILLABLE);
	if (error)
		return error;

	foreach_xfarray_idx(rr->refcount_records, cur) {
		if (xchk_should_terminate(rr->sc, &error))
			return error;

		error = xfarray_load(rr->refcount_records, cur, &irec);
		if (error)
			return error;

		if (dom == XFS_REFC_DOMAIN_SHARED &&
		    irec.rc_domain == XFS_REFC_DOMAIN_COW) {
			dom = irec.rc_domain;
			next_agbno = 0;
		}

		if (dom != irec.rc_domain)
			return -EFSCORRUPTED;
		if (irec.rc_startblock < next_agbno)
			return -EFSCORRUPTED;

		next_agbno = irec.rc_startblock + irec.rc_blockcount;
	}

	return error;
}

#define RRM_NEXT(r)	((r).startblock + (r).blockcount)
/*
 * Find the next block where the refcount changes, given the next rmap we
 * looked at and the ones we're already tracking.
 */
static inline int
xrep_refc_next_edge(
	struct xfarray		*rmap_bag,
	struct xrep_refc_rmap	*next_rrm,
	bool			next_valid,
	xfs_agblock_t		*nbnop)
{
	struct xrep_refc_rmap	rrm;
	xfarray_idx_t		array_cur = XFARRAY_CURSOR_INIT;
	xfs_agblock_t		nbno = NULLAGBLOCK;
	int			error;

	if (next_valid)
		nbno = next_rrm->startblock;

	while ((error = xfarray_iter(rmap_bag, &array_cur, &rrm)) == 1)
		nbno = min_t(xfs_agblock_t, nbno, RRM_NEXT(rrm));

	if (error)
		return error;

	/*
	 * We should have found /something/ because either next_rrm is the next
	 * interesting rmap to look at after emitting this refcount extent, or
	 * there are other rmaps in rmap_bag contributing to the current
	 * sharing count.  But if something is seriously wrong, bail out.
	 */
	if (nbno == NULLAGBLOCK)
		return -EFSCORRUPTED;

	*nbnop = nbno;
	return 0;
}

/*
 * Walk forward through the rmap btree to collect all rmaps starting at
 * @bno in @rmap_bag.  These represent the file(s) that share ownership of
 * the current block.  Upon return, the rmap cursor points to the last record
 * satisfying the startblock constraint.
 */
static int
xrep_refc_push_rmaps_at(
	struct xrep_refc	*rr,
	struct xfarray		*rmap_bag,
	xfs_agblock_t		bno,
	struct xrep_refc_rmap	*rrm,
	bool			*have,
	uint64_t		*stack_sz)
{
	struct xfs_scrub	*sc = rr->sc;
	int			have_gt;
	int			error;

	while (*have && rrm->startblock == bno) {
		error = xfarray_store_anywhere(rmap_bag, rrm);
		if (error)
			return error;
		(*stack_sz)++;
		error = xrep_refc_walk_rmaps(rr, rrm, have);
		if (error)
			return error;
	}

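	/*
	 * Back the cursor up one record so that it points at the last record
	 * satisfying the startblock constraint, as promised above; the next
	 * walk will resume from there.
	 */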
	error = xfs_btree_decrement(sc->sa.rmap_cur, 0, &have_gt);
	if (error)
		return error;
	if (XFS_IS_CORRUPT(sc->mp, !have_gt))
		return -EFSCORRUPTED;

	return 0;
}

/* Iterate all the rmap records to generate reference count data. */
STATIC int
xrep_refc_find_refcounts(
	struct xrep_refc	*rr)
{
	struct xrep_refc_rmap	rrm;
	struct xfs_scrub	*sc = rr->sc;
	struct xfarray		*rmap_bag;
	char			*descr;
	uint64_t		old_stack_sz;
	uint64_t		stack_sz = 0;
	xfs_agblock_t		sbno;
	xfs_agblock_t		cbno;
	xfs_agblock_t		nbno;
	bool			have;
	int			error;

	xrep_ag_btcur_init(sc, &sc->sa);

	/*
	 * Set up a sparse array to store all the rmap records that we're
	 * tracking to generate a reference count record.  If this exceeds
	 * MAXREFCOUNT, we clamp rc_refcount.
	 */
	descr = xchk_xfile_ag_descr(sc, "rmap record bag");
	error = xfarray_create(descr, 0, sizeof(struct xrep_refc_rmap),
			&rmap_bag);
	kfree(descr);
	if (error)
		goto out_cur;

	/* Start the rmapbt cursor to the left of all records. */
	error = xfs_btree_goto_left_edge(sc->sa.rmap_cur);
	if (error)
		goto out_bag;

	/* Process reverse mappings into refcount data. */
	while (xfs_btree_has_more_records(sc->sa.rmap_cur)) {
		/* Push all rmaps with pblk == sbno onto the stack */
		error = xrep_refc_walk_rmaps(rr, &rrm, &have);
		if (error)
			goto out_bag;
		if (!have)
			break;
		sbno = cbno = rrm.startblock;
		error = xrep_refc_push_rmaps_at(rr, rmap_bag, sbno,
					&rrm, &have, &stack_sz);
		if (error)
			goto out_bag;

		/* Set nbno to the bno of the next refcount change */
		error = xrep_refc_next_edge(rmap_bag, &rrm, have, &nbno);
		if (error)
			goto out_bag;

		ASSERT(nbno > sbno);
		old_stack_sz = stack_sz;

		/* While stack isn't empty... */
		while (stack_sz) {
			xfarray_idx_t	array_cur = XFARRAY_CURSOR_INIT;

			/* Pop all rmaps that end at nbno */
			while ((error = xfarray_iter(rmap_bag, &array_cur,
								&rrm)) == 1) {
				if (RRM_NEXT(rrm) != nbno)
					continue;
				error = xfarray_unset(rmap_bag, array_cur - 1);
				if (error)
					goto out_bag;
				stack_sz--;
			}
			if (error)
				goto out_bag;

			/* Push array items that start at nbno */
			error = xrep_refc_walk_rmaps(rr, &rrm, &have);
			if (error)
				goto out_bag;
			if (have) {
				error = xrep_refc_push_rmaps_at(rr, rmap_bag,
						nbno, &rrm, &have, &stack_sz);
				if (error)
					goto out_bag;
			}

			/* Emit refcount if necessary */
			ASSERT(nbno > cbno);
			if (stack_sz != old_stack_sz) {
				if (old_stack_sz > 1) {
					error = xrep_refc_stash(rr,
							XFS_REFC_DOMAIN_SHARED,
							cbno, nbno - cbno,
							old_stack_sz);
					if (error)
						goto out_bag;
				}
				cbno = nbno;
			}

			/* Stack empty, go find the next rmap */
			if (stack_sz == 0)
				break;
			old_stack_sz = stack_sz;
			sbno = nbno;

			/* Set nbno to the bno of the next refcount change */
			error = xrep_refc_next_edge(rmap_bag, &rrm, have,
					&nbno);
			if (error)
				goto out_bag;

			ASSERT(nbno > sbno);
		}
	}

	ASSERT(stack_sz == 0);
out_bag:
	xfarray_destroy(rmap_bag);
out_cur:
	xchk_ag_btcur_free(&sc->sa);
	return error;
}
#undef RRM_NEXT

/* Retrieve refcountbt data for bulk load. */
STATIC int
xrep_refc_get_records(
	struct xfs_btree_cur		*cur,
	unsigned int			idx,
	struct xfs_btree_block		*block,
	unsigned int			nr_wanted,
	void				*priv)
{
	struct xfs_refcount_irec	*irec = &cur->bc_rec.rc;
	struct xrep_refc		*rr = priv;
	union xfs_btree_rec		*block_rec;
	unsigned int			loaded;
	int				error;

	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
		error = xfarray_load(rr->refcount_records, rr->array_cur++,
				irec);
		if (error)
			return error;

		block_rec = xfs_btree_rec_addr(cur, idx, block);
		cur->bc_ops->init_rec_from_cur(cur, block_rec);
	}

	return loaded;
}

/* Feed one of the new btree blocks to the bulk loader. */
STATIC int
xrep_refc_claim_block(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	void			*priv)
{
	struct xrep_refc	*rr = priv;

	return xrep_newbt_claim_block(cur, &rr->new_btree, ptr);
}

/* Update the AGF counters. */
STATIC int
xrep_refc_reset_counters(
	struct xrep_refc	*rr)
{
	struct xfs_scrub	*sc = rr->sc;
	struct xfs_perag	*pag = sc->sa.pag;

	/*
	 * After we commit the new btree to disk, it is possible that the
	 * process to reap the old btree blocks will race with the AIL trying
	 * to checkpoint the old btree blocks into the filesystem.  If the new
	 * tree is shorter than the old one, the refcountbt write verifier will
	 * fail and the AIL will shut down the filesystem.
	 *
	 * To avoid this, save the old incore btree height values as the alt
	 * height values before re-initializing the perag info from the updated
	 * AGF to capture all the new values.
	 */
	pag->pagf_repair_refcount_level = pag->pagf_refcount_level;

	/* Reinitialize with the values we just logged. */
	return xrep_reinit_pagf(sc);
}

/*
 * Use the collected refcount information to stage a new refcount btree.  If
 * this is successful we'll return with the new btree root information logged
 * to the repair transaction but not yet committed.
 */
STATIC int
xrep_refc_build_new_tree(
	struct xrep_refc	*rr)
{
	struct xfs_scrub	*sc = rr->sc;
	struct xfs_btree_cur	*refc_cur;
	struct xfs_perag	*pag = sc->sa.pag;
	xfs_fsblock_t		fsbno;
	int			error;

	error = xrep_refc_sort_records(rr);
	if (error)
		return error;

	/*
	 * Prepare to construct the new btree by reserving disk space for the
	 * new btree and setting up all the accounting information we'll need
	 * to root the new btree while it's under construction and before we
	 * attach it to the AG header.
	 */
	fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, xfs_refc_block(sc->mp));
	xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_REFC, fsbno,
			XFS_AG_RESV_METADATA);
	rr->new_btree.bload.get_records = xrep_refc_get_records;
	rr->new_btree.bload.claim_block = xrep_refc_claim_block;

	/* Compute how many blocks we'll need. */
	refc_cur = xfs_refcountbt_stage_cursor(sc->mp, &rr->new_btree.afake,
			pag);
	error = xfs_btree_bload_compute_geometry(refc_cur,
			&rr->new_btree.bload,
			xfarray_length(rr->refcount_records));
	if (error)
		goto err_cur;

	/* Last chance to abort before we start committing fixes. */
	if (xchk_should_terminate(sc, &error))
		goto err_cur;

	/* Reserve the space we'll need for the new btree. */
	error = xrep_newbt_alloc_blocks(&rr->new_btree,
			rr->new_btree.bload.nr_blocks);
	if (error)
		goto err_cur;

	/*
	 * Due to btree slack factors, it's possible for a new btree to be one
	 * level taller than the old btree.  Update the incore btree height so
	 * that we don't trip the verifiers when writing the new btree blocks
	 * to disk.
	 */
	pag->pagf_repair_refcount_level = rr->new_btree.bload.btree_height;

	/* Add all observed refcount records. */
	rr->array_cur = XFARRAY_CURSOR_INIT;
	error = xfs_btree_bload(refc_cur, &rr->new_btree.bload, rr);
	if (error)
		goto err_level;

	/*
	 * Install the new btree in the AG header.  After this point the old
	 * btree is no longer accessible and the new tree is live.
	 */
	xfs_refcountbt_commit_staged_btree(refc_cur, sc->tp, sc->sa.agf_bp);
	xfs_btree_del_cursor(refc_cur, 0);

	/* Reset the AGF counters now that we've changed the btree shape. */
	error = xrep_refc_reset_counters(rr);
	if (error)
		goto err_newbt;

	/* Dispose of any unused blocks and the accounting information. */
	error = xrep_newbt_commit(&rr->new_btree);
	if (error)
		return error;

	return xrep_roll_ag_trans(sc);

err_level:
	pag->pagf_repair_refcount_level = 0;
err_cur:
	xfs_btree_del_cursor(refc_cur, error);
err_newbt:
	xrep_newbt_cancel(&rr->new_btree);
	return error;
}

/*
 * Now that we've logged the roots of the new btrees, invalidate all of the
 * old blocks and free them.
 */
STATIC int
xrep_refc_remove_old_tree(
	struct xrep_refc	*rr)
{
	struct xfs_scrub	*sc = rr->sc;
	struct xfs_perag	*pag = sc->sa.pag;
	int			error;

	/* Free the old refcountbt blocks if they're not in use. */
	error = xrep_reap_agblocks(sc, &rr->old_refcountbt_blocks,
			&XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA);
	if (error)
		return error;

	/*
	 * Now that we've zapped all the old refcountbt blocks we can turn off
	 * the alternate height mechanism and reset the per-AG space
	 * reservations.
	 */
	pag->pagf_repair_refcount_level = 0;
	sc->flags |= XREP_RESET_PERAG_RESV;
	return 0;
}

/* Rebuild the refcount btree. */
int
xrep_refcountbt(
	struct xfs_scrub	*sc)
{
	struct xrep_refc	*rr;
	struct xfs_mount	*mp = sc->mp;
	char			*descr;
	int			error;

	/* We require the rmapbt to rebuild anything. */
	if (!xfs_has_rmapbt(mp))
		return -EOPNOTSUPP;

	rr = kzalloc(sizeof(struct xrep_refc), XCHK_GFP_FLAGS);
	if (!rr)
		return -ENOMEM;
	rr->sc = sc;

	/* Set up enough storage to handle one refcount record per block. */
	descr = xchk_xfile_ag_descr(sc, "reference count records");
	error = xfarray_create(descr, mp->m_sb.sb_agblocks,
			sizeof(struct xfs_refcount_irec),
			&rr->refcount_records);
	kfree(descr);
	if (error)
		goto out_rr;

	/* Collect all reference counts. */
	xagb_bitmap_init(&rr->old_refcountbt_blocks);
	error = xrep_refc_find_refcounts(rr);
	if (error)
		goto out_bitmap;

	/* Rebuild the refcount information. */
	error = xrep_refc_build_new_tree(rr);
	if (error)
		goto out_bitmap;

	/* Kill the old tree. */
	error = xrep_refc_remove_old_tree(rr);
	if (error)
		goto out_bitmap;

out_bitmap:
	xagb_bitmap_destroy(&rr->old_refcountbt_blocks);
	xfarray_destroy(rr->refcount_records);
out_rr:
	kfree(rr);
	return error;
}