xref: /linux/fs/xfs/scrub/alloc_repair.c (revision be239684b18e1cdcafcf8c7face4a2f562c745ad)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_defer.h"
13 #include "xfs_btree.h"
14 #include "xfs_btree_staging.h"
15 #include "xfs_bit.h"
16 #include "xfs_log_format.h"
17 #include "xfs_trans.h"
18 #include "xfs_sb.h"
19 #include "xfs_alloc.h"
20 #include "xfs_alloc_btree.h"
21 #include "xfs_rmap.h"
22 #include "xfs_rmap_btree.h"
23 #include "xfs_inode.h"
24 #include "xfs_refcount.h"
25 #include "xfs_extent_busy.h"
26 #include "xfs_health.h"
27 #include "xfs_bmap.h"
28 #include "xfs_ialloc.h"
29 #include "xfs_ag.h"
30 #include "scrub/xfs_scrub.h"
31 #include "scrub/scrub.h"
32 #include "scrub/common.h"
33 #include "scrub/btree.h"
34 #include "scrub/trace.h"
35 #include "scrub/repair.h"
36 #include "scrub/bitmap.h"
37 #include "scrub/agb_bitmap.h"
38 #include "scrub/xfile.h"
39 #include "scrub/xfarray.h"
40 #include "scrub/newbt.h"
41 #include "scrub/reap.h"
42 
43 /*
44  * Free Space Btree Repair
45  * =======================
46  *
47  * The reverse mappings are supposed to record all space usage for the entire
48  * AG.  Therefore, we can recreate the free extent records in an AG by looking
49  * for gaps in the physical extents recorded in the rmapbt.  These records are
50  * staged in @free_records.  Identifying the gaps is more difficult on a
51  * reflink filesystem because rmap records are allowed to overlap.
52  *
53  * Because the final step of building a new index is to free the space used by
54  * the old index, repair needs to find that space.  Unfortunately, all
55  * structures that live in the free space (bnobt, cntbt, rmapbt, agfl) share
56  * the same rmapbt owner code (OWN_AG), so this is not straightforward.
57  *
58  * The scan of the reverse mapping information records the space used by OWN_AG
59  * in @old_allocbt_blocks, which (at this stage) is somewhat misnamed.  While
60  * walking the rmapbt records, we create a second bitmap @not_allocbt_blocks to
61  * record all visited rmap btree blocks and all blocks owned by the AGFL.
62  *
 * After that is where the definition of old_allocbt_blocks shifts.  This
64  * expression identifies possible former bnobt/cntbt blocks:
65  *
66  *	(OWN_AG blocks) & ~(rmapbt blocks | agfl blocks);
67  *
68  * Substituting from above definitions, that becomes:
69  *
70  *	old_allocbt_blocks & ~not_allocbt_blocks
71  *
72  * The OWN_AG bitmap itself isn't needed after this point, so what we really do
73  * instead is:
74  *
75  *	old_allocbt_blocks &= ~not_allocbt_blocks;
76  *
77  * After this point, @old_allocbt_blocks is a bitmap of alleged former
78  * bnobt/cntbt blocks.  The xagb_bitmap_disunion operation modifies its first
79  * parameter in place to avoid copying records around.
80  *
 * Next, some of the space described by @free_records is diverted to the newbt
82  * reservation and used to format new btree blocks.  The remaining records are
83  * written to the new btree indices.  We reconstruct both bnobt and cntbt at
84  * the same time since we've already done all the work.
85  *
86  * We use the prefix 'xrep_abt' here because we regenerate both free space
87  * allocation btrees at the same time.
88  */
89 
90 struct xrep_abt {
91 	/* Blocks owned by the rmapbt or the agfl. */
92 	struct xagb_bitmap	not_allocbt_blocks;
93 
94 	/* All OWN_AG blocks. */
95 	struct xagb_bitmap	old_allocbt_blocks;
96 
97 	/*
98 	 * New bnobt information.  All btree block reservations are added to
99 	 * the reservation list in new_bnobt.
100 	 */
101 	struct xrep_newbt	new_bnobt;
102 
103 	/* new cntbt information */
104 	struct xrep_newbt	new_cntbt;
105 
106 	/* Free space extents. */
107 	struct xfarray		*free_records;
108 
109 	struct xfs_scrub	*sc;
110 
111 	/* Number of non-null records in @free_records. */
112 	uint64_t		nr_real_records;
113 
114 	/* get_records()'s position in the free space record array. */
115 	xfarray_idx_t		array_cur;
116 
117 	/*
118 	 * Next block we anticipate seeing in the rmap records.  If the next
119 	 * rmap record is greater than next_agbno, we have found unused space.
120 	 */
121 	xfs_agblock_t		next_agbno;
122 
123 	/* Number of free blocks in this AG. */
124 	xfs_agblock_t		nr_blocks;
125 
126 	/* Longest free extent we found in the AG. */
127 	xfs_agblock_t		longest;
128 };
129 
130 /* Set up to repair AG free space btrees. */
131 int
132 xrep_setup_ag_allocbt(
133 	struct xfs_scrub	*sc)
134 {
135 	unsigned int		busy_gen;
136 
137 	/*
138 	 * Make sure the busy extent list is clear because we can't put extents
139 	 * on there twice.
140 	 */
141 	busy_gen = READ_ONCE(sc->sa.pag->pagb_gen);
142 	if (xfs_extent_busy_list_empty(sc->sa.pag))
143 		return 0;
144 
145 	return xfs_extent_busy_flush(sc->tp, sc->sa.pag, busy_gen, 0);
146 }
147 
148 /* Check for any obvious conflicts in the free extent. */
149 STATIC int
150 xrep_abt_check_free_ext(
151 	struct xfs_scrub	*sc,
152 	const struct xfs_alloc_rec_incore *rec)
153 {
154 	enum xbtree_recpacking	outcome;
155 	int			error;
156 
157 	if (xfs_alloc_check_irec(sc->sa.pag, rec) != NULL)
158 		return -EFSCORRUPTED;
159 
160 	/* Must not be an inode chunk. */
161 	error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur,
162 			rec->ar_startblock, rec->ar_blockcount, &outcome);
163 	if (error)
164 		return error;
165 	if (outcome != XBTREE_RECPACKING_EMPTY)
166 		return -EFSCORRUPTED;
167 
168 	/* Must not be shared or CoW staging. */
169 	if (sc->sa.refc_cur) {
170 		error = xfs_refcount_has_records(sc->sa.refc_cur,
171 				XFS_REFC_DOMAIN_SHARED, rec->ar_startblock,
172 				rec->ar_blockcount, &outcome);
173 		if (error)
174 			return error;
175 		if (outcome != XBTREE_RECPACKING_EMPTY)
176 			return -EFSCORRUPTED;
177 
178 		error = xfs_refcount_has_records(sc->sa.refc_cur,
179 				XFS_REFC_DOMAIN_COW, rec->ar_startblock,
180 				rec->ar_blockcount, &outcome);
181 		if (error)
182 			return error;
183 		if (outcome != XBTREE_RECPACKING_EMPTY)
184 			return -EFSCORRUPTED;
185 	}
186 
187 	return 0;
188 }
189 
190 /*
191  * Stash a free space record for all the space since the last bno we found
192  * all the way up to @end.
193  */
194 static int
195 xrep_abt_stash(
196 	struct xrep_abt		*ra,
197 	xfs_agblock_t		end)
198 {
199 	struct xfs_alloc_rec_incore arec = {
200 		.ar_startblock	= ra->next_agbno,
201 		.ar_blockcount	= end - ra->next_agbno,
202 	};
203 	struct xfs_scrub	*sc = ra->sc;
204 	int			error = 0;
205 
206 	if (xchk_should_terminate(sc, &error))
207 		return error;
208 
209 	error = xrep_abt_check_free_ext(ra->sc, &arec);
210 	if (error)
211 		return error;
212 
213 	trace_xrep_abt_found(sc->mp, sc->sa.pag->pag_agno, &arec);
214 
215 	error = xfarray_append(ra->free_records, &arec);
216 	if (error)
217 		return error;
218 
219 	ra->nr_blocks += arec.ar_blockcount;
220 	return 0;
221 }
222 
223 /* Record extents that aren't in use from gaps in the rmap records. */
224 STATIC int
225 xrep_abt_walk_rmap(
226 	struct xfs_btree_cur		*cur,
227 	const struct xfs_rmap_irec	*rec,
228 	void				*priv)
229 {
230 	struct xrep_abt			*ra = priv;
231 	int				error;
232 
233 	/* Record all the OWN_AG blocks... */
234 	if (rec->rm_owner == XFS_RMAP_OWN_AG) {
235 		error = xagb_bitmap_set(&ra->old_allocbt_blocks,
236 				rec->rm_startblock, rec->rm_blockcount);
237 		if (error)
238 			return error;
239 	}
240 
241 	/* ...and all the rmapbt blocks... */
242 	error = xagb_bitmap_set_btcur_path(&ra->not_allocbt_blocks, cur);
243 	if (error)
244 		return error;
245 
246 	/* ...and all the free space. */
247 	if (rec->rm_startblock > ra->next_agbno) {
248 		error = xrep_abt_stash(ra, rec->rm_startblock);
249 		if (error)
250 			return error;
251 	}
252 
253 	/*
254 	 * rmap records can overlap on reflink filesystems, so project
255 	 * next_agbno as far out into the AG space as we currently know about.
256 	 */
257 	ra->next_agbno = max_t(xfs_agblock_t, ra->next_agbno,
258 			rec->rm_startblock + rec->rm_blockcount);
259 	return 0;
260 }
261 
262 /* Collect an AGFL block for the not-to-release list. */
263 static int
264 xrep_abt_walk_agfl(
265 	struct xfs_mount	*mp,
266 	xfs_agblock_t		agbno,
267 	void			*priv)
268 {
269 	struct xrep_abt		*ra = priv;
270 
271 	return xagb_bitmap_set(&ra->not_allocbt_blocks, agbno, 1);
272 }
273 
274 /*
275  * Compare two free space extents by block number.  We want to sort in order of
276  * increasing block number.
277  */
278 static int
279 xrep_bnobt_extent_cmp(
280 	const void		*a,
281 	const void		*b)
282 {
283 	const struct xfs_alloc_rec_incore *ap = a;
284 	const struct xfs_alloc_rec_incore *bp = b;
285 
286 	if (ap->ar_startblock > bp->ar_startblock)
287 		return 1;
288 	else if (ap->ar_startblock < bp->ar_startblock)
289 		return -1;
290 	return 0;
291 }
292 
293 /*
294  * Re-sort the free extents by block number so that we can put the records into
295  * the bnobt in the correct order.  Make sure the records do not overlap in
296  * physical space.
297  */
298 STATIC int
299 xrep_bnobt_sort_records(
300 	struct xrep_abt			*ra)
301 {
302 	struct xfs_alloc_rec_incore	arec;
303 	xfarray_idx_t			cur = XFARRAY_CURSOR_INIT;
304 	xfs_agblock_t			next_agbno = 0;
305 	int				error;
306 
307 	error = xfarray_sort(ra->free_records, xrep_bnobt_extent_cmp, 0);
308 	if (error)
309 		return error;
310 
311 	while ((error = xfarray_iter(ra->free_records, &cur, &arec)) == 1) {
312 		if (arec.ar_startblock < next_agbno)
313 			return -EFSCORRUPTED;
314 
315 		next_agbno = arec.ar_startblock + arec.ar_blockcount;
316 	}
317 
318 	return error;
319 }
320 
321 /*
322  * Compare two free space extents by length and then block number.  We want
323  * to sort first in order of increasing length and then in order of increasing
324  * block number.
325  */
326 static int
327 xrep_cntbt_extent_cmp(
328 	const void			*a,
329 	const void			*b)
330 {
331 	const struct xfs_alloc_rec_incore *ap = a;
332 	const struct xfs_alloc_rec_incore *bp = b;
333 
334 	if (ap->ar_blockcount > bp->ar_blockcount)
335 		return 1;
336 	else if (ap->ar_blockcount < bp->ar_blockcount)
337 		return -1;
338 	return xrep_bnobt_extent_cmp(a, b);
339 }
340 
341 /*
342  * Sort the free extents by length so so that we can put the records into the
343  * cntbt in the correct order.  Don't let userspace kill us if we're resorting
344  * after allocating btree blocks.
345  */
346 STATIC int
347 xrep_cntbt_sort_records(
348 	struct xrep_abt			*ra,
349 	bool				is_resort)
350 {
351 	return xfarray_sort(ra->free_records, xrep_cntbt_extent_cmp,
352 			is_resort ? 0 : XFARRAY_SORT_KILLABLE);
353 }
354 
355 /*
356  * Iterate all reverse mappings to find (1) the gaps between rmap records (all
357  * unowned space), (2) the OWN_AG extents (which encompass the free space
358  * btrees, the rmapbt, and the agfl), (3) the rmapbt blocks, and (4) the AGFL
359  * blocks.  The free space is (1) + (2) - (3) - (4).
360  */
361 STATIC int
362 xrep_abt_find_freespace(
363 	struct xrep_abt		*ra)
364 {
365 	struct xfs_scrub	*sc = ra->sc;
366 	struct xfs_mount	*mp = sc->mp;
367 	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
368 	struct xfs_buf		*agfl_bp;
369 	xfs_agblock_t		agend;
370 	int			error;
371 
372 	xagb_bitmap_init(&ra->not_allocbt_blocks);
373 
374 	xrep_ag_btcur_init(sc, &sc->sa);
375 
376 	/*
377 	 * Iterate all the reverse mappings to find gaps in the physical
378 	 * mappings, all the OWN_AG blocks, and all the rmapbt extents.
379 	 */
380 	error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_abt_walk_rmap, ra);
381 	if (error)
382 		goto err;
383 
384 	/* Insert a record for space between the last rmap and EOAG. */
385 	agend = be32_to_cpu(agf->agf_length);
386 	if (ra->next_agbno < agend) {
387 		error = xrep_abt_stash(ra, agend);
388 		if (error)
389 			goto err;
390 	}
391 
392 	/* Collect all the AGFL blocks. */
393 	error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
394 	if (error)
395 		goto err;
396 
397 	error = xfs_agfl_walk(mp, agf, agfl_bp, xrep_abt_walk_agfl, ra);
398 	if (error)
399 		goto err_agfl;
400 
401 	/* Compute the old bnobt/cntbt blocks. */
402 	error = xagb_bitmap_disunion(&ra->old_allocbt_blocks,
403 			&ra->not_allocbt_blocks);
404 	if (error)
405 		goto err_agfl;
406 
407 	ra->nr_real_records = xfarray_length(ra->free_records);
408 err_agfl:
409 	xfs_trans_brelse(sc->tp, agfl_bp);
410 err:
411 	xchk_ag_btcur_free(&sc->sa);
412 	xagb_bitmap_destroy(&ra->not_allocbt_blocks);
413 	return error;
414 }
415 
416 /*
417  * We're going to use the observed free space records to reserve blocks for the
418  * new free space btrees, so we play an iterative game where we try to converge
419  * on the number of blocks we need:
420  *
421  * 1. Estimate how many blocks we'll need to store the records.
422  * 2. If the first free record has more blocks than we need, we're done.
423  *    We will have to re-sort the records prior to building the cntbt.
424  * 3. If that record has exactly the number of blocks we need, null out the
425  *    record.  We're done.
426  * 4. Otherwise, we still need more blocks.  Null out the record, subtract its
427  *    length from the number of blocks we need, and go back to step 1.
428  *
429  * Fortunately, we don't have to do any transaction work to play this game, so
430  * we don't have to tear down the staging cursors.
431  */
432 STATIC int
433 xrep_abt_reserve_space(
434 	struct xrep_abt		*ra,
435 	struct xfs_btree_cur	*bno_cur,
436 	struct xfs_btree_cur	*cnt_cur,
437 	bool			*needs_resort)
438 {
439 	struct xfs_scrub	*sc = ra->sc;
440 	xfarray_idx_t		record_nr;
441 	unsigned int		allocated = 0;
442 	int			error = 0;
443 
444 	record_nr = xfarray_length(ra->free_records) - 1;
445 	do {
446 		struct xfs_alloc_rec_incore arec;
447 		uint64_t		required;
448 		unsigned int		desired;
449 		unsigned int		len;
450 
451 		/* Compute how many blocks we'll need. */
452 		error = xfs_btree_bload_compute_geometry(cnt_cur,
453 				&ra->new_cntbt.bload, ra->nr_real_records);
454 		if (error)
455 			break;
456 
457 		error = xfs_btree_bload_compute_geometry(bno_cur,
458 				&ra->new_bnobt.bload, ra->nr_real_records);
459 		if (error)
460 			break;
461 
462 		/* How many btree blocks do we need to store all records? */
463 		required = ra->new_bnobt.bload.nr_blocks +
464 			   ra->new_cntbt.bload.nr_blocks;
465 		ASSERT(required < INT_MAX);
466 
467 		/* If we've reserved enough blocks, we're done. */
468 		if (allocated >= required)
469 			break;
470 
471 		desired = required - allocated;
472 
473 		/* We need space but there's none left; bye! */
474 		if (ra->nr_real_records == 0) {
475 			error = -ENOSPC;
476 			break;
477 		}
478 
479 		/* Grab the first record from the list. */
480 		error = xfarray_load(ra->free_records, record_nr, &arec);
481 		if (error)
482 			break;
483 
484 		ASSERT(arec.ar_blockcount <= UINT_MAX);
485 		len = min_t(unsigned int, arec.ar_blockcount, desired);
486 
487 		trace_xrep_newbt_alloc_ag_blocks(sc->mp, sc->sa.pag->pag_agno,
488 				arec.ar_startblock, len, XFS_RMAP_OWN_AG);
489 
490 		error = xrep_newbt_add_extent(&ra->new_bnobt, sc->sa.pag,
491 				arec.ar_startblock, len);
492 		if (error)
493 			break;
494 		allocated += len;
495 		ra->nr_blocks -= len;
496 
497 		if (arec.ar_blockcount > desired) {
498 			/*
499 			 * Record has more space than we need.  The number of
500 			 * free records doesn't change, so shrink the free
501 			 * record, inform the caller that the records are no
502 			 * longer sorted by length, and exit.
503 			 */
504 			arec.ar_startblock += desired;
505 			arec.ar_blockcount -= desired;
506 			error = xfarray_store(ra->free_records, record_nr,
507 					&arec);
508 			if (error)
509 				break;
510 
511 			*needs_resort = true;
512 			return 0;
513 		}
514 
515 		/*
516 		 * We're going to use up the entire record, so unset it and
517 		 * move on to the next one.  This changes the number of free
518 		 * records (but doesn't break the sorting order), so we must
519 		 * go around the loop once more to re-run _bload_init.
520 		 */
521 		error = xfarray_unset(ra->free_records, record_nr);
522 		if (error)
523 			break;
524 		ra->nr_real_records--;
525 		record_nr--;
526 	} while (1);
527 
528 	return error;
529 }
530 
531 STATIC int
532 xrep_abt_dispose_one(
533 	struct xrep_abt		*ra,
534 	struct xrep_newbt_resv	*resv)
535 {
536 	struct xfs_scrub	*sc = ra->sc;
537 	struct xfs_perag	*pag = sc->sa.pag;
538 	xfs_agblock_t		free_agbno = resv->agbno + resv->used;
539 	xfs_extlen_t		free_aglen = resv->len - resv->used;
540 	int			error;
541 
542 	ASSERT(pag == resv->pag);
543 
544 	/* Add a deferred rmap for each extent we used. */
545 	if (resv->used > 0)
546 		xfs_rmap_alloc_extent(sc->tp, pag->pag_agno, resv->agbno,
547 				resv->used, XFS_RMAP_OWN_AG);
548 
549 	/*
550 	 * For each reserved btree block we didn't use, add it to the free
551 	 * space btree.  We didn't touch fdblocks when we reserved them, so
552 	 * we don't touch it now.
553 	 */
554 	if (free_aglen == 0)
555 		return 0;
556 
557 	trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno,
558 			free_aglen, ra->new_bnobt.oinfo.oi_owner);
559 
560 	error = __xfs_free_extent(sc->tp, resv->pag, free_agbno, free_aglen,
561 			&ra->new_bnobt.oinfo, XFS_AG_RESV_IGNORE, true);
562 	if (error)
563 		return error;
564 
565 	return xrep_defer_finish(sc);
566 }
567 
568 /*
569  * Deal with all the space we reserved.  Blocks that were allocated for the
570  * free space btrees need to have a (deferred) rmap added for the OWN_AG
571  * allocation, and blocks that didn't get used can be freed via the usual
572  * (deferred) means.
573  */
574 STATIC void
575 xrep_abt_dispose_reservations(
576 	struct xrep_abt		*ra,
577 	int			error)
578 {
579 	struct xrep_newbt_resv	*resv, *n;
580 
581 	if (error)
582 		goto junkit;
583 
584 	list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) {
585 		error = xrep_abt_dispose_one(ra, resv);
586 		if (error)
587 			goto junkit;
588 	}
589 
590 junkit:
591 	list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) {
592 		xfs_perag_put(resv->pag);
593 		list_del(&resv->list);
594 		kfree(resv);
595 	}
596 
597 	xrep_newbt_cancel(&ra->new_bnobt);
598 	xrep_newbt_cancel(&ra->new_cntbt);
599 }
600 
601 /* Retrieve free space data for bulk load. */
602 STATIC int
603 xrep_abt_get_records(
604 	struct xfs_btree_cur		*cur,
605 	unsigned int			idx,
606 	struct xfs_btree_block		*block,
607 	unsigned int			nr_wanted,
608 	void				*priv)
609 {
610 	struct xfs_alloc_rec_incore	*arec = &cur->bc_rec.a;
611 	struct xrep_abt			*ra = priv;
612 	union xfs_btree_rec		*block_rec;
613 	unsigned int			loaded;
614 	int				error;
615 
616 	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
617 		error = xfarray_load_next(ra->free_records, &ra->array_cur,
618 				arec);
619 		if (error)
620 			return error;
621 
622 		ra->longest = max(ra->longest, arec->ar_blockcount);
623 
624 		block_rec = xfs_btree_rec_addr(cur, idx, block);
625 		cur->bc_ops->init_rec_from_cur(cur, block_rec);
626 	}
627 
628 	return loaded;
629 }
630 
631 /* Feed one of the new btree blocks to the bulk loader. */
632 STATIC int
633 xrep_abt_claim_block(
634 	struct xfs_btree_cur	*cur,
635 	union xfs_btree_ptr	*ptr,
636 	void			*priv)
637 {
638 	struct xrep_abt		*ra = priv;
639 
640 	return xrep_newbt_claim_block(cur, &ra->new_bnobt, ptr);
641 }
642 
643 /*
644  * Reset the AGF counters to reflect the free space btrees that we just
645  * rebuilt, then reinitialize the per-AG data.
646  */
647 STATIC int
648 xrep_abt_reset_counters(
649 	struct xrep_abt		*ra)
650 {
651 	struct xfs_scrub	*sc = ra->sc;
652 	struct xfs_perag	*pag = sc->sa.pag;
653 	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
654 	unsigned int		freesp_btreeblks = 0;
655 
656 	/*
657 	 * Compute the contribution to agf_btreeblks for the new free space
658 	 * btrees.  This is the computed btree size minus anything we didn't
659 	 * use.
660 	 */
661 	freesp_btreeblks += ra->new_bnobt.bload.nr_blocks - 1;
662 	freesp_btreeblks += ra->new_cntbt.bload.nr_blocks - 1;
663 
664 	freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_bnobt);
665 	freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_cntbt);
666 
667 	/*
668 	 * The AGF header contains extra information related to the free space
669 	 * btrees, so we must update those fields here.
670 	 */
671 	agf->agf_btreeblks = cpu_to_be32(freesp_btreeblks +
672 				(be32_to_cpu(agf->agf_rmap_blocks) - 1));
673 	agf->agf_freeblks = cpu_to_be32(ra->nr_blocks);
674 	agf->agf_longest = cpu_to_be32(ra->longest);
675 	xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS |
676 						 XFS_AGF_LONGEST |
677 						 XFS_AGF_FREEBLKS);
678 
679 	/*
680 	 * After we commit the new btree to disk, it is possible that the
681 	 * process to reap the old btree blocks will race with the AIL trying
682 	 * to checkpoint the old btree blocks into the filesystem.  If the new
683 	 * tree is shorter than the old one, the allocbt write verifier will
684 	 * fail and the AIL will shut down the filesystem.
685 	 *
686 	 * To avoid this, save the old incore btree height values as the alt
687 	 * height values before re-initializing the perag info from the updated
688 	 * AGF to capture all the new values.
689 	 */
690 	pag->pagf_repair_levels[XFS_BTNUM_BNOi] = pag->pagf_levels[XFS_BTNUM_BNOi];
691 	pag->pagf_repair_levels[XFS_BTNUM_CNTi] = pag->pagf_levels[XFS_BTNUM_CNTi];
692 
693 	/* Reinitialize with the values we just logged. */
694 	return xrep_reinit_pagf(sc);
695 }
696 
697 /*
698  * Use the collected free space information to stage new free space btrees.
699  * If this is successful we'll return with the new btree root
700  * information logged to the repair transaction but not yet committed.
701  */
702 STATIC int
703 xrep_abt_build_new_trees(
704 	struct xrep_abt		*ra)
705 {
706 	struct xfs_scrub	*sc = ra->sc;
707 	struct xfs_btree_cur	*bno_cur;
708 	struct xfs_btree_cur	*cnt_cur;
709 	struct xfs_perag	*pag = sc->sa.pag;
710 	bool			needs_resort = false;
711 	int			error;
712 
713 	/*
714 	 * Sort the free extents by length so that we can set up the free space
715 	 * btrees in as few extents as possible.  This reduces the amount of
716 	 * deferred rmap / free work we have to do at the end.
717 	 */
718 	error = xrep_cntbt_sort_records(ra, false);
719 	if (error)
720 		return error;
721 
722 	/*
723 	 * Prepare to construct the new btree by reserving disk space for the
724 	 * new btree and setting up all the accounting information we'll need
725 	 * to root the new btree while it's under construction and before we
726 	 * attach it to the AG header.
727 	 */
728 	xrep_newbt_init_bare(&ra->new_bnobt, sc);
729 	xrep_newbt_init_bare(&ra->new_cntbt, sc);
730 
731 	ra->new_bnobt.bload.get_records = xrep_abt_get_records;
732 	ra->new_cntbt.bload.get_records = xrep_abt_get_records;
733 
734 	ra->new_bnobt.bload.claim_block = xrep_abt_claim_block;
735 	ra->new_cntbt.bload.claim_block = xrep_abt_claim_block;
736 
737 	/* Allocate cursors for the staged btrees. */
738 	bno_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_bnobt.afake,
739 			pag, XFS_BTNUM_BNO);
740 	cnt_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_cntbt.afake,
741 			pag, XFS_BTNUM_CNT);
742 
743 	/* Last chance to abort before we start committing fixes. */
744 	if (xchk_should_terminate(sc, &error))
745 		goto err_cur;
746 
747 	/* Reserve the space we'll need for the new btrees. */
748 	error = xrep_abt_reserve_space(ra, bno_cur, cnt_cur, &needs_resort);
749 	if (error)
750 		goto err_cur;
751 
752 	/*
753 	 * If we need to re-sort the free extents by length, do so so that we
754 	 * can put the records into the cntbt in the correct order.
755 	 */
756 	if (needs_resort) {
757 		error = xrep_cntbt_sort_records(ra, needs_resort);
758 		if (error)
759 			goto err_cur;
760 	}
761 
762 	/*
763 	 * Due to btree slack factors, it's possible for a new btree to be one
764 	 * level taller than the old btree.  Update the alternate incore btree
765 	 * height so that we don't trip the verifiers when writing the new
766 	 * btree blocks to disk.
767 	 */
768 	pag->pagf_repair_levels[XFS_BTNUM_BNOi] =
769 					ra->new_bnobt.bload.btree_height;
770 	pag->pagf_repair_levels[XFS_BTNUM_CNTi] =
771 					ra->new_cntbt.bload.btree_height;
772 
773 	/* Load the free space by length tree. */
774 	ra->array_cur = XFARRAY_CURSOR_INIT;
775 	ra->longest = 0;
776 	error = xfs_btree_bload(cnt_cur, &ra->new_cntbt.bload, ra);
777 	if (error)
778 		goto err_levels;
779 
780 	error = xrep_bnobt_sort_records(ra);
781 	if (error)
782 		return error;
783 
784 	/* Load the free space by block number tree. */
785 	ra->array_cur = XFARRAY_CURSOR_INIT;
786 	error = xfs_btree_bload(bno_cur, &ra->new_bnobt.bload, ra);
787 	if (error)
788 		goto err_levels;
789 
790 	/*
791 	 * Install the new btrees in the AG header.  After this point the old
792 	 * btrees are no longer accessible and the new trees are live.
793 	 */
794 	xfs_allocbt_commit_staged_btree(bno_cur, sc->tp, sc->sa.agf_bp);
795 	xfs_btree_del_cursor(bno_cur, 0);
796 	xfs_allocbt_commit_staged_btree(cnt_cur, sc->tp, sc->sa.agf_bp);
797 	xfs_btree_del_cursor(cnt_cur, 0);
798 
799 	/* Reset the AGF counters now that we've changed the btree shape. */
800 	error = xrep_abt_reset_counters(ra);
801 	if (error)
802 		goto err_newbt;
803 
804 	/* Dispose of any unused blocks and the accounting information. */
805 	xrep_abt_dispose_reservations(ra, error);
806 
807 	return xrep_roll_ag_trans(sc);
808 
809 err_levels:
810 	pag->pagf_repair_levels[XFS_BTNUM_BNOi] = 0;
811 	pag->pagf_repair_levels[XFS_BTNUM_CNTi] = 0;
812 err_cur:
813 	xfs_btree_del_cursor(cnt_cur, error);
814 	xfs_btree_del_cursor(bno_cur, error);
815 err_newbt:
816 	xrep_abt_dispose_reservations(ra, error);
817 	return error;
818 }
819 
820 /*
821  * Now that we've logged the roots of the new btrees, invalidate all of the
822  * old blocks and free them.
823  */
824 STATIC int
825 xrep_abt_remove_old_trees(
826 	struct xrep_abt		*ra)
827 {
828 	struct xfs_perag	*pag = ra->sc->sa.pag;
829 	int			error;
830 
831 	/* Free the old btree blocks if they're not in use. */
832 	error = xrep_reap_agblocks(ra->sc, &ra->old_allocbt_blocks,
833 			&XFS_RMAP_OINFO_AG, XFS_AG_RESV_IGNORE);
834 	if (error)
835 		return error;
836 
837 	/*
838 	 * Now that we've zapped all the old allocbt blocks we can turn off
839 	 * the alternate height mechanism.
840 	 */
841 	pag->pagf_repair_levels[XFS_BTNUM_BNOi] = 0;
842 	pag->pagf_repair_levels[XFS_BTNUM_CNTi] = 0;
843 	return 0;
844 }
845 
846 /* Repair the freespace btrees for some AG. */
847 int
848 xrep_allocbt(
849 	struct xfs_scrub	*sc)
850 {
851 	struct xrep_abt		*ra;
852 	struct xfs_mount	*mp = sc->mp;
853 	char			*descr;
854 	int			error;
855 
856 	/* We require the rmapbt to rebuild anything. */
857 	if (!xfs_has_rmapbt(mp))
858 		return -EOPNOTSUPP;
859 
860 	ra = kzalloc(sizeof(struct xrep_abt), XCHK_GFP_FLAGS);
861 	if (!ra)
862 		return -ENOMEM;
863 	ra->sc = sc;
864 
865 	/* We rebuild both data structures. */
866 	sc->sick_mask = XFS_SICK_AG_BNOBT | XFS_SICK_AG_CNTBT;
867 
868 	/*
869 	 * Make sure the busy extent list is clear because we can't put extents
870 	 * on there twice.  In theory we cleared this before we started, but
871 	 * let's not risk the filesystem.
872 	 */
873 	if (!xfs_extent_busy_list_empty(sc->sa.pag)) {
874 		error = -EDEADLOCK;
875 		goto out_ra;
876 	}
877 
878 	/* Set up enough storage to handle maximally fragmented free space. */
879 	descr = xchk_xfile_ag_descr(sc, "free space records");
880 	error = xfarray_create(descr, mp->m_sb.sb_agblocks / 2,
881 			sizeof(struct xfs_alloc_rec_incore),
882 			&ra->free_records);
883 	kfree(descr);
884 	if (error)
885 		goto out_ra;
886 
887 	/* Collect the free space data and find the old btree blocks. */
888 	xagb_bitmap_init(&ra->old_allocbt_blocks);
889 	error = xrep_abt_find_freespace(ra);
890 	if (error)
891 		goto out_bitmap;
892 
893 	/* Rebuild the free space information. */
894 	error = xrep_abt_build_new_trees(ra);
895 	if (error)
896 		goto out_bitmap;
897 
898 	/* Kill the old trees. */
899 	error = xrep_abt_remove_old_trees(ra);
900 	if (error)
901 		goto out_bitmap;
902 
903 out_bitmap:
904 	xagb_bitmap_destroy(&ra->old_allocbt_blocks);
905 	xfarray_destroy(ra->free_records);
906 out_ra:
907 	kfree(ra);
908 	return error;
909 }
910 
911 /* Make sure both btrees are ok after we've rebuilt them. */
912 int
913 xrep_revalidate_allocbt(
914 	struct xfs_scrub	*sc)
915 {
916 	__u32			old_type = sc->sm->sm_type;
917 	int			error;
918 
919 	/*
920 	 * We must update sm_type temporarily so that the tree-to-tree cross
921 	 * reference checks will work in the correct direction, and also so
922 	 * that tracing will report correctly if there are more errors.
923 	 */
924 	sc->sm->sm_type = XFS_SCRUB_TYPE_BNOBT;
925 	error = xchk_allocbt(sc);
926 	if (error)
927 		goto out;
928 
929 	sc->sm->sm_type = XFS_SCRUB_TYPE_CNTBT;
930 	error = xchk_allocbt(sc);
931 out:
932 	sc->sm->sm_type = old_type;
933 	return error;
934 }
935