xref: /linux/fs/xfs/scrub/rmap_repair.c (revision 6f7e6393d1ce636bb7ec77a7fe7b77458fddf701)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (c) 2018-2024 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
#include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_defer.h"
13 #include "xfs_btree.h"
14 #include "xfs_btree_staging.h"
15 #include "xfs_buf_mem.h"
16 #include "xfs_btree_mem.h"
17 #include "xfs_bit.h"
18 #include "xfs_log_format.h"
19 #include "xfs_trans.h"
20 #include "xfs_sb.h"
21 #include "xfs_alloc.h"
22 #include "xfs_alloc_btree.h"
23 #include "xfs_ialloc.h"
24 #include "xfs_ialloc_btree.h"
25 #include "xfs_rmap.h"
26 #include "xfs_rmap_btree.h"
27 #include "xfs_inode.h"
28 #include "xfs_icache.h"
29 #include "xfs_bmap.h"
30 #include "xfs_bmap_btree.h"
31 #include "xfs_refcount.h"
32 #include "xfs_refcount_btree.h"
33 #include "xfs_ag.h"
34 #include "xfs_rtrmap_btree.h"
35 #include "xfs_rtgroup.h"
36 #include "xfs_rtrefcount_btree.h"
37 #include "scrub/xfs_scrub.h"
38 #include "scrub/scrub.h"
39 #include "scrub/common.h"
40 #include "scrub/btree.h"
41 #include "scrub/trace.h"
42 #include "scrub/repair.h"
43 #include "scrub/bitmap.h"
44 #include "scrub/agb_bitmap.h"
45 #include "scrub/xfile.h"
46 #include "scrub/xfarray.h"
47 #include "scrub/iscan.h"
48 #include "scrub/newbt.h"
49 #include "scrub/reap.h"
50 
51 /*
52  * Reverse Mapping Btree Repair
53  * ============================
54  *
55  * This is the most involved of all the AG space btree rebuilds.  Everywhere
56  * else in XFS we lock inodes and then AG data structures, but generating the
57  * list of rmap records requires that we be able to scan both block mapping
58  * btrees of every inode in the filesystem to see if it owns any extents in
59  * this AG.  We can't tolerate any inode updates while we do this, so we
60  * freeze the filesystem to lock everyone else out, and grant ourselves
61  * special privileges to run transactions with regular background reclamation
62  * turned off.
63  *
64  * We also have to be very careful not to allow inode reclaim to start a
65  * transaction because all transactions (other than our own) will block.
66  * Deferred inode inactivation helps us out there.
67  *
68  * I) Reverse mappings for all non-space metadata and file data are collected
69  * according to the following algorithm:
70  *
71  * 1. For each fork of each inode:
72  * 1.1. Create a bitmap BMBIT to track bmbt blocks if necessary.
73  * 1.2. If the incore extent map isn't loaded, walk the bmbt to accumulate
74  *      bmaps into rmap records (see 1.1.4).  Set bits in BMBIT for each btree
75  *      block.
76  * 1.3. If the incore extent map is loaded but the fork is in btree format,
77  *      just visit the bmbt blocks to set the corresponding BMBIT areas.
78  * 1.4. From the incore extent map, accumulate each bmap that falls into our
79  *      target AG.  Remember, multiple bmap records can map to a single rmap
80  *      record, so we cannot simply emit rmap records 1:1.
81  * 1.5. Emit rmap records for each extent in BMBIT and free it.
82  * 2. Create bitmaps INOBIT and ICHUNKBIT.
83  * 3. For each record in the inobt, set the corresponding areas in ICHUNKBIT,
84  *    and set bits in INOBIT for each btree block.  If the inobt has no records
85  *    at all, we must be careful to record its root in INOBIT.
86  * 4. For each block in the finobt, set the corresponding INOBIT area.
87  * 5. Emit rmap records for each extent in INOBIT and ICHUNKBIT and free them.
88  * 6. Create bitmaps REFCBIT and COWBIT.
89  * 7. For each CoW staging extent in the refcountbt, set the corresponding
90  *    areas in COWBIT.
91  * 8. For each block in the refcountbt, set the corresponding REFCBIT area.
92  * 9. Emit rmap records for each extent in REFCBIT and COWBIT and free them.
93  * A. Emit rmap for the AG headers.
94  * B. Emit rmap for the log, if there is one.
95  *
96  * II) The rmapbt shape and space metadata rmaps are computed as follows:
97  *
98  * 1. Count the rmaps collected in the previous step. (= NR)
99  * 2. Estimate the number of rmapbt blocks needed to store NR records. (= RMB)
100  * 3. Reserve RMB blocks through the newbt using the allocator in normap mode.
101  * 4. Create bitmap AGBIT.
102  * 5. For each reservation in the newbt, set the corresponding areas in AGBIT.
103  * 6. For each block in the AGFL, bnobt, and cntbt, set the bits in AGBIT.
104  * 7. Count the extents in AGBIT. (= AGNR)
105  * 8. Estimate the number of rmapbt blocks needed for NR + AGNR rmaps. (= RMB')
106  * 9. If RMB' >= RMB, reserve RMB' - RMB more newbt blocks, set RMB = RMB',
107  *    and clear AGBIT.  Go to step 5.
108  * A. Emit rmaps for each extent in AGBIT.
109  *
110  * III) The rmapbt is constructed and set in place as follows:
111  *
112  * 1. Sort the rmap records.
113  * 2. Bulk load the rmaps.
114  *
115  * IV) Reap the old btree blocks.
116  *
117  * 1. Create a bitmap OLDRMBIT.
118  * 2. For each gap in the new rmapbt, set the corresponding areas of OLDRMBIT.
119  * 3. For each extent in the bnobt, clear the corresponding parts of OLDRMBIT.
120  * 4. Reap the extents corresponding to the set areas in OLDRMBIT.  These are
121  *    the parts of the AG that the rmap didn't find during its scan of the
122  *    primary metadata and aren't known to be in the free space, which implies
123  *    that they were the old rmapbt blocks.
124  * 5. Commit.
125  *
126  * We use the 'xrep_rmap' prefix for all the rmap functions.
127  */
128 
/* Context for collecting rmaps */
struct xrep_rmap {
	/* new rmapbt information */
	struct xrep_newbt	new_btree;

	/* lock for the xfbtree and xfile */
	struct mutex		lock;

	/* rmap records generated from primary metadata */
	struct xfbtree		rmap_btree;

	/* scrub context for this repair */
	struct xfs_scrub	*sc;

	/* in-memory btree cursor for the xfs_btree_bload iteration */
	struct xfs_btree_cur	*mcur;

	/* Hooks into rmap update code. */
	struct xfs_rmap_hook	rhook;

	/* inode scan cursor */
	struct xchk_iscan	iscan;

	/* Number of non-freespace records found. */
	unsigned long long	nr_records;

	/* bnobt/cntbt contribution to btreeblks */
	xfs_agblock_t		freesp_btblocks;

	/* old agf_rmap_blocks counter */
	unsigned int		old_rmapbt_fsbcount;
};
160 
161 /* Set us up to repair reverse mapping btrees. */
162 int
163 xrep_setup_ag_rmapbt(
164 	struct xfs_scrub	*sc)
165 {
166 	struct xrep_rmap	*rr;
167 	int			error;
168 
169 	xchk_fsgates_enable(sc, XCHK_FSGATES_RMAP);
170 
171 	error = xrep_setup_xfbtree(sc, "reverse mapping records");
172 	if (error)
173 		return error;
174 
175 	rr = kzalloc(sizeof(struct xrep_rmap), XCHK_GFP_FLAGS);
176 	if (!rr)
177 		return -ENOMEM;
178 
179 	rr->sc = sc;
180 	sc->buf = rr;
181 	return 0;
182 }
183 
/* Make sure there's nothing funny about this mapping. */
STATIC int
xrep_rmap_check_mapping(
	struct xfs_scrub	*sc,
	const struct xfs_rmap_irec *rec)
{
	enum xbtree_recpacking	outcome;
	int			error;

	/* Reject records that fail the generic rmap record checks. */
	if (xfs_rmap_check_irec(sc->sa.pag, rec) != NULL)
		return -EFSCORRUPTED;

	/* Make sure this isn't free space. */
	error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock,
			rec->rm_blockcount, &outcome);
	if (error)
		return error;
	if (outcome != XBTREE_RECPACKING_EMPTY)
		return -EFSCORRUPTED;

	return 0;
}
206 
/*
 * Store a reverse-mapping record in the in-memory rmap btree.  Returns 0 on
 * success, -EFSCORRUPTED if the scan has been invalidated, or a negative
 * errno.
 */
static inline int
xrep_rmap_stash(
	struct xrep_rmap	*rr,
	xfs_agblock_t		startblock,
	xfs_extlen_t		blockcount,
	uint64_t		owner,
	uint64_t		offset,
	unsigned int		flags)
{
	struct xfs_rmap_irec	rmap = {
		.rm_startblock	= startblock,
		.rm_blockcount	= blockcount,
		.rm_owner	= owner,
		.rm_offset	= offset,
		.rm_flags	= flags,
	};
	struct xfs_scrub	*sc = rr->sc;
	struct xfs_btree_cur	*mcur;
	int			error = 0;

	/* Bail out if a fatal signal is pending. */
	if (xchk_should_terminate(sc, &error))
		return error;

	/* If the scan was already aborted, the collected data are unusable. */
	if (xchk_iscan_aborted(&rr->iscan))
		return -EFSCORRUPTED;

	trace_xrep_rmap_found(sc->sa.pag, &rmap);

	/* Serialize updates to the in-memory btree and its xfile. */
	mutex_lock(&rr->lock);
	mcur = xfs_rmapbt_mem_cursor(sc->sa.pag, sc->tp, &rr->rmap_btree);
	error = xfs_rmap_map_raw(mcur, &rmap);
	xfs_btree_del_cursor(mcur, error);
	if (error)
		goto out_cancel;

	/* Commit the new record to the in-memory btree's backing store. */
	error = xfbtree_trans_commit(&rr->rmap_btree, sc->tp);
	if (error)
		goto out_abort;

	mutex_unlock(&rr->lock);
	return 0;

out_cancel:
	xfbtree_trans_cancel(&rr->rmap_btree, sc->tp);
out_abort:
	/* Poison the scan so that we don't build a btree from partial data. */
	xchk_iscan_abort(&rr->iscan);
	mutex_unlock(&rr->lock);
	return error;
}
257 
/* Context for stashing every run of a bitmap under a single rmap owner. */
struct xrep_rmap_stash_run {
	struct xrep_rmap	*rr;
	/* rmap owner to use for each emitted record */
	uint64_t		owner;
	/* XFS_RMAP_* flags to use for each emitted record */
	unsigned int		rmap_flags;
};
263 
264 static int
265 xrep_rmap_stash_run(
266 	uint32_t			start,
267 	uint32_t			len,
268 	void				*priv)
269 {
270 	struct xrep_rmap_stash_run	*rsr = priv;
271 	struct xrep_rmap		*rr = rsr->rr;
272 
273 	return xrep_rmap_stash(rr, start, len, rsr->owner, 0, rsr->rmap_flags);
274 }
275 
276 /*
277  * Emit rmaps for every extent of bits set in the bitmap.  Caller must ensure
278  * that the ranges are in units of FS blocks.
279  */
280 STATIC int
281 xrep_rmap_stash_bitmap(
282 	struct xrep_rmap		*rr,
283 	struct xagb_bitmap		*bitmap,
284 	const struct xfs_owner_info	*oinfo)
285 {
286 	struct xrep_rmap_stash_run	rsr = {
287 		.rr			= rr,
288 		.owner			= oinfo->oi_owner,
289 		.rmap_flags		= 0,
290 	};
291 
292 	if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
293 		rsr.rmap_flags |= XFS_RMAP_ATTR_FORK;
294 	if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
295 		rsr.rmap_flags |= XFS_RMAP_BMBT_BLOCK;
296 
297 	return xagb_bitmap_walk(bitmap, xrep_rmap_stash_run, &rsr);
298 }
299 
300 /* Section (I): Finding all file and bmbt extents. */
301 
/* Context for accumulating rmaps for an inode fork. */
struct xrep_rmap_ifork {
	/*
	 * Accumulate rmap data here to turn multiple adjacent bmaps into a
	 * single rmap.
	 */
	struct xfs_rmap_irec	accum;

	/* Bitmap of bmbt blocks in this AG. */
	struct xagb_bitmap	bmbt_blocks;

	/* Repair context that receives the stashed records. */
	struct xrep_rmap	*rr;

	/* Which inode fork? */
	int			whichfork;
};
318 
319 /* Stash an rmap that we accumulated while walking an inode fork. */
320 STATIC int
321 xrep_rmap_stash_accumulated(
322 	struct xrep_rmap_ifork	*rf)
323 {
324 	if (rf->accum.rm_blockcount == 0)
325 		return 0;
326 
327 	return xrep_rmap_stash(rf->rr, rf->accum.rm_startblock,
328 			rf->accum.rm_blockcount, rf->accum.rm_owner,
329 			rf->accum.rm_offset, rf->accum.rm_flags);
330 }
331 
/*
 * Accumulate a bmbt record.  Adjacent mappings with identical flags are
 * merged into a single rmap record before being stashed.
 */
STATIC int
xrep_rmap_visit_bmbt(
	struct xfs_btree_cur	*cur,
	struct xfs_bmbt_irec	*rec,
	void			*priv)
{
	struct xrep_rmap_ifork	*rf = priv;
	struct xfs_mount	*mp = rf->rr->sc->mp;
	struct xfs_rmap_irec	*accum = &rf->accum;
	xfs_agblock_t		agbno;
	unsigned int		rmap_flags = 0;
	int			error;

	/* Skip mappings that don't land in the AG we are rebuilding. */
	if (XFS_FSB_TO_AGNO(mp, rec->br_startblock) !=
			pag_agno(rf->rr->sc->sa.pag))
		return 0;

	agbno = XFS_FSB_TO_AGBNO(mp, rec->br_startblock);
	if (rf->whichfork == XFS_ATTR_FORK)
		rmap_flags |= XFS_RMAP_ATTR_FORK;
	if (rec->br_state == XFS_EXT_UNWRITTEN)
		rmap_flags |= XFS_RMAP_UNWRITTEN;

	/* If this bmap is adjacent to the previous one, just add it. */
	if (accum->rm_blockcount > 0 &&
	    rec->br_startoff == accum->rm_offset + accum->rm_blockcount &&
	    agbno == accum->rm_startblock + accum->rm_blockcount &&
	    rmap_flags == accum->rm_flags) {
		accum->rm_blockcount += rec->br_blockcount;
		return 0;
	}

	/* Otherwise stash the old rmap and start accumulating a new one. */
	error = xrep_rmap_stash_accumulated(rf);
	if (error)
		return error;

	accum->rm_startblock = agbno;
	accum->rm_blockcount = rec->br_blockcount;
	accum->rm_offset = rec->br_startoff;
	accum->rm_flags = rmap_flags;
	return 0;
}
376 
377 /* Add a btree block to the bitmap. */
378 STATIC int
379 xrep_rmap_visit_iroot_btree_block(
380 	struct xfs_btree_cur	*cur,
381 	int			level,
382 	void			*priv)
383 {
384 	struct xrep_rmap_ifork	*rf = priv;
385 	struct xfs_buf		*bp;
386 	xfs_fsblock_t		fsbno;
387 	xfs_agblock_t		agbno;
388 
389 	xfs_btree_get_block(cur, level, &bp);
390 	if (!bp)
391 		return 0;
392 
393 	fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
394 	if (XFS_FSB_TO_AGNO(cur->bc_mp, fsbno) != pag_agno(rf->rr->sc->sa.pag))
395 		return 0;
396 
397 	agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
398 	return xagb_bitmap_set(&rf->bmbt_blocks, agbno, 1);
399 }
400 
/*
 * Iterate a metadata btree rooted in an inode to collect rmap records for
 * anything in this fork that matches the AG.
 */
STATIC int
xrep_rmap_scan_iroot_btree(
	struct xrep_rmap_ifork	*rf,
	struct xfs_btree_cur	*cur)
{
	struct xfs_owner_info	oinfo;
	struct xrep_rmap	*rr = rf->rr;
	int			error;

	xagb_bitmap_init(&rf->bmbt_blocks);

	/* Record all the blocks in the btree itself. */
	error = xfs_btree_visit_blocks(cur, xrep_rmap_visit_iroot_btree_block,
			XFS_BTREE_VISIT_ALL, rf);
	if (error)
		goto out;

	/* Emit rmaps for the btree blocks. */
	xfs_rmap_ino_bmbt_owner(&oinfo, rf->accum.rm_owner, rf->whichfork);
	error = xrep_rmap_stash_bitmap(rr, &rf->bmbt_blocks, &oinfo);
	if (error)
		goto out;

	/* Stash any remaining accumulated rmaps. */
	error = xrep_rmap_stash_accumulated(rf);
out:
	xagb_bitmap_destroy(&rf->bmbt_blocks);
	return error;
}
434 
/*
 * Iterate the block mapping btree to collect rmap records for anything in this
 * fork that matches the AG.  Sets @mappings_done to true if we've scanned the
 * block mappings in this fork.
 */
STATIC int
xrep_rmap_scan_bmbt(
	struct xrep_rmap_ifork	*rf,
	struct xfs_inode	*ip,
	bool			*mappings_done)
{
	struct xrep_rmap	*rr = rf->rr;
	struct xfs_btree_cur	*cur;
	struct xfs_ifork	*ifp;
	int			error;

	*mappings_done = false;
	ifp = xfs_ifork_ptr(ip, rf->whichfork);
	cur = xfs_bmbt_init_cursor(rr->sc->mp, rr->sc->tp, ip, rf->whichfork);

	if (!xfs_ifork_is_realtime(ip, rf->whichfork) &&
	    xfs_need_iread_extents(ifp)) {
		/*
		 * If the incore extent cache isn't loaded, scan the bmbt for
		 * mapping records.  This avoids loading the incore extent
		 * tree, which will increase memory pressure at a time when
		 * we're trying to run as quickly as we possibly can.  Ignore
		 * realtime extents.
		 */
		error = xfs_bmap_query_all(cur, xrep_rmap_visit_bmbt, rf);
		if (error)
			goto out_cur;

		*mappings_done = true;
	}

	/* Scan for the bmbt blocks, which always live on the data device. */
	error = xrep_rmap_scan_iroot_btree(rf, cur);
out_cur:
	xfs_btree_del_cursor(cur, error);
	return error;
}
477 
478 /*
479  * Iterate the in-core extent cache to collect rmap records for anything in
480  * this fork that matches the AG.
481  */
482 STATIC int
483 xrep_rmap_scan_iext(
484 	struct xrep_rmap_ifork	*rf,
485 	struct xfs_ifork	*ifp)
486 {
487 	struct xfs_bmbt_irec	rec;
488 	struct xfs_iext_cursor	icur;
489 	int			error;
490 
491 	for_each_xfs_iext(ifp, &icur, &rec) {
492 		if (isnullstartblock(rec.br_startblock))
493 			continue;
494 		error = xrep_rmap_visit_bmbt(NULL, &rec, rf);
495 		if (error)
496 			return error;
497 	}
498 
499 	return xrep_rmap_stash_accumulated(rf);
500 }
501 
/*
 * Collect rmaps for the data-device blocks of a realtime metadata btree
 * inode (rt rmap or rt refcount file).
 */
static int
xrep_rmap_scan_meta_btree(
	struct xrep_rmap_ifork	*rf,
	struct xfs_inode	*ip)
{
	struct xfs_scrub	*sc = rf->rr->sc;
	struct xfs_rtgroup	*rtg = NULL;
	struct xfs_btree_cur	*cur = NULL;
	enum xfs_rtg_inodes	type;
	int			error;

	/* Metadata btrees are only ever rooted in the data fork. */
	if (rf->whichfork != XFS_DATA_FORK)
		return -EFSCORRUPTED;

	switch (ip->i_metatype) {
	case XFS_METAFILE_RTRMAP:
		type = XFS_RTGI_RMAP;
		break;
	case XFS_METAFILE_RTREFCOUNT:
		type = XFS_RTGI_REFCOUNT;
		break;
	default:
		ASSERT(0);
		return -EFSCORRUPTED;
	}

	/* Find the rtgroup that claims this inode as its metadata file. */
	while ((rtg = xfs_rtgroup_next(sc->mp, rtg))) {
		if (ip == rtg->rtg_inodes[type])
			goto found;
	}

	/*
	 * We should never find an rt metadata btree inode that isn't
	 * associated with an rtgroup yet has ondisk blocks allocated to it.
	 */
	if (ip->i_nblocks) {
		ASSERT(0);
		return -EFSCORRUPTED;
	}

	return 0;

found:
	switch (ip->i_metatype) {
	case XFS_METAFILE_RTRMAP:
		cur = xfs_rtrmapbt_init_cursor(sc->tp, rtg);
		break;
	case XFS_METAFILE_RTREFCOUNT:
		cur = xfs_rtrefcountbt_init_cursor(sc->tp, rtg);
		break;
	default:
		ASSERT(0);
		error = -EFSCORRUPTED;
		goto out_rtg;
	}

	error = xrep_rmap_scan_iroot_btree(rf, cur);
	xfs_btree_del_cursor(cur, error);
out_rtg:
	/* xfs_rtgroup_next() took a reference; drop it on all exits. */
	xfs_rtgroup_rele(rtg);
	return error;
}
564 
/* Find all the extents from a given AG in an inode fork. */
STATIC int
xrep_rmap_scan_ifork(
	struct xrep_rmap	*rr,
	struct xfs_inode	*ip,
	int			whichfork)
{
	struct xrep_rmap_ifork	rf = {
		.accum		= { .rm_owner = ip->i_ino, },
		.rr		= rr,
		.whichfork	= whichfork,
	};
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
	bool			mappings_done;
	int			error = 0;

	/* No fork means nothing to map. */
	if (!ifp)
		return 0;

	switch (ifp->if_format) {
	case XFS_DINODE_FMT_BTREE:
		/*
		 * Scan the bmap btree for data device mappings.  This includes
		 * the btree blocks themselves, even if this is a realtime
		 * file.
		 */
		error = xrep_rmap_scan_bmbt(&rf, ip, &mappings_done);
		if (error || mappings_done)
			return error;
		fallthrough;
	case XFS_DINODE_FMT_EXTENTS:
		/* Scan incore extent cache if this isn't a realtime file. */
		if (xfs_ifork_is_realtime(ip, whichfork))
			return 0;

		return xrep_rmap_scan_iext(&rf, ifp);
	case XFS_DINODE_FMT_META_BTREE:
		return xrep_rmap_scan_meta_btree(&rf, ip);
	}

	/* Local-format forks occupy no blocks and need no rmaps. */
	return 0;
}
607 
608 /*
609  * Take ILOCK on a file that we want to scan.
610  *
611  * Select ILOCK_EXCL if the file has an unloaded data bmbt or has an unloaded
612  * attr bmbt.  Otherwise, take ILOCK_SHARED.
613  */
614 static inline unsigned int
615 xrep_rmap_scan_ilock(
616 	struct xfs_inode	*ip)
617 {
618 	uint			lock_mode = XFS_ILOCK_SHARED;
619 
620 	if (xfs_need_iread_extents(&ip->i_df)) {
621 		lock_mode = XFS_ILOCK_EXCL;
622 		goto lock;
623 	}
624 
625 	if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
626 		lock_mode = XFS_ILOCK_EXCL;
627 
628 lock:
629 	xfs_ilock(ip, lock_mode);
630 	return lock_mode;
631 }
632 
/* Record reverse mappings for a file. */
STATIC int
xrep_rmap_scan_inode(
	struct xrep_rmap	*rr,
	struct xfs_inode	*ip)
{
	unsigned int		lock_mode = xrep_rmap_scan_ilock(ip);
	int			error;

	/* Check the data fork. */
	error = xrep_rmap_scan_ifork(rr, ip, XFS_DATA_FORK);
	if (error)
		goto out_unlock;

	/* Check the attr fork. */
	error = xrep_rmap_scan_ifork(rr, ip, XFS_ATTR_FORK);
	if (error)
		goto out_unlock;

	/* COW fork extents are "owned" by the refcount btree. */

	/* Only mark the inode visited if both forks scanned cleanly. */
	xchk_iscan_mark_visited(&rr->iscan, ip);
out_unlock:
	xfs_iunlock(ip, lock_mode);
	return error;
}
659 
660 /* Section (I): Find all AG metadata extents except for free space metadata. */
661 
/* Context for collecting inode btree and inode chunk rmaps. */
struct xrep_rmap_inodes {
	struct xrep_rmap	*rr;
	struct xagb_bitmap	inobt_blocks;	/* INOBIT */
	struct xagb_bitmap	ichunk_blocks;	/* ICHUNKBIT */
};
667 
/* Record inode btree rmaps. */
STATIC int
xrep_rmap_walk_inobt(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_rec	*rec,
	void				*priv)
{
	struct xfs_inobt_rec_incore	irec;
	struct xrep_rmap_inodes		*ri = priv;
	struct xfs_mount		*mp = cur->bc_mp;
	xfs_agblock_t			agbno;
	xfs_extlen_t			aglen;
	xfs_agino_t			agino;
	xfs_agino_t			iperhole;
	unsigned int			i;
	int				error;

	/* Record the inobt blocks. */
	error = xagb_bitmap_set_btcur_path(&ri->inobt_blocks, cur);
	if (error)
		return error;

	/* Validate the record before trusting its fields. */
	xfs_inobt_btrec_to_irec(mp, rec, &irec);
	if (xfs_inobt_check_irec(to_perag(cur->bc_group), &irec) != NULL)
		return -EFSCORRUPTED;

	agino = irec.ir_startino;

	/* Record a non-sparse inode chunk. */
	if (!xfs_inobt_issparse(irec.ir_holemask)) {
		agbno = XFS_AGINO_TO_AGBNO(mp, agino);
		/* A chunk covers at least one block even for huge inodes. */
		aglen = max_t(xfs_extlen_t, 1,
				XFS_INODES_PER_CHUNK / mp->m_sb.sb_inopblock);

		return xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen);
	}

	/* Iterate each chunk. */
	iperhole = max_t(xfs_agino_t, mp->m_sb.sb_inopblock,
			XFS_INODES_PER_HOLEMASK_BIT);
	aglen = iperhole / mp->m_sb.sb_inopblock;
	for (i = 0, agino = irec.ir_startino;
	     i < XFS_INOBT_HOLEMASK_BITS;
	     i += iperhole / XFS_INODES_PER_HOLEMASK_BIT, agino += iperhole) {
		/* Skip holes. */
		if (irec.ir_holemask & (1 << i))
			continue;

		/* Record the inode chunk otherwise. */
		agbno = XFS_AGINO_TO_AGBNO(mp, agino);
		error = xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen);
		if (error)
			return error;
	}

	return 0;
}
725 
/* Collect rmaps for the blocks containing inode btrees and the inode chunks. */
STATIC int
xrep_rmap_find_inode_rmaps(
	struct xrep_rmap	*rr)
{
	struct xrep_rmap_inodes	ri = {
		.rr		= rr,
	};
	struct xfs_scrub	*sc = rr->sc;
	int			error;

	xagb_bitmap_init(&ri.inobt_blocks);
	xagb_bitmap_init(&ri.ichunk_blocks);

	/*
	 * Iterate every record in the inobt so we can capture all the inode
	 * chunks and the blocks in the inobt itself.
	 */
	error = xfs_btree_query_all(sc->sa.ino_cur, xrep_rmap_walk_inobt, &ri);
	if (error)
		goto out_bitmap;

	/*
	 * Note that if there are zero records in the inobt then query_all does
	 * nothing and we have to account the empty inobt root manually.
	 */
	if (xagb_bitmap_empty(&ri.ichunk_blocks)) {
		struct xfs_agi	*agi = sc->sa.agi_bp->b_addr;

		error = xagb_bitmap_set(&ri.inobt_blocks,
				be32_to_cpu(agi->agi_root), 1);
		if (error)
			goto out_bitmap;
	}

	/* Scan the finobt too. */
	if (xfs_has_finobt(sc->mp)) {
		error = xagb_bitmap_set_btblocks(&ri.inobt_blocks,
				sc->sa.fino_cur);
		if (error)
			goto out_bitmap;
	}

	/* Generate rmaps for everything. */
	error = xrep_rmap_stash_bitmap(rr, &ri.inobt_blocks,
			&XFS_RMAP_OINFO_INOBT);
	if (error)
		goto out_bitmap;
	error = xrep_rmap_stash_bitmap(rr, &ri.ichunk_blocks,
			&XFS_RMAP_OINFO_INODES);

out_bitmap:
	xagb_bitmap_destroy(&ri.inobt_blocks);
	xagb_bitmap_destroy(&ri.ichunk_blocks);
	return error;
}
782 
783 /* Record a CoW staging extent. */
784 STATIC int
785 xrep_rmap_walk_cowblocks(
786 	struct xfs_btree_cur		*cur,
787 	const struct xfs_refcount_irec	*irec,
788 	void				*priv)
789 {
790 	struct xagb_bitmap		*bitmap = priv;
791 
792 	if (!xfs_refcount_check_domain(irec) ||
793 	    irec->rc_domain != XFS_REFC_DOMAIN_COW)
794 		return -EFSCORRUPTED;
795 
796 	return xagb_bitmap_set(bitmap, irec->rc_startblock, irec->rc_blockcount);
797 }
798 
/*
 * Collect rmaps for the blocks containing the refcount btree, and all CoW
 * staging extents.
 */
STATIC int
xrep_rmap_find_refcount_rmaps(
	struct xrep_rmap	*rr)
{
	struct xagb_bitmap	refcountbt_blocks;	/* REFCBIT */
	struct xagb_bitmap	cow_blocks;		/* COWBIT */
	/* Query keys spanning the entire CoW domain. */
	struct xfs_refcount_irec low = {
		.rc_startblock	= 0,
		.rc_domain	= XFS_REFC_DOMAIN_COW,
	};
	struct xfs_refcount_irec high = {
		.rc_startblock	= -1U,
		.rc_domain	= XFS_REFC_DOMAIN_COW,
	};
	struct xfs_scrub	*sc = rr->sc;
	int			error;

	/* No reflink means no refcount btree and no CoW staging extents. */
	if (!xfs_has_reflink(sc->mp))
		return 0;

	xagb_bitmap_init(&refcountbt_blocks);
	xagb_bitmap_init(&cow_blocks);

	/* refcountbt */
	error = xagb_bitmap_set_btblocks(&refcountbt_blocks, sc->sa.refc_cur);
	if (error)
		goto out_bitmap;

	/* Collect rmaps for CoW staging extents. */
	error = xfs_refcount_query_range(sc->sa.refc_cur, &low, &high,
			xrep_rmap_walk_cowblocks, &cow_blocks);
	if (error)
		goto out_bitmap;

	/* Generate rmaps for everything. */
	error = xrep_rmap_stash_bitmap(rr, &cow_blocks, &XFS_RMAP_OINFO_COW);
	if (error)
		goto out_bitmap;
	error = xrep_rmap_stash_bitmap(rr, &refcountbt_blocks,
			&XFS_RMAP_OINFO_REFC);

out_bitmap:
	xagb_bitmap_destroy(&cow_blocks);
	xagb_bitmap_destroy(&refcountbt_blocks);
	return error;
}
849 
850 /* Generate rmaps for the AG headers (AGI/AGF/AGFL) */
851 STATIC int
852 xrep_rmap_find_agheader_rmaps(
853 	struct xrep_rmap	*rr)
854 {
855 	struct xfs_scrub	*sc = rr->sc;
856 
857 	/* Create a record for the AG sb->agfl. */
858 	return xrep_rmap_stash(rr, XFS_SB_BLOCK(sc->mp),
859 			XFS_AGFL_BLOCK(sc->mp) - XFS_SB_BLOCK(sc->mp) + 1,
860 			XFS_RMAP_OWN_FS, 0, 0);
861 }
862 
863 /* Generate rmaps for the log, if it's in this AG. */
864 STATIC int
865 xrep_rmap_find_log_rmaps(
866 	struct xrep_rmap	*rr)
867 {
868 	struct xfs_scrub	*sc = rr->sc;
869 
870 	if (!xfs_ag_contains_log(sc->mp, pag_agno(sc->sa.pag)))
871 		return 0;
872 
873 	return xrep_rmap_stash(rr,
874 			XFS_FSB_TO_AGBNO(sc->mp, sc->mp->m_sb.sb_logstart),
875 			sc->mp->m_sb.sb_logblocks, XFS_RMAP_OWN_LOG, 0, 0);
876 }
877 
878 /* Check and count all the records that we gathered. */
879 STATIC int
880 xrep_rmap_check_record(
881 	struct xfs_btree_cur		*cur,
882 	const struct xfs_rmap_irec	*rec,
883 	void				*priv)
884 {
885 	struct xrep_rmap		*rr = priv;
886 	int				error;
887 
888 	error = xrep_rmap_check_mapping(rr->sc, rec);
889 	if (error)
890 		return error;
891 
892 	rr->nr_records++;
893 	return 0;
894 }
895 
/*
 * Generate all the reverse-mappings for this AG, a list of the old rmapbt
 * blocks, and the new btreeblks count.  Figure out if we have enough free
 * space to reconstruct the inode btrees.  The caller must clean up the lists
 * if anything goes wrong.  This implements section (I) above.
 */
STATIC int
xrep_rmap_find_rmaps(
	struct xrep_rmap	*rr)
{
	struct xfs_scrub	*sc = rr->sc;
	struct xchk_ag		*sa = &sc->sa;
	struct xfs_inode	*ip;
	struct xfs_btree_cur	*mcur;
	int			error;

	/* Find all the per-AG metadata. */
	xrep_ag_btcur_init(sc, &sc->sa);

	error = xrep_rmap_find_inode_rmaps(rr);
	if (error)
		goto end_agscan;

	error = xrep_rmap_find_refcount_rmaps(rr);
	if (error)
		goto end_agscan;

	error = xrep_rmap_find_agheader_rmaps(rr);
	if (error)
		goto end_agscan;

	error = xrep_rmap_find_log_rmaps(rr);
end_agscan:
	xchk_ag_btcur_free(&sc->sa);
	if (error)
		return error;

	/*
	 * Set up for a potentially lengthy filesystem scan by reducing our
	 * transaction resource usage for the duration.  Specifically:
	 *
	 * Unlock the AG header buffers and cancel the transaction to release
	 * the log grant space while we scan the filesystem.
	 *
	 * Create a new empty transaction to eliminate the possibility of the
	 * inode scan deadlocking on cyclical metadata.
	 *
	 * We pass the empty transaction to the file scanning function to avoid
	 * repeatedly cycling empty transactions.  This can be done even though
	 * we take the IOLOCK to quiesce the file because empty transactions
	 * do not take sb_internal.
	 */
	sa->agf_bp = NULL;
	sa->agi_bp = NULL;
	xchk_trans_cancel(sc);
	xchk_trans_alloc_empty(sc);

	/* Iterate all AGs for inodes rmaps. */
	while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) {
		error = xrep_rmap_scan_inode(rr, ip);
		xchk_irele(sc, ip);
		if (error)
			break;

		if (xchk_should_terminate(sc, &error))
			break;
	}
	xchk_iscan_iter_finish(&rr->iscan);
	if (error)
		return error;

	/*
	 * Switch out for a real transaction and lock the AG headers in
	 * preparation for building a new tree.
	 */
	xchk_trans_cancel(sc);
	error = xchk_setup_fs(sc);
	if (error)
		return error;
	error = xchk_perag_drain_and_lock(sc);
	if (error)
		return error;

	/*
	 * If a hook failed to update the in-memory btree, we lack the data to
	 * continue the repair.
	 */
	if (xchk_iscan_aborted(&rr->iscan))
		return -EFSCORRUPTED;

	/*
	 * Now that we have everything locked again, we need to count the
	 * number of rmap records stashed in the btree.  This should reflect
	 * all actively-owned space in the filesystem.  At the same time, check
	 * all our records before we start building a new btree, which requires
	 * a bnobt cursor.
	 */
	mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree);
	sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
			sc->sa.pag);

	rr->nr_records = 0;
	error = xfs_rmap_query_all(mcur, xrep_rmap_check_record, rr);

	xfs_btree_del_cursor(sc->sa.bno_cur, error);
	sc->sa.bno_cur = NULL;
	xfs_btree_del_cursor(mcur, error);

	return error;
}
1006 
1007 /* Section (II): Reserving space for new rmapbt and setting free space bitmap */
1008 
/* Context for recording AGFL blocks in a bitmap. */
struct xrep_rmap_agfl {
	struct xagb_bitmap	*bitmap;
	xfs_agnumber_t		agno;
};
1013 
1014 /* Add an AGFL block to the rmap list. */
1015 STATIC int
1016 xrep_rmap_walk_agfl(
1017 	struct xfs_mount	*mp,
1018 	xfs_agblock_t		agbno,
1019 	void			*priv)
1020 {
1021 	struct xrep_rmap_agfl	*ra = priv;
1022 
1023 	return xagb_bitmap_set(ra->bitmap, agbno, 1);
1024 }
1025 
1026 /*
1027  * Run one round of reserving space for the new rmapbt and recomputing the
1028  * number of blocks needed to store the previously observed rmapbt records and
1029  * the ones we'll create for the free space metadata.  When we don't need more
1030  * blocks, return a bitmap of OWN_AG extents in @freesp_blocks and set @done to
1031  * true.
1032  */
1033 STATIC int
1034 xrep_rmap_try_reserve(
1035 	struct xrep_rmap	*rr,
1036 	struct xfs_btree_cur	*rmap_cur,
1037 	struct xagb_bitmap	*freesp_blocks,
1038 	uint64_t		*blocks_reserved,
1039 	bool			*done)
1040 {
1041 	struct xrep_rmap_agfl	ra = {
1042 		.bitmap		= freesp_blocks,
1043 		.agno		= pag_agno(rr->sc->sa.pag),
1044 	};
1045 	struct xfs_scrub	*sc = rr->sc;
1046 	struct xrep_newbt_resv	*resv, *n;
1047 	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
1048 	struct xfs_buf		*agfl_bp;
1049 	uint64_t		nr_blocks;	/* RMB */
1050 	uint64_t		freesp_records;
1051 	int			error;
1052 
1053 	/*
1054 	 * We're going to recompute new_btree.bload.nr_blocks at the end of
1055 	 * this function to reflect however many btree blocks we need to store
1056 	 * all the rmap records (including the ones that reflect the changes we
1057 	 * made to support the new rmapbt blocks), so we save the old value
1058 	 * here so we can decide if we've reserved enough blocks.
1059 	 */
1060 	nr_blocks = rr->new_btree.bload.nr_blocks;
1061 
1062 	/*
1063 	 * Make sure we've reserved enough space for the new btree.  This can
1064 	 * change the shape of the free space btrees, which can cause secondary
1065 	 * interactions with the rmap records because all three space btrees
1066 	 * have the same rmap owner.  We'll account for all that below.
1067 	 */
1068 	error = xrep_newbt_alloc_blocks(&rr->new_btree,
1069 			nr_blocks - *blocks_reserved);
1070 	if (error)
1071 		return error;
1072 
1073 	*blocks_reserved = rr->new_btree.bload.nr_blocks;
1074 
1075 	/* Clear everything in the bitmap. */
1076 	xagb_bitmap_destroy(freesp_blocks);
1077 
1078 	/* Set all the bnobt blocks in the bitmap. */
1079 	sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
1080 			sc->sa.pag);
1081 	error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.bno_cur);
1082 	xfs_btree_del_cursor(sc->sa.bno_cur, error);
1083 	sc->sa.bno_cur = NULL;
1084 	if (error)
1085 		return error;
1086 
1087 	/* Set all the cntbt blocks in the bitmap. */
1088 	sc->sa.cnt_cur = xfs_cntbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
1089 			sc->sa.pag);
1090 	error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.cnt_cur);
1091 	xfs_btree_del_cursor(sc->sa.cnt_cur, error);
1092 	sc->sa.cnt_cur = NULL;
1093 	if (error)
1094 		return error;
1095 
1096 	/* Record our new btreeblks value. */
1097 	rr->freesp_btblocks = xagb_bitmap_hweight(freesp_blocks) - 2;
1098 
1099 	/* Set all the new rmapbt blocks in the bitmap. */
1100 	list_for_each_entry_safe(resv, n, &rr->new_btree.resv_list, list) {
1101 		error = xagb_bitmap_set(freesp_blocks, resv->agbno, resv->len);
1102 		if (error)
1103 			return error;
1104 	}
1105 
1106 	/* Set all the AGFL blocks in the bitmap. */
1107 	error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
1108 	if (error)
1109 		return error;
1110 
1111 	error = xfs_agfl_walk(sc->mp, agf, agfl_bp, xrep_rmap_walk_agfl, &ra);
1112 	if (error)
1113 		return error;
1114 
1115 	/* Count the extents in the bitmap. */
1116 	freesp_records = xagb_bitmap_count_set_regions(freesp_blocks);
1117 
1118 	/* Compute how many blocks we'll need for all the rmaps. */
1119 	error = xfs_btree_bload_compute_geometry(rmap_cur,
1120 			&rr->new_btree.bload, rr->nr_records + freesp_records);
1121 	if (error)
1122 		return error;
1123 
1124 	/* We're done when we don't need more blocks. */
1125 	*done = nr_blocks >= rr->new_btree.bload.nr_blocks;
1126 	return 0;
1127 }
1128 
1129 /*
1130  * Iteratively reserve space for rmap btree while recording OWN_AG rmaps for
1131  * the free space metadata.  This implements section (II) above.
1132  */
1133 STATIC int
1134 xrep_rmap_reserve_space(
1135 	struct xrep_rmap	*rr,
1136 	struct xfs_btree_cur	*rmap_cur)
1137 {
1138 	struct xagb_bitmap	freesp_blocks;	/* AGBIT */
1139 	uint64_t		blocks_reserved = 0;
1140 	bool			done = false;
1141 	int			error;
1142 
1143 	/* Compute how many blocks we'll need for the rmaps collected so far. */
1144 	error = xfs_btree_bload_compute_geometry(rmap_cur,
1145 			&rr->new_btree.bload, rr->nr_records);
1146 	if (error)
1147 		return error;
1148 
1149 	/* Last chance to abort before we start committing fixes. */
1150 	if (xchk_should_terminate(rr->sc, &error))
1151 		return error;
1152 
1153 	xagb_bitmap_init(&freesp_blocks);
1154 
1155 	/*
1156 	 * Iteratively reserve space for the new rmapbt and recompute the
1157 	 * number of blocks needed to store the previously observed rmapbt
1158 	 * records and the ones we'll create for the free space metadata.
1159 	 * Finish when we don't need more blocks.
1160 	 */
1161 	do {
1162 		error = xrep_rmap_try_reserve(rr, rmap_cur, &freesp_blocks,
1163 				&blocks_reserved, &done);
1164 		if (error)
1165 			goto out_bitmap;
1166 	} while (!done);
1167 
1168 	/* Emit rmaps for everything in the free space bitmap. */
1169 	xrep_ag_btcur_init(rr->sc, &rr->sc->sa);
1170 	error = xrep_rmap_stash_bitmap(rr, &freesp_blocks, &XFS_RMAP_OINFO_AG);
1171 	xchk_ag_btcur_free(&rr->sc->sa);
1172 
1173 out_bitmap:
1174 	xagb_bitmap_destroy(&freesp_blocks);
1175 	return error;
1176 }
1177 
1178 /* Section (III): Building the new rmap btree. */
1179 
/* Update the AGF counters. */
STATIC int
xrep_rmap_reset_counters(
	struct xrep_rmap	*rr)
{
	struct xfs_scrub	*sc = rr->sc;
	struct xfs_perag	*pag = sc->sa.pag;
	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
	xfs_agblock_t		rmap_btblocks;

	/*
	 * The AGF header contains extra information related to the reverse
	 * mapping btree, so we must update those fields here.  The new
	 * btreeblks value is the free space btree block count computed during
	 * reservation plus the new rmapbt's blocks; NOTE(review): the -1
	 * presumably excludes the rmapbt root from the btreeblks count --
	 * confirm against agf_btreeblks semantics.
	 */
	rmap_btblocks = rr->new_btree.afake.af_blocks - 1;
	agf->agf_btreeblks = cpu_to_be32(rr->freesp_btblocks + rmap_btblocks);
	xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS);

	/*
	 * After we commit the new btree to disk, it is possible that the
	 * process to reap the old btree blocks will race with the AIL trying
	 * to checkpoint the old btree blocks into the filesystem.  If the new
	 * tree is shorter than the old one, the rmapbt write verifier will
	 * fail and the AIL will shut down the filesystem.
	 *
	 * To avoid this, save the old incore btree height values as the alt
	 * height values before re-initializing the perag info from the updated
	 * AGF to capture all the new values.
	 */
	pag->pagf_repair_rmap_level = pag->pagf_rmap_level;

	/* Reinitialize with the values we just logged. */
	return xrep_reinit_pagf(sc);
}
1214 
/*
 * Retrieve rmapbt data for bulk load.  The bulk loader calls this to fill
 * @nr_wanted record slots of @block starting at @idx; records are copied
 * from the in-memory btree cursor (rr->mcur), which was positioned at the
 * left edge of the tree before loading started so that the first increment
 * lands on the first record.  Returns the number of records loaded, or a
 * negative errno; running out of records early is corruption.
 */
STATIC int
xrep_rmap_get_records(
	struct xfs_btree_cur	*cur,
	unsigned int		idx,
	struct xfs_btree_block	*block,
	unsigned int		nr_wanted,
	void			*priv)
{
	struct xrep_rmap	*rr = priv;
	union xfs_btree_rec	*block_rec;
	unsigned int		loaded;
	int			error;

	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
		int		stat = 0;

		/* Step to the next record in the in-memory btree. */
		error = xfs_btree_increment(rr->mcur, 0, &stat);
		if (error)
			return error;
		if (!stat)
			return -EFSCORRUPTED;

		/* Read it into the staging cursor's record area... */
		error = xfs_rmap_get_rec(rr->mcur, &cur->bc_rec.r, &stat);
		if (error)
			return error;
		if (!stat)
			return -EFSCORRUPTED;

		/* ...and format it into the new on-disk block. */
		block_rec = xfs_btree_rec_addr(cur, idx, block);
		cur->bc_ops->init_rec_from_cur(cur, block_rec);
	}

	return loaded;
}
1250 
1251 /* Feed one of the new btree blocks to the bulk loader. */
1252 STATIC int
1253 xrep_rmap_claim_block(
1254 	struct xfs_btree_cur	*cur,
1255 	union xfs_btree_ptr	*ptr,
1256 	void			*priv)
1257 {
1258 	struct xrep_rmap        *rr = priv;
1259 
1260 	return xrep_newbt_claim_block(cur, &rr->new_btree, ptr);
1261 }
1262 
/*
 * Custom allocation function for new rmap btrees.  @alloc_hint is the
 * fsblock near which we would like the allocation to land.
 */
STATIC int
xrep_rmap_alloc_vextent(
	struct xfs_scrub	*sc,
	struct xfs_alloc_arg	*args,
	xfs_fsblock_t		alloc_hint)
{
	int			error;

	/*
	 * We don't want an rmap update on the allocation, since we iteratively
	 * compute the OWN_AG records /after/ allocating blocks for the records
	 * that we already know we need to store.  Therefore, fix the freelist
	 * with the NORMAP flag set so that we don't also try to create an rmap
	 * for new AGFL blocks.
	 */
	error = xrep_fix_freelist(sc, XFS_ALLOC_FLAG_NORMAP);
	if (error)
		return error;

	/*
	 * If xrep_fix_freelist fixed the freelist by moving blocks from the
	 * free space btrees or by removing blocks from the AGFL and queueing
	 * an EFI to free the block, the transaction will be dirty.  This
	 * second case is of interest to us.
	 *
	 * Later on, we will need to compare gaps in the new recordset against
	 * the block usage of all OWN_AG owners in order to free the old
	 * btree's blocks, which means that we can't have EFIs for former AGFL
	 * blocks attached to the repair transaction when we commit the new
	 * btree.
	 *
	 * xrep_newbt_alloc_blocks guarantees this for us by calling
	 * xrep_defer_finish to commit anything that fix_freelist may have
	 * added to the transaction.
	 */
	return xfs_alloc_vextent_near_bno(args, alloc_hint);
}
1301 
1302 
1303 /* Count the records in this btree. */
1304 STATIC int
1305 xrep_rmap_count_records(
1306 	struct xfs_btree_cur	*cur,
1307 	unsigned long long	*nr)
1308 {
1309 	int			running = 1;
1310 	int			error;
1311 
1312 	*nr = 0;
1313 
1314 	error = xfs_btree_goto_left_edge(cur);
1315 	if (error)
1316 		return error;
1317 
1318 	while (running && !(error = xfs_btree_increment(cur, 0, &running))) {
1319 		if (running)
1320 			(*nr)++;
1321 	}
1322 
1323 	return error;
1324 }
1325 /*
1326  * Use the collected rmap information to stage a new rmap btree.  If this is
1327  * successful we'll return with the new btree root information logged to the
1328  * repair transaction but not yet committed.  This implements section (III)
1329  * above.
1330  */
1331 STATIC int
1332 xrep_rmap_build_new_tree(
1333 	struct xrep_rmap	*rr)
1334 {
1335 	struct xfs_scrub	*sc = rr->sc;
1336 	struct xfs_perag	*pag = sc->sa.pag;
1337 	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
1338 	struct xfs_btree_cur	*rmap_cur;
1339 	int			error;
1340 
1341 	/*
1342 	 * Preserve the old rmapbt block count so that we can adjust the
1343 	 * per-AG rmapbt reservation after we commit the new btree root and
1344 	 * want to dispose of the old btree blocks.
1345 	 */
1346 	rr->old_rmapbt_fsbcount = be32_to_cpu(agf->agf_rmap_blocks);
1347 
1348 	/*
1349 	 * Prepare to construct the new btree by reserving disk space for the
1350 	 * new btree and setting up all the accounting information we'll need
1351 	 * to root the new btree while it's under construction and before we
1352 	 * attach it to the AG header.  The new blocks are accounted to the
1353 	 * rmapbt per-AG reservation, which we will adjust further after
1354 	 * committing the new btree.
1355 	 */
1356 	xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_SKIP_UPDATE,
1357 			xfs_agbno_to_fsb(pag, XFS_RMAP_BLOCK(sc->mp)),
1358 			XFS_AG_RESV_RMAPBT);
1359 	rr->new_btree.bload.get_records = xrep_rmap_get_records;
1360 	rr->new_btree.bload.claim_block = xrep_rmap_claim_block;
1361 	rr->new_btree.alloc_vextent = xrep_rmap_alloc_vextent;
1362 	rmap_cur = xfs_rmapbt_init_cursor(sc->mp, NULL, NULL, pag);
1363 	xfs_btree_stage_afakeroot(rmap_cur, &rr->new_btree.afake);
1364 
1365 	/*
1366 	 * Initialize @rr->new_btree, reserve space for the new rmapbt,
1367 	 * and compute OWN_AG rmaps.
1368 	 */
1369 	error = xrep_rmap_reserve_space(rr, rmap_cur);
1370 	if (error)
1371 		goto err_cur;
1372 
1373 	/*
1374 	 * Count the rmapbt records again, because the space reservation
1375 	 * for the rmapbt itself probably added more records to the btree.
1376 	 */
1377 	rr->mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL,
1378 			&rr->rmap_btree);
1379 
1380 	error = xrep_rmap_count_records(rr->mcur, &rr->nr_records);
1381 	if (error)
1382 		goto err_mcur;
1383 
1384 	/*
1385 	 * Due to btree slack factors, it's possible for a new btree to be one
1386 	 * level taller than the old btree.  Update the incore btree height so
1387 	 * that we don't trip the verifiers when writing the new btree blocks
1388 	 * to disk.
1389 	 */
1390 	pag->pagf_repair_rmap_level = rr->new_btree.bload.btree_height;
1391 
1392 	/*
1393 	 * Move the cursor to the left edge of the tree so that the first
1394 	 * increment in ->get_records positions us at the first record.
1395 	 */
1396 	error = xfs_btree_goto_left_edge(rr->mcur);
1397 	if (error)
1398 		goto err_level;
1399 
1400 	/* Add all observed rmap records. */
1401 	error = xfs_btree_bload(rmap_cur, &rr->new_btree.bload, rr);
1402 	if (error)
1403 		goto err_level;
1404 
1405 	/*
1406 	 * Install the new btree in the AG header.  After this point the old
1407 	 * btree is no longer accessible and the new tree is live.
1408 	 */
1409 	xfs_rmapbt_commit_staged_btree(rmap_cur, sc->tp, sc->sa.agf_bp);
1410 	xfs_btree_del_cursor(rmap_cur, 0);
1411 	xfs_btree_del_cursor(rr->mcur, 0);
1412 	rr->mcur = NULL;
1413 
1414 	/*
1415 	 * Now that we've written the new btree to disk, we don't need to keep
1416 	 * updating the in-memory btree.  Abort the scan to stop live updates.
1417 	 */
1418 	xchk_iscan_abort(&rr->iscan);
1419 
1420 	/*
1421 	 * The newly committed rmap recordset includes mappings for the blocks
1422 	 * that we reserved to build the new btree.  If there is excess space
1423 	 * reservation to be freed, the corresponding rmap records must also be
1424 	 * removed.
1425 	 */
1426 	rr->new_btree.oinfo = XFS_RMAP_OINFO_AG;
1427 
1428 	/* Reset the AGF counters now that we've changed the btree shape. */
1429 	error = xrep_rmap_reset_counters(rr);
1430 	if (error)
1431 		goto err_newbt;
1432 
1433 	/* Dispose of any unused blocks and the accounting information. */
1434 	error = xrep_newbt_commit(&rr->new_btree);
1435 	if (error)
1436 		return error;
1437 
1438 	return xrep_roll_ag_trans(sc);
1439 
1440 err_level:
1441 	pag->pagf_repair_rmap_level = 0;
1442 err_mcur:
1443 	xfs_btree_del_cursor(rr->mcur, error);
1444 err_cur:
1445 	xfs_btree_del_cursor(rmap_cur, error);
1446 err_newbt:
1447 	xrep_newbt_cancel(&rr->new_btree);
1448 	return error;
1449 }
1450 
1451 /* Section (IV): Reaping the old btree. */
1452 
struct xrep_rmap_find_gaps {
	struct xagb_bitmap	rmap_gaps;	/* agblocks covered by no rmap */
	xfs_agblock_t		next_agbno;	/* next agbno after the last rmap seen */
};
1457 
1458 /* Subtract each free extent in the bnobt from the rmap gaps. */
1459 STATIC int
1460 xrep_rmap_find_freesp(
1461 	struct xfs_btree_cur		*cur,
1462 	const struct xfs_alloc_rec_incore *rec,
1463 	void				*priv)
1464 {
1465 	struct xrep_rmap_find_gaps	*rfg = priv;
1466 
1467 	return xagb_bitmap_clear(&rfg->rmap_gaps, rec->ar_startblock,
1468 			rec->ar_blockcount);
1469 }
1470 
1471 /* Record the free space we find, as part of cleaning out the btree. */
1472 STATIC int
1473 xrep_rmap_find_gaps(
1474 	struct xfs_btree_cur		*cur,
1475 	const struct xfs_rmap_irec	*rec,
1476 	void				*priv)
1477 {
1478 	struct xrep_rmap_find_gaps	*rfg = priv;
1479 	int				error;
1480 
1481 	if (rec->rm_startblock > rfg->next_agbno) {
1482 		error = xagb_bitmap_set(&rfg->rmap_gaps, rfg->next_agbno,
1483 				rec->rm_startblock - rfg->next_agbno);
1484 		if (error)
1485 			return error;
1486 	}
1487 
1488 	rfg->next_agbno = max_t(xfs_agblock_t, rfg->next_agbno,
1489 				rec->rm_startblock + rec->rm_blockcount);
1490 	return 0;
1491 }
1492 
1493 /*
1494  * Reap the old rmapbt blocks.  Now that the rmapbt is fully rebuilt, we make
1495  * a list of gaps in the rmap records and a list of the extents mentioned in
1496  * the bnobt.  Any block that's in the new rmapbt gap list but not mentioned
1497  * in the bnobt is a block from the old rmapbt and can be removed.
1498  */
1499 STATIC int
1500 xrep_rmap_remove_old_tree(
1501 	struct xrep_rmap	*rr)
1502 {
1503 	struct xrep_rmap_find_gaps rfg = {
1504 		.next_agbno	= 0,
1505 	};
1506 	struct xfs_scrub	*sc = rr->sc;
1507 	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
1508 	struct xfs_perag	*pag = sc->sa.pag;
1509 	struct xfs_btree_cur	*mcur;
1510 	xfs_agblock_t		agend;
1511 	int			error;
1512 
1513 	xagb_bitmap_init(&rfg.rmap_gaps);
1514 
1515 	/* Compute free space from the new rmapbt. */
1516 	mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree);
1517 
1518 	error = xfs_rmap_query_all(mcur, xrep_rmap_find_gaps, &rfg);
1519 	xfs_btree_del_cursor(mcur, error);
1520 	if (error)
1521 		goto out_bitmap;
1522 
1523 	/* Insert a record for space between the last rmap and EOAG. */
1524 	agend = be32_to_cpu(agf->agf_length);
1525 	if (rfg.next_agbno < agend) {
1526 		error = xagb_bitmap_set(&rfg.rmap_gaps, rfg.next_agbno,
1527 				agend - rfg.next_agbno);
1528 		if (error)
1529 			goto out_bitmap;
1530 	}
1531 
1532 	/* Compute free space from the existing bnobt. */
1533 	sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
1534 			sc->sa.pag);
1535 	error = xfs_alloc_query_all(sc->sa.bno_cur, xrep_rmap_find_freesp,
1536 			&rfg);
1537 	xfs_btree_del_cursor(sc->sa.bno_cur, error);
1538 	sc->sa.bno_cur = NULL;
1539 	if (error)
1540 		goto out_bitmap;
1541 
1542 	/*
1543 	 * Free the "free" blocks that the new rmapbt knows about but the bnobt
1544 	 * doesn't--these are the old rmapbt blocks.  Credit the old rmapbt
1545 	 * block usage count back to the per-AG rmapbt reservation (and not
1546 	 * fdblocks, since the rmap btree lives in free space) to keep the
1547 	 * reservation and free space accounting correct.
1548 	 */
1549 	error = xrep_reap_agblocks(sc, &rfg.rmap_gaps,
1550 			&XFS_RMAP_OINFO_ANY_OWNER, XFS_AG_RESV_RMAPBT);
1551 	if (error)
1552 		goto out_bitmap;
1553 
1554 	/*
1555 	 * Now that we've zapped all the old rmapbt blocks we can turn off
1556 	 * the alternate height mechanism and reset the per-AG space
1557 	 * reservation.
1558 	 */
1559 	pag->pagf_repair_rmap_level = 0;
1560 	sc->flags |= XREP_RESET_PERAG_RESV;
1561 out_bitmap:
1562 	xagb_bitmap_destroy(&rfg.rmap_gaps);
1563 	return error;
1564 }
1565 
1566 static inline bool
1567 xrep_rmapbt_want_live_update(
1568 	struct xchk_iscan		*iscan,
1569 	const struct xfs_owner_info	*oi)
1570 {
1571 	if (xchk_iscan_aborted(iscan))
1572 		return false;
1573 
1574 	/*
1575 	 * Before unlocking the AG header to perform the inode scan, we
1576 	 * recorded reverse mappings for all AG metadata except for the OWN_AG
1577 	 * metadata.  IOWs, the in-memory btree knows about the AG headers, the
1578 	 * two inode btrees, the CoW staging extents, and the refcount btrees.
1579 	 * For these types of metadata, we need to record the live updates in
1580 	 * the in-memory rmap btree.
1581 	 *
1582 	 * However, we do not scan the free space btrees or the AGFL until we
1583 	 * have re-locked the AGF and are ready to reserve space for the new
1584 	 * rmap btree, so we do not want live updates for OWN_AG metadata.
1585 	 */
1586 	if (XFS_RMAP_NON_INODE_OWNER(oi->oi_owner))
1587 		return oi->oi_owner != XFS_RMAP_OWN_AG;
1588 
1589 	/* Ignore updates to files that the scanner hasn't visited yet. */
1590 	return xchk_iscan_want_live_update(iscan, oi->oi_owner);
1591 }
1592 
1593 /*
1594  * Apply a rmapbt update from the regular filesystem into our shadow btree.
1595  * We're running from the thread that owns the AGF buffer and is generating
1596  * the update, so we must be careful about which parts of the struct xrep_rmap
1597  * that we change.
1598  */
1599 static int
1600 xrep_rmapbt_live_update(
1601 	struct notifier_block		*nb,
1602 	unsigned long			action,
1603 	void				*data)
1604 {
1605 	struct xfs_rmap_update_params	*p = data;
1606 	struct xrep_rmap		*rr;
1607 	struct xfs_mount		*mp;
1608 	struct xfs_btree_cur		*mcur;
1609 	struct xfs_trans		*tp;
1610 	int				error;
1611 
1612 	rr = container_of(nb, struct xrep_rmap, rhook.rmap_hook.nb);
1613 	mp = rr->sc->mp;
1614 
1615 	if (!xrep_rmapbt_want_live_update(&rr->iscan, &p->oinfo))
1616 		goto out_unlock;
1617 
1618 	trace_xrep_rmap_live_update(pag_group(rr->sc->sa.pag), action, p);
1619 
1620 	tp = xfs_trans_alloc_empty(mp);
1621 
1622 	mutex_lock(&rr->lock);
1623 	mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, tp, &rr->rmap_btree);
1624 	error = __xfs_rmap_finish_intent(mcur, action, p->startblock,
1625 			p->blockcount, &p->oinfo, p->unwritten);
1626 	xfs_btree_del_cursor(mcur, error);
1627 	if (error)
1628 		goto out_cancel;
1629 
1630 	error = xfbtree_trans_commit(&rr->rmap_btree, tp);
1631 	if (error)
1632 		goto out_cancel;
1633 
1634 	xfs_trans_cancel(tp);
1635 	mutex_unlock(&rr->lock);
1636 	return NOTIFY_DONE;
1637 
1638 out_cancel:
1639 	xfbtree_trans_cancel(&rr->rmap_btree, tp);
1640 	xfs_trans_cancel(tp);
1641 	mutex_unlock(&rr->lock);
1642 	xchk_iscan_abort(&rr->iscan);
1643 out_unlock:
1644 	return NOTIFY_DONE;
1645 }
1646 
/* Set up the filesystem scan components. */
STATIC int
xrep_rmap_setup_scan(
	struct xrep_rmap	*rr)
{
	struct xfs_scrub	*sc = rr->sc;
	int			error;

	/* Serializes live update hook calls against the rest of the repair. */
	mutex_init(&rr->lock);

	/* Set up in-memory rmap btree */
	error = xfs_rmapbt_mem_init(sc->mp, &rr->rmap_btree, sc->xmbtp,
			pag_agno(sc->sa.pag));
	if (error)
		goto out_mutex;

	/* Retry iget every tenth of a second for up to 30 seconds. */
	xchk_iscan_start(sc, 30000, 100, &rr->iscan);

	/*
	 * Hook into live rmap operations so that we can update our in-memory
	 * btree to reflect live changes on the filesystem.  Since we drop the
	 * AGF buffer to scan all the inodes, we need this piece to avoid
	 * installing a stale btree.
	 */
	ASSERT(sc->flags & XCHK_FSGATES_RMAP);
	xfs_rmap_hook_setup(&rr->rhook, xrep_rmapbt_live_update);
	error = xfs_rmap_hook_add(pag_group(sc->sa.pag), &rr->rhook);
	if (error)
		goto out_iscan;
	return 0;

out_iscan:
	/* Unwind everything constructed above, in reverse order. */
	xchk_iscan_teardown(&rr->iscan);
	xfbtree_destroy(&rr->rmap_btree);
out_mutex:
	mutex_destroy(&rr->lock);
	return error;
}
1686 
/* Tear down scan components. */
STATIC void
xrep_rmap_teardown(
	struct xrep_rmap	*rr)
{
	struct xfs_scrub	*sc = rr->sc;

	/*
	 * Abort the scan first so the live update hook stops applying
	 * changes to the in-memory btree before we unhook and destroy it.
	 */
	xchk_iscan_abort(&rr->iscan);
	xfs_rmap_hook_del(pag_group(sc->sa.pag), &rr->rhook);
	xchk_iscan_teardown(&rr->iscan);
	xfbtree_destroy(&rr->rmap_btree);
	mutex_destroy(&rr->lock);
}
1700 
1701 /* Repair the rmap btree for some AG. */
1702 int
1703 xrep_rmapbt(
1704 	struct xfs_scrub	*sc)
1705 {
1706 	struct xrep_rmap	*rr = sc->buf;
1707 	int			error;
1708 
1709 	error = xrep_rmap_setup_scan(rr);
1710 	if (error)
1711 		return error;
1712 
1713 	/*
1714 	 * Collect rmaps for everything in this AG that isn't space metadata.
1715 	 * These rmaps won't change even as we try to allocate blocks.
1716 	 */
1717 	error = xrep_rmap_find_rmaps(rr);
1718 	if (error)
1719 		goto out_records;
1720 
1721 	/* Rebuild the rmap information. */
1722 	error = xrep_rmap_build_new_tree(rr);
1723 	if (error)
1724 		goto out_records;
1725 
1726 	/* Kill the old tree. */
1727 	error = xrep_rmap_remove_old_tree(rr);
1728 	if (error)
1729 		goto out_records;
1730 
1731 out_records:
1732 	xrep_rmap_teardown(rr);
1733 	return error;
1734 }
1735