xref: /linux/fs/xfs/scrub/rmap_repair.c (revision ae22a94997b8a03dcb3c922857c203246711f9d4)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (c) 2018-2024 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_defer.h"
13 #include "xfs_btree.h"
14 #include "xfs_btree_staging.h"
15 #include "xfs_buf_mem.h"
16 #include "xfs_btree_mem.h"
17 #include "xfs_bit.h"
18 #include "xfs_log_format.h"
19 #include "xfs_trans.h"
20 #include "xfs_sb.h"
21 #include "xfs_alloc.h"
22 #include "xfs_alloc_btree.h"
23 #include "xfs_ialloc.h"
24 #include "xfs_ialloc_btree.h"
25 #include "xfs_rmap.h"
26 #include "xfs_rmap_btree.h"
27 #include "xfs_inode.h"
28 #include "xfs_icache.h"
29 #include "xfs_bmap.h"
30 #include "xfs_bmap_btree.h"
31 #include "xfs_refcount.h"
32 #include "xfs_refcount_btree.h"
33 #include "xfs_ag.h"
34 #include "scrub/xfs_scrub.h"
35 #include "scrub/scrub.h"
36 #include "scrub/common.h"
37 #include "scrub/btree.h"
38 #include "scrub/trace.h"
39 #include "scrub/repair.h"
40 #include "scrub/bitmap.h"
41 #include "scrub/agb_bitmap.h"
42 #include "scrub/xfile.h"
43 #include "scrub/xfarray.h"
44 #include "scrub/iscan.h"
45 #include "scrub/newbt.h"
46 #include "scrub/reap.h"
47 
48 /*
49  * Reverse Mapping Btree Repair
50  * ============================
51  *
52  * This is the most involved of all the AG space btree rebuilds.  Everywhere
53  * else in XFS we lock inodes and then AG data structures, but generating the
54  * list of rmap records requires that we be able to scan both block mapping
55  * btrees of every inode in the filesystem to see if it owns any extents in
56  * this AG.  We can't tolerate any inode updates while we do this, so we
57  * freeze the filesystem to lock everyone else out, and grant ourselves
58  * special privileges to run transactions with regular background reclamation
59  * turned off.
60  *
61  * We also have to be very careful not to allow inode reclaim to start a
62  * transaction because all transactions (other than our own) will block.
63  * Deferred inode inactivation helps us out there.
64  *
65  * I) Reverse mappings for all non-space metadata and file data are collected
66  * according to the following algorithm:
67  *
68  * 1. For each fork of each inode:
69  * 1.1. Create a bitmap BMBIT to track bmbt blocks if necessary.
70  * 1.2. If the incore extent map isn't loaded, walk the bmbt to accumulate
71  *      bmaps into rmap records (see 1.1.4).  Set bits in BMBIT for each btree
72  *      block.
73  * 1.3. If the incore extent map is loaded but the fork is in btree format,
74  *      just visit the bmbt blocks to set the corresponding BMBIT areas.
75  * 1.4. From the incore extent map, accumulate each bmap that falls into our
76  *      target AG.  Remember, multiple bmap records can map to a single rmap
77  *      record, so we cannot simply emit rmap records 1:1.
78  * 1.5. Emit rmap records for each extent in BMBIT and free it.
79  * 2. Create bitmaps INOBIT and ICHUNKBIT.
80  * 3. For each record in the inobt, set the corresponding areas in ICHUNKBIT,
81  *    and set bits in INOBIT for each btree block.  If the inobt has no records
82  *    at all, we must be careful to record its root in INOBIT.
83  * 4. For each block in the finobt, set the corresponding INOBIT area.
84  * 5. Emit rmap records for each extent in INOBIT and ICHUNKBIT and free them.
85  * 6. Create bitmaps REFCBIT and COWBIT.
86  * 7. For each CoW staging extent in the refcountbt, set the corresponding
87  *    areas in COWBIT.
88  * 8. For each block in the refcountbt, set the corresponding REFCBIT area.
89  * 9. Emit rmap records for each extent in REFCBIT and COWBIT and free them.
90  * A. Emit rmap for the AG headers.
91  * B. Emit rmap for the log, if there is one.
92  *
93  * II) The rmapbt shape and space metadata rmaps are computed as follows:
94  *
95  * 1. Count the rmaps collected in the previous step. (= NR)
96  * 2. Estimate the number of rmapbt blocks needed to store NR records. (= RMB)
97  * 3. Reserve RMB blocks through the newbt using the allocator in normap mode.
98  * 4. Create bitmap AGBIT.
99  * 5. For each reservation in the newbt, set the corresponding areas in AGBIT.
100  * 6. For each block in the AGFL, bnobt, and cntbt, set the bits in AGBIT.
101  * 7. Count the extents in AGBIT. (= AGNR)
102  * 8. Estimate the number of rmapbt blocks needed for NR + AGNR rmaps. (= RMB')
103  * 9. If RMB' >= RMB, reserve RMB' - RMB more newbt blocks, set RMB = RMB',
104  *    and clear AGBIT.  Go to step 5.
105  * A. Emit rmaps for each extent in AGBIT.
106  *
107  * III) The rmapbt is constructed and set in place as follows:
108  *
109  * 1. Sort the rmap records.
110  * 2. Bulk load the rmaps.
111  *
112  * IV) Reap the old btree blocks.
113  *
114  * 1. Create a bitmap OLDRMBIT.
115  * 2. For each gap in the new rmapbt, set the corresponding areas of OLDRMBIT.
116  * 3. For each extent in the bnobt, clear the corresponding parts of OLDRMBIT.
117  * 4. Reap the extents corresponding to the set areas in OLDRMBIT.  These are
118  *    the parts of the AG that the rmap didn't find during its scan of the
119  *    primary metadata and aren't known to be in the free space, which implies
120  *    that they were the old rmapbt blocks.
121  * 5. Commit.
122  *
123  * We use the 'xrep_rmap' prefix for all the rmap functions.
124  */
125 
126 /* Context for collecting rmaps */
127 struct xrep_rmap {
128 	/* new rmapbt information */
129 	struct xrep_newbt	new_btree;
130 
131 	/* lock for the xfbtree and xfile */
132 	struct mutex		lock;
133 
134 	/* rmap records generated from primary metadata */
135 	struct xfbtree		rmap_btree;
136 
137 	struct xfs_scrub	*sc;
138 
139 	/* in-memory btree cursor for the xfs_btree_bload iteration */
140 	struct xfs_btree_cur	*mcur;
141 
142 	/* Hooks into rmap update code. */
143 	struct xfs_rmap_hook	rhook;
144 
145 	/* inode scan cursor */
146 	struct xchk_iscan	iscan;
147 
148 	/* Number of non-freespace records found. */
149 	unsigned long long	nr_records;
150 
151 	/* bnobt/cntbt contribution to btreeblks */
152 	xfs_agblock_t		freesp_btblocks;
153 
154 	/* old agf_rmap_blocks counter */
155 	unsigned int		old_rmapbt_fsbcount;
156 };
157 
158 /* Set us up to repair reverse mapping btrees. */
159 int
160 xrep_setup_ag_rmapbt(
161 	struct xfs_scrub	*sc)
162 {
163 	struct xrep_rmap	*rr;
164 	char			*descr;
165 	int			error;
166 
167 	xchk_fsgates_enable(sc, XCHK_FSGATES_RMAP);
168 
169 	descr = xchk_xfile_ag_descr(sc, "reverse mapping records");
170 	error = xrep_setup_xfbtree(sc, descr);
171 	kfree(descr);
172 	if (error)
173 		return error;
174 
175 	rr = kzalloc(sizeof(struct xrep_rmap), XCHK_GFP_FLAGS);
176 	if (!rr)
177 		return -ENOMEM;
178 
179 	rr->sc = sc;
180 	sc->buf = rr;
181 	return 0;
182 }
183 
184 /* Make sure there's nothing funny about this mapping. */
185 STATIC int
186 xrep_rmap_check_mapping(
187 	struct xfs_scrub	*sc,
188 	const struct xfs_rmap_irec *rec)
189 {
190 	enum xbtree_recpacking	outcome;
191 	int			error;
192 
193 	if (xfs_rmap_check_irec(sc->sa.pag, rec) != NULL)
194 		return -EFSCORRUPTED;
195 
196 	/* Make sure this isn't free space. */
197 	error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock,
198 			rec->rm_blockcount, &outcome);
199 	if (error)
200 		return error;
201 	if (outcome != XBTREE_RECPACKING_EMPTY)
202 		return -EFSCORRUPTED;
203 
204 	return 0;
205 }
206 
207 /* Store a reverse-mapping record. */
208 static inline int
209 xrep_rmap_stash(
210 	struct xrep_rmap	*rr,
211 	xfs_agblock_t		startblock,
212 	xfs_extlen_t		blockcount,
213 	uint64_t		owner,
214 	uint64_t		offset,
215 	unsigned int		flags)
216 {
217 	struct xfs_rmap_irec	rmap = {
218 		.rm_startblock	= startblock,
219 		.rm_blockcount	= blockcount,
220 		.rm_owner	= owner,
221 		.rm_offset	= offset,
222 		.rm_flags	= flags,
223 	};
224 	struct xfs_scrub	*sc = rr->sc;
225 	struct xfs_btree_cur	*mcur;
226 	int			error = 0;
227 
228 	if (xchk_should_terminate(sc, &error))
229 		return error;
230 
231 	if (xchk_iscan_aborted(&rr->iscan))
232 		return -EFSCORRUPTED;
233 
234 	trace_xrep_rmap_found(sc->mp, sc->sa.pag->pag_agno, &rmap);
235 
236 	mutex_lock(&rr->lock);
237 	mcur = xfs_rmapbt_mem_cursor(sc->sa.pag, sc->tp, &rr->rmap_btree);
238 	error = xfs_rmap_map_raw(mcur, &rmap);
239 	xfs_btree_del_cursor(mcur, error);
240 	if (error)
241 		goto out_cancel;
242 
243 	error = xfbtree_trans_commit(&rr->rmap_btree, sc->tp);
244 	if (error)
245 		goto out_abort;
246 
247 	mutex_unlock(&rr->lock);
248 	return 0;
249 
250 out_cancel:
251 	xfbtree_trans_cancel(&rr->rmap_btree, sc->tp);
252 out_abort:
253 	xchk_iscan_abort(&rr->iscan);
254 	mutex_unlock(&rr->lock);
255 	return error;
256 }
257 
258 struct xrep_rmap_stash_run {
259 	struct xrep_rmap	*rr;
260 	uint64_t		owner;
261 	unsigned int		rmap_flags;
262 };
263 
264 static int
265 xrep_rmap_stash_run(
266 	uint32_t			start,
267 	uint32_t			len,
268 	void				*priv)
269 {
270 	struct xrep_rmap_stash_run	*rsr = priv;
271 	struct xrep_rmap		*rr = rsr->rr;
272 
273 	return xrep_rmap_stash(rr, start, len, rsr->owner, 0, rsr->rmap_flags);
274 }
275 
276 /*
277  * Emit rmaps for every extent of bits set in the bitmap.  Caller must ensure
278  * that the ranges are in units of FS blocks.
279  */
280 STATIC int
281 xrep_rmap_stash_bitmap(
282 	struct xrep_rmap		*rr,
283 	struct xagb_bitmap		*bitmap,
284 	const struct xfs_owner_info	*oinfo)
285 {
286 	struct xrep_rmap_stash_run	rsr = {
287 		.rr			= rr,
288 		.owner			= oinfo->oi_owner,
289 		.rmap_flags		= 0,
290 	};
291 
292 	if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
293 		rsr.rmap_flags |= XFS_RMAP_ATTR_FORK;
294 	if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
295 		rsr.rmap_flags |= XFS_RMAP_BMBT_BLOCK;
296 
297 	return xagb_bitmap_walk(bitmap, xrep_rmap_stash_run, &rsr);
298 }
299 
300 /* Section (I): Finding all file and bmbt extents. */
301 
302 /* Context for accumulating rmaps for an inode fork. */
303 struct xrep_rmap_ifork {
304 	/*
305 	 * Accumulate rmap data here to turn multiple adjacent bmaps into a
306 	 * single rmap.
307 	 */
308 	struct xfs_rmap_irec	accum;
309 
310 	/* Bitmap of bmbt blocks in this AG. */
311 	struct xagb_bitmap	bmbt_blocks;
312 
313 	struct xrep_rmap	*rr;
314 
315 	/* Which inode fork? */
316 	int			whichfork;
317 };
318 
319 /* Stash an rmap that we accumulated while walking an inode fork. */
320 STATIC int
321 xrep_rmap_stash_accumulated(
322 	struct xrep_rmap_ifork	*rf)
323 {
324 	if (rf->accum.rm_blockcount == 0)
325 		return 0;
326 
327 	return xrep_rmap_stash(rf->rr, rf->accum.rm_startblock,
328 			rf->accum.rm_blockcount, rf->accum.rm_owner,
329 			rf->accum.rm_offset, rf->accum.rm_flags);
330 }
331 
332 /* Accumulate a bmbt record. */
333 STATIC int
334 xrep_rmap_visit_bmbt(
335 	struct xfs_btree_cur	*cur,
336 	struct xfs_bmbt_irec	*rec,
337 	void			*priv)
338 {
339 	struct xrep_rmap_ifork	*rf = priv;
340 	struct xfs_mount	*mp = rf->rr->sc->mp;
341 	struct xfs_rmap_irec	*accum = &rf->accum;
342 	xfs_agblock_t		agbno;
343 	unsigned int		rmap_flags = 0;
344 	int			error;
345 
346 	if (XFS_FSB_TO_AGNO(mp, rec->br_startblock) !=
347 			rf->rr->sc->sa.pag->pag_agno)
348 		return 0;
349 
350 	agbno = XFS_FSB_TO_AGBNO(mp, rec->br_startblock);
351 	if (rf->whichfork == XFS_ATTR_FORK)
352 		rmap_flags |= XFS_RMAP_ATTR_FORK;
353 	if (rec->br_state == XFS_EXT_UNWRITTEN)
354 		rmap_flags |= XFS_RMAP_UNWRITTEN;
355 
356 	/* If this bmap is adjacent to the previous one, just add it. */
357 	if (accum->rm_blockcount > 0 &&
358 	    rec->br_startoff == accum->rm_offset + accum->rm_blockcount &&
359 	    agbno == accum->rm_startblock + accum->rm_blockcount &&
360 	    rmap_flags == accum->rm_flags) {
361 		accum->rm_blockcount += rec->br_blockcount;
362 		return 0;
363 	}
364 
365 	/* Otherwise stash the old rmap and start accumulating a new one. */
366 	error = xrep_rmap_stash_accumulated(rf);
367 	if (error)
368 		return error;
369 
370 	accum->rm_startblock = agbno;
371 	accum->rm_blockcount = rec->br_blockcount;
372 	accum->rm_offset = rec->br_startoff;
373 	accum->rm_flags = rmap_flags;
374 	return 0;
375 }
376 
377 /* Add a btree block to the bitmap. */
378 STATIC int
379 xrep_rmap_visit_iroot_btree_block(
380 	struct xfs_btree_cur	*cur,
381 	int			level,
382 	void			*priv)
383 {
384 	struct xrep_rmap_ifork	*rf = priv;
385 	struct xfs_buf		*bp;
386 	xfs_fsblock_t		fsbno;
387 	xfs_agblock_t		agbno;
388 
389 	xfs_btree_get_block(cur, level, &bp);
390 	if (!bp)
391 		return 0;
392 
393 	fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
394 	if (XFS_FSB_TO_AGNO(cur->bc_mp, fsbno) != rf->rr->sc->sa.pag->pag_agno)
395 		return 0;
396 
397 	agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
398 	return xagb_bitmap_set(&rf->bmbt_blocks, agbno, 1);
399 }
400 
401 /*
402  * Iterate a metadata btree rooted in an inode to collect rmap records for
403  * anything in this fork that matches the AG.
404  */
405 STATIC int
406 xrep_rmap_scan_iroot_btree(
407 	struct xrep_rmap_ifork	*rf,
408 	struct xfs_btree_cur	*cur)
409 {
410 	struct xfs_owner_info	oinfo;
411 	struct xrep_rmap	*rr = rf->rr;
412 	int			error;
413 
414 	xagb_bitmap_init(&rf->bmbt_blocks);
415 
416 	/* Record all the blocks in the btree itself. */
417 	error = xfs_btree_visit_blocks(cur, xrep_rmap_visit_iroot_btree_block,
418 			XFS_BTREE_VISIT_ALL, rf);
419 	if (error)
420 		goto out;
421 
422 	/* Emit rmaps for the btree blocks. */
423 	xfs_rmap_ino_bmbt_owner(&oinfo, rf->accum.rm_owner, rf->whichfork);
424 	error = xrep_rmap_stash_bitmap(rr, &rf->bmbt_blocks, &oinfo);
425 	if (error)
426 		goto out;
427 
428 	/* Stash any remaining accumulated rmaps. */
429 	error = xrep_rmap_stash_accumulated(rf);
430 out:
431 	xagb_bitmap_destroy(&rf->bmbt_blocks);
432 	return error;
433 }
434 
435 static inline bool
436 is_rt_data_fork(
437 	struct xfs_inode	*ip,
438 	int			whichfork)
439 {
440 	return XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK;
441 }
442 
443 /*
444  * Iterate the block mapping btree to collect rmap records for anything in this
445  * fork that matches the AG.  Sets @mappings_done to true if we've scanned the
446  * block mappings in this fork.
447  */
448 STATIC int
449 xrep_rmap_scan_bmbt(
450 	struct xrep_rmap_ifork	*rf,
451 	struct xfs_inode	*ip,
452 	bool			*mappings_done)
453 {
454 	struct xrep_rmap	*rr = rf->rr;
455 	struct xfs_btree_cur	*cur;
456 	struct xfs_ifork	*ifp;
457 	int			error;
458 
459 	*mappings_done = false;
460 	ifp = xfs_ifork_ptr(ip, rf->whichfork);
461 	cur = xfs_bmbt_init_cursor(rr->sc->mp, rr->sc->tp, ip, rf->whichfork);
462 
463 	if (!xfs_ifork_is_realtime(ip, rf->whichfork) &&
464 	    xfs_need_iread_extents(ifp)) {
465 		/*
466 		 * If the incore extent cache isn't loaded, scan the bmbt for
467 		 * mapping records.  This avoids loading the incore extent
468 		 * tree, which will increase memory pressure at a time when
469 		 * we're trying to run as quickly as we possibly can.  Ignore
470 		 * realtime extents.
471 		 */
472 		error = xfs_bmap_query_all(cur, xrep_rmap_visit_bmbt, rf);
473 		if (error)
474 			goto out_cur;
475 
476 		*mappings_done = true;
477 	}
478 
479 	/* Scan for the bmbt blocks, which always live on the data device. */
480 	error = xrep_rmap_scan_iroot_btree(rf, cur);
481 out_cur:
482 	xfs_btree_del_cursor(cur, error);
483 	return error;
484 }
485 
486 /*
487  * Iterate the in-core extent cache to collect rmap records for anything in
488  * this fork that matches the AG.
489  */
490 STATIC int
491 xrep_rmap_scan_iext(
492 	struct xrep_rmap_ifork	*rf,
493 	struct xfs_ifork	*ifp)
494 {
495 	struct xfs_bmbt_irec	rec;
496 	struct xfs_iext_cursor	icur;
497 	int			error;
498 
499 	for_each_xfs_iext(ifp, &icur, &rec) {
500 		if (isnullstartblock(rec.br_startblock))
501 			continue;
502 		error = xrep_rmap_visit_bmbt(NULL, &rec, rf);
503 		if (error)
504 			return error;
505 	}
506 
507 	return xrep_rmap_stash_accumulated(rf);
508 }
509 
510 /* Find all the extents from a given AG in an inode fork. */
511 STATIC int
512 xrep_rmap_scan_ifork(
513 	struct xrep_rmap	*rr,
514 	struct xfs_inode	*ip,
515 	int			whichfork)
516 {
517 	struct xrep_rmap_ifork	rf = {
518 		.accum		= { .rm_owner = ip->i_ino, },
519 		.rr		= rr,
520 		.whichfork	= whichfork,
521 	};
522 	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
523 	int			error = 0;
524 
525 	if (!ifp)
526 		return 0;
527 
528 	if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
529 		bool		mappings_done;
530 
531 		/*
532 		 * Scan the bmap btree for data device mappings.  This includes
533 		 * the btree blocks themselves, even if this is a realtime
534 		 * file.
535 		 */
536 		error = xrep_rmap_scan_bmbt(&rf, ip, &mappings_done);
537 		if (error || mappings_done)
538 			return error;
539 	} else if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) {
540 		return 0;
541 	}
542 
543 	/* Scan incore extent cache if this isn't a realtime file. */
544 	if (xfs_ifork_is_realtime(ip, whichfork))
545 		return 0;
546 
547 	return xrep_rmap_scan_iext(&rf, ifp);
548 }
549 
550 /*
551  * Take ILOCK on a file that we want to scan.
552  *
553  * Select ILOCK_EXCL if the file has an unloaded data bmbt or has an unloaded
554  * attr bmbt.  Otherwise, take ILOCK_SHARED.
555  */
556 static inline unsigned int
557 xrep_rmap_scan_ilock(
558 	struct xfs_inode	*ip)
559 {
560 	uint			lock_mode = XFS_ILOCK_SHARED;
561 
562 	if (xfs_need_iread_extents(&ip->i_df)) {
563 		lock_mode = XFS_ILOCK_EXCL;
564 		goto lock;
565 	}
566 
567 	if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
568 		lock_mode = XFS_ILOCK_EXCL;
569 
570 lock:
571 	xfs_ilock(ip, lock_mode);
572 	return lock_mode;
573 }
574 
575 /* Record reverse mappings for a file. */
576 STATIC int
577 xrep_rmap_scan_inode(
578 	struct xrep_rmap	*rr,
579 	struct xfs_inode	*ip)
580 {
581 	unsigned int		lock_mode = 0;
582 	int			error;
583 
584 	/*
585 	 * Directory updates (create/link/unlink/rename) drop the directory's
586 	 * ILOCK before finishing any rmapbt updates associated with directory
587 	 * shape changes.  For this scan to coordinate correctly with the live
588 	 * update hook, we must take the only lock (i_rwsem) that is held all
589 	 * the way to dir op completion.  This will get fixed by the parent
590 	 * pointer patchset.
591 	 */
592 	if (S_ISDIR(VFS_I(ip)->i_mode)) {
593 		lock_mode = XFS_IOLOCK_SHARED;
594 		xfs_ilock(ip, lock_mode);
595 	}
596 	lock_mode |= xrep_rmap_scan_ilock(ip);
597 
598 	/* Check the data fork. */
599 	error = xrep_rmap_scan_ifork(rr, ip, XFS_DATA_FORK);
600 	if (error)
601 		goto out_unlock;
602 
603 	/* Check the attr fork. */
604 	error = xrep_rmap_scan_ifork(rr, ip, XFS_ATTR_FORK);
605 	if (error)
606 		goto out_unlock;
607 
608 	/* COW fork extents are "owned" by the refcount btree. */
609 
610 	xchk_iscan_mark_visited(&rr->iscan, ip);
611 out_unlock:
612 	xfs_iunlock(ip, lock_mode);
613 	return error;
614 }
615 
616 /* Section (I): Find all AG metadata extents except for free space metadata. */
617 
618 struct xrep_rmap_inodes {
619 	struct xrep_rmap	*rr;
620 	struct xagb_bitmap	inobt_blocks;	/* INOBIT */
621 	struct xagb_bitmap	ichunk_blocks;	/* ICHUNKBIT */
622 };
623 
624 /* Record inode btree rmaps. */
625 STATIC int
626 xrep_rmap_walk_inobt(
627 	struct xfs_btree_cur		*cur,
628 	const union xfs_btree_rec	*rec,
629 	void				*priv)
630 {
631 	struct xfs_inobt_rec_incore	irec;
632 	struct xrep_rmap_inodes		*ri = priv;
633 	struct xfs_mount		*mp = cur->bc_mp;
634 	xfs_agblock_t			agbno;
635 	xfs_extlen_t			aglen;
636 	xfs_agino_t			agino;
637 	xfs_agino_t			iperhole;
638 	unsigned int			i;
639 	int				error;
640 
641 	/* Record the inobt blocks. */
642 	error = xagb_bitmap_set_btcur_path(&ri->inobt_blocks, cur);
643 	if (error)
644 		return error;
645 
646 	xfs_inobt_btrec_to_irec(mp, rec, &irec);
647 	if (xfs_inobt_check_irec(cur->bc_ag.pag, &irec) != NULL)
648 		return -EFSCORRUPTED;
649 
650 	agino = irec.ir_startino;
651 
652 	/* Record a non-sparse inode chunk. */
653 	if (!xfs_inobt_issparse(irec.ir_holemask)) {
654 		agbno = XFS_AGINO_TO_AGBNO(mp, agino);
655 		aglen = max_t(xfs_extlen_t, 1,
656 				XFS_INODES_PER_CHUNK / mp->m_sb.sb_inopblock);
657 
658 		return xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen);
659 	}
660 
661 	/* Iterate each chunk. */
662 	iperhole = max_t(xfs_agino_t, mp->m_sb.sb_inopblock,
663 			XFS_INODES_PER_HOLEMASK_BIT);
664 	aglen = iperhole / mp->m_sb.sb_inopblock;
665 	for (i = 0, agino = irec.ir_startino;
666 	     i < XFS_INOBT_HOLEMASK_BITS;
667 	     i += iperhole / XFS_INODES_PER_HOLEMASK_BIT, agino += iperhole) {
668 		/* Skip holes. */
669 		if (irec.ir_holemask & (1 << i))
670 			continue;
671 
672 		/* Record the inode chunk otherwise. */
673 		agbno = XFS_AGINO_TO_AGBNO(mp, agino);
674 		error = xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen);
675 		if (error)
676 			return error;
677 	}
678 
679 	return 0;
680 }
681 
682 /* Collect rmaps for the blocks containing inode btrees and the inode chunks. */
683 STATIC int
684 xrep_rmap_find_inode_rmaps(
685 	struct xrep_rmap	*rr)
686 {
687 	struct xrep_rmap_inodes	ri = {
688 		.rr		= rr,
689 	};
690 	struct xfs_scrub	*sc = rr->sc;
691 	int			error;
692 
693 	xagb_bitmap_init(&ri.inobt_blocks);
694 	xagb_bitmap_init(&ri.ichunk_blocks);
695 
696 	/*
697 	 * Iterate every record in the inobt so we can capture all the inode
698 	 * chunks and the blocks in the inobt itself.
699 	 */
700 	error = xfs_btree_query_all(sc->sa.ino_cur, xrep_rmap_walk_inobt, &ri);
701 	if (error)
702 		goto out_bitmap;
703 
704 	/*
705 	 * Note that if there are zero records in the inobt then query_all does
706 	 * nothing and we have to account the empty inobt root manually.
707 	 */
708 	if (xagb_bitmap_empty(&ri.ichunk_blocks)) {
709 		struct xfs_agi	*agi = sc->sa.agi_bp->b_addr;
710 
711 		error = xagb_bitmap_set(&ri.inobt_blocks,
712 				be32_to_cpu(agi->agi_root), 1);
713 		if (error)
714 			goto out_bitmap;
715 	}
716 
717 	/* Scan the finobt too. */
718 	if (xfs_has_finobt(sc->mp)) {
719 		error = xagb_bitmap_set_btblocks(&ri.inobt_blocks,
720 				sc->sa.fino_cur);
721 		if (error)
722 			goto out_bitmap;
723 	}
724 
725 	/* Generate rmaps for everything. */
726 	error = xrep_rmap_stash_bitmap(rr, &ri.inobt_blocks,
727 			&XFS_RMAP_OINFO_INOBT);
728 	if (error)
729 		goto out_bitmap;
730 	error = xrep_rmap_stash_bitmap(rr, &ri.ichunk_blocks,
731 			&XFS_RMAP_OINFO_INODES);
732 
733 out_bitmap:
734 	xagb_bitmap_destroy(&ri.inobt_blocks);
735 	xagb_bitmap_destroy(&ri.ichunk_blocks);
736 	return error;
737 }
738 
739 /* Record a CoW staging extent. */
740 STATIC int
741 xrep_rmap_walk_cowblocks(
742 	struct xfs_btree_cur		*cur,
743 	const struct xfs_refcount_irec	*irec,
744 	void				*priv)
745 {
746 	struct xagb_bitmap		*bitmap = priv;
747 
748 	if (!xfs_refcount_check_domain(irec) ||
749 	    irec->rc_domain != XFS_REFC_DOMAIN_COW)
750 		return -EFSCORRUPTED;
751 
752 	return xagb_bitmap_set(bitmap, irec->rc_startblock, irec->rc_blockcount);
753 }
754 
755 /*
756  * Collect rmaps for the blocks containing the refcount btree, and all CoW
757  * staging extents.
758  */
759 STATIC int
760 xrep_rmap_find_refcount_rmaps(
761 	struct xrep_rmap	*rr)
762 {
763 	struct xagb_bitmap	refcountbt_blocks;	/* REFCBIT */
764 	struct xagb_bitmap	cow_blocks;		/* COWBIT */
765 	struct xfs_refcount_irec low = {
766 		.rc_startblock	= 0,
767 		.rc_domain	= XFS_REFC_DOMAIN_COW,
768 	};
769 	struct xfs_refcount_irec high = {
770 		.rc_startblock	= -1U,
771 		.rc_domain	= XFS_REFC_DOMAIN_COW,
772 	};
773 	struct xfs_scrub	*sc = rr->sc;
774 	int			error;
775 
776 	if (!xfs_has_reflink(sc->mp))
777 		return 0;
778 
779 	xagb_bitmap_init(&refcountbt_blocks);
780 	xagb_bitmap_init(&cow_blocks);
781 
782 	/* refcountbt */
783 	error = xagb_bitmap_set_btblocks(&refcountbt_blocks, sc->sa.refc_cur);
784 	if (error)
785 		goto out_bitmap;
786 
787 	/* Collect rmaps for CoW staging extents. */
788 	error = xfs_refcount_query_range(sc->sa.refc_cur, &low, &high,
789 			xrep_rmap_walk_cowblocks, &cow_blocks);
790 	if (error)
791 		goto out_bitmap;
792 
793 	/* Generate rmaps for everything. */
794 	error = xrep_rmap_stash_bitmap(rr, &cow_blocks, &XFS_RMAP_OINFO_COW);
795 	if (error)
796 		goto out_bitmap;
797 	error = xrep_rmap_stash_bitmap(rr, &refcountbt_blocks,
798 			&XFS_RMAP_OINFO_REFC);
799 
800 out_bitmap:
801 	xagb_bitmap_destroy(&cow_blocks);
802 	xagb_bitmap_destroy(&refcountbt_blocks);
803 	return error;
804 }
805 
806 /* Generate rmaps for the AG headers (AGI/AGF/AGFL) */
807 STATIC int
808 xrep_rmap_find_agheader_rmaps(
809 	struct xrep_rmap	*rr)
810 {
811 	struct xfs_scrub	*sc = rr->sc;
812 
813 	/* Create a record for the AG sb->agfl. */
814 	return xrep_rmap_stash(rr, XFS_SB_BLOCK(sc->mp),
815 			XFS_AGFL_BLOCK(sc->mp) - XFS_SB_BLOCK(sc->mp) + 1,
816 			XFS_RMAP_OWN_FS, 0, 0);
817 }
818 
819 /* Generate rmaps for the log, if it's in this AG. */
820 STATIC int
821 xrep_rmap_find_log_rmaps(
822 	struct xrep_rmap	*rr)
823 {
824 	struct xfs_scrub	*sc = rr->sc;
825 
826 	if (!xfs_ag_contains_log(sc->mp, sc->sa.pag->pag_agno))
827 		return 0;
828 
829 	return xrep_rmap_stash(rr,
830 			XFS_FSB_TO_AGBNO(sc->mp, sc->mp->m_sb.sb_logstart),
831 			sc->mp->m_sb.sb_logblocks, XFS_RMAP_OWN_LOG, 0, 0);
832 }
833 
834 /* Check and count all the records that we gathered. */
835 STATIC int
836 xrep_rmap_check_record(
837 	struct xfs_btree_cur		*cur,
838 	const struct xfs_rmap_irec	*rec,
839 	void				*priv)
840 {
841 	struct xrep_rmap		*rr = priv;
842 	int				error;
843 
844 	error = xrep_rmap_check_mapping(rr->sc, rec);
845 	if (error)
846 		return error;
847 
848 	rr->nr_records++;
849 	return 0;
850 }
851 
852 /*
853  * Generate all the reverse-mappings for this AG, a list of the old rmapbt
854  * blocks, and the new btreeblks count.  Figure out if we have enough free
855  * space to reconstruct the inode btrees.  The caller must clean up the lists
856  * if anything goes wrong.  This implements section (I) above.
857  */
858 STATIC int
859 xrep_rmap_find_rmaps(
860 	struct xrep_rmap	*rr)
861 {
862 	struct xfs_scrub	*sc = rr->sc;
863 	struct xchk_ag		*sa = &sc->sa;
864 	struct xfs_inode	*ip;
865 	struct xfs_btree_cur	*mcur;
866 	int			error;
867 
868 	/* Find all the per-AG metadata. */
869 	xrep_ag_btcur_init(sc, &sc->sa);
870 
871 	error = xrep_rmap_find_inode_rmaps(rr);
872 	if (error)
873 		goto end_agscan;
874 
875 	error = xrep_rmap_find_refcount_rmaps(rr);
876 	if (error)
877 		goto end_agscan;
878 
879 	error = xrep_rmap_find_agheader_rmaps(rr);
880 	if (error)
881 		goto end_agscan;
882 
883 	error = xrep_rmap_find_log_rmaps(rr);
884 end_agscan:
885 	xchk_ag_btcur_free(&sc->sa);
886 	if (error)
887 		return error;
888 
889 	/*
890 	 * Set up for a potentially lengthy filesystem scan by reducing our
891 	 * transaction resource usage for the duration.  Specifically:
892 	 *
893 	 * Unlock the AG header buffers and cancel the transaction to release
894 	 * the log grant space while we scan the filesystem.
895 	 *
896 	 * Create a new empty transaction to eliminate the possibility of the
897 	 * inode scan deadlocking on cyclical metadata.
898 	 *
899 	 * We pass the empty transaction to the file scanning function to avoid
900 	 * repeatedly cycling empty transactions.  This can be done even though
901 	 * we take the IOLOCK to quiesce the file because empty transactions
902 	 * do not take sb_internal.
903 	 */
904 	sa->agf_bp = NULL;
905 	sa->agi_bp = NULL;
906 	xchk_trans_cancel(sc);
907 	error = xchk_trans_alloc_empty(sc);
908 	if (error)
909 		return error;
910 
911 	/* Iterate all AGs for inodes rmaps. */
912 	while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) {
913 		error = xrep_rmap_scan_inode(rr, ip);
914 		xchk_irele(sc, ip);
915 		if (error)
916 			break;
917 
918 		if (xchk_should_terminate(sc, &error))
919 			break;
920 	}
921 	xchk_iscan_iter_finish(&rr->iscan);
922 	if (error)
923 		return error;
924 
925 	/*
926 	 * Switch out for a real transaction and lock the AG headers in
927 	 * preparation for building a new tree.
928 	 */
929 	xchk_trans_cancel(sc);
930 	error = xchk_setup_fs(sc);
931 	if (error)
932 		return error;
933 	error = xchk_perag_drain_and_lock(sc);
934 	if (error)
935 		return error;
936 
937 	/*
938 	 * If a hook failed to update the in-memory btree, we lack the data to
939 	 * continue the repair.
940 	 */
941 	if (xchk_iscan_aborted(&rr->iscan))
942 		return -EFSCORRUPTED;
943 
944 	/*
945 	 * Now that we have everything locked again, we need to count the
946 	 * number of rmap records stashed in the btree.  This should reflect
947 	 * all actively-owned space in the filesystem.  At the same time, check
948 	 * all our records before we start building a new btree, which requires
949 	 * a bnobt cursor.
950 	 */
951 	mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree);
952 	sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
953 			sc->sa.pag);
954 
955 	rr->nr_records = 0;
956 	error = xfs_rmap_query_all(mcur, xrep_rmap_check_record, rr);
957 
958 	xfs_btree_del_cursor(sc->sa.bno_cur, error);
959 	sc->sa.bno_cur = NULL;
960 	xfs_btree_del_cursor(mcur, error);
961 
962 	return error;
963 }
964 
965 /* Section (II): Reserving space for new rmapbt and setting free space bitmap */
966 
967 struct xrep_rmap_agfl {
968 	struct xagb_bitmap	*bitmap;
969 	xfs_agnumber_t		agno;
970 };
971 
972 /* Add an AGFL block to the rmap list. */
973 STATIC int
974 xrep_rmap_walk_agfl(
975 	struct xfs_mount	*mp,
976 	xfs_agblock_t		agbno,
977 	void			*priv)
978 {
979 	struct xrep_rmap_agfl	*ra = priv;
980 
981 	return xagb_bitmap_set(ra->bitmap, agbno, 1);
982 }
983 
984 /*
985  * Run one round of reserving space for the new rmapbt and recomputing the
986  * number of blocks needed to store the previously observed rmapbt records and
987  * the ones we'll create for the free space metadata.  When we don't need more
988  * blocks, return a bitmap of OWN_AG extents in @freesp_blocks and set @done to
989  * true.
990  */
991 STATIC int
992 xrep_rmap_try_reserve(
993 	struct xrep_rmap	*rr,
994 	struct xfs_btree_cur	*rmap_cur,
995 	struct xagb_bitmap	*freesp_blocks,
996 	uint64_t		*blocks_reserved,
997 	bool			*done)
998 {
999 	struct xrep_rmap_agfl	ra = {
1000 		.bitmap		= freesp_blocks,
1001 		.agno		= rr->sc->sa.pag->pag_agno,
1002 	};
1003 	struct xfs_scrub	*sc = rr->sc;
1004 	struct xrep_newbt_resv	*resv, *n;
1005 	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
1006 	struct xfs_buf		*agfl_bp;
1007 	uint64_t		nr_blocks;	/* RMB */
1008 	uint64_t		freesp_records;
1009 	int			error;
1010 
1011 	/*
1012 	 * We're going to recompute new_btree.bload.nr_blocks at the end of
1013 	 * this function to reflect however many btree blocks we need to store
1014 	 * all the rmap records (including the ones that reflect the changes we
1015 	 * made to support the new rmapbt blocks), so we save the old value
1016 	 * here so we can decide if we've reserved enough blocks.
1017 	 */
1018 	nr_blocks = rr->new_btree.bload.nr_blocks;
1019 
1020 	/*
1021 	 * Make sure we've reserved enough space for the new btree.  This can
1022 	 * change the shape of the free space btrees, which can cause secondary
1023 	 * interactions with the rmap records because all three space btrees
1024 	 * have the same rmap owner.  We'll account for all that below.
1025 	 */
1026 	error = xrep_newbt_alloc_blocks(&rr->new_btree,
1027 			nr_blocks - *blocks_reserved);
1028 	if (error)
1029 		return error;
1030 
1031 	*blocks_reserved = rr->new_btree.bload.nr_blocks;
1032 
1033 	/* Clear everything in the bitmap. */
1034 	xagb_bitmap_destroy(freesp_blocks);
1035 
1036 	/* Set all the bnobt blocks in the bitmap. */
1037 	sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
1038 			sc->sa.pag);
1039 	error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.bno_cur);
1040 	xfs_btree_del_cursor(sc->sa.bno_cur, error);
1041 	sc->sa.bno_cur = NULL;
1042 	if (error)
1043 		return error;
1044 
1045 	/* Set all the cntbt blocks in the bitmap. */
1046 	sc->sa.cnt_cur = xfs_cntbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
1047 			sc->sa.pag);
1048 	error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.cnt_cur);
1049 	xfs_btree_del_cursor(sc->sa.cnt_cur, error);
1050 	sc->sa.cnt_cur = NULL;
1051 	if (error)
1052 		return error;
1053 
1054 	/* Record our new btreeblks value. */
1055 	rr->freesp_btblocks = xagb_bitmap_hweight(freesp_blocks) - 2;
1056 
1057 	/* Set all the new rmapbt blocks in the bitmap. */
1058 	list_for_each_entry_safe(resv, n, &rr->new_btree.resv_list, list) {
1059 		error = xagb_bitmap_set(freesp_blocks, resv->agbno, resv->len);
1060 		if (error)
1061 			return error;
1062 	}
1063 
1064 	/* Set all the AGFL blocks in the bitmap. */
1065 	error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
1066 	if (error)
1067 		return error;
1068 
1069 	error = xfs_agfl_walk(sc->mp, agf, agfl_bp, xrep_rmap_walk_agfl, &ra);
1070 	if (error)
1071 		return error;
1072 
1073 	/* Count the extents in the bitmap. */
1074 	freesp_records = xagb_bitmap_count_set_regions(freesp_blocks);
1075 
1076 	/* Compute how many blocks we'll need for all the rmaps. */
1077 	error = xfs_btree_bload_compute_geometry(rmap_cur,
1078 			&rr->new_btree.bload, rr->nr_records + freesp_records);
1079 	if (error)
1080 		return error;
1081 
1082 	/* We're done when we don't need more blocks. */
1083 	*done = nr_blocks >= rr->new_btree.bload.nr_blocks;
1084 	return 0;
1085 }
1086 
1087 /*
1088  * Iteratively reserve space for rmap btree while recording OWN_AG rmaps for
1089  * the free space metadata.  This implements section (II) above.
1090  */
1091 STATIC int
1092 xrep_rmap_reserve_space(
1093 	struct xrep_rmap	*rr,
1094 	struct xfs_btree_cur	*rmap_cur)
1095 {
1096 	struct xagb_bitmap	freesp_blocks;	/* AGBIT */
1097 	uint64_t		blocks_reserved = 0;
1098 	bool			done = false;
1099 	int			error;
1100 
1101 	/* Compute how many blocks we'll need for the rmaps collected so far. */
1102 	error = xfs_btree_bload_compute_geometry(rmap_cur,
1103 			&rr->new_btree.bload, rr->nr_records);
1104 	if (error)
1105 		return error;
1106 
1107 	/* Last chance to abort before we start committing fixes. */
1108 	if (xchk_should_terminate(rr->sc, &error))
1109 		return error;
1110 
1111 	xagb_bitmap_init(&freesp_blocks);
1112 
1113 	/*
1114 	 * Iteratively reserve space for the new rmapbt and recompute the
1115 	 * number of blocks needed to store the previously observed rmapbt
1116 	 * records and the ones we'll create for the free space metadata.
1117 	 * Finish when we don't need more blocks.
1118 	 */
1119 	do {
1120 		error = xrep_rmap_try_reserve(rr, rmap_cur, &freesp_blocks,
1121 				&blocks_reserved, &done);
1122 		if (error)
1123 			goto out_bitmap;
1124 	} while (!done);
1125 
1126 	/* Emit rmaps for everything in the free space bitmap. */
1127 	xrep_ag_btcur_init(rr->sc, &rr->sc->sa);
1128 	error = xrep_rmap_stash_bitmap(rr, &freesp_blocks, &XFS_RMAP_OINFO_AG);
1129 	xchk_ag_btcur_free(&rr->sc->sa);
1130 
1131 out_bitmap:
1132 	xagb_bitmap_destroy(&freesp_blocks);
1133 	return error;
1134 }
1135 
1136 /* Section (III): Building the new rmap btree. */
1137 
1138 /* Update the AGF counters. */
1139 STATIC int
1140 xrep_rmap_reset_counters(
1141 	struct xrep_rmap	*rr)
1142 {
1143 	struct xfs_scrub	*sc = rr->sc;
1144 	struct xfs_perag	*pag = sc->sa.pag;
1145 	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
1146 	xfs_agblock_t		rmap_btblocks;
1147 
1148 	/*
1149 	 * The AGF header contains extra information related to the reverse
1150 	 * mapping btree, so we must update those fields here.
1151 	 */
1152 	rmap_btblocks = rr->new_btree.afake.af_blocks - 1;
1153 	agf->agf_btreeblks = cpu_to_be32(rr->freesp_btblocks + rmap_btblocks);
1154 	xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS);
1155 
1156 	/*
1157 	 * After we commit the new btree to disk, it is possible that the
1158 	 * process to reap the old btree blocks will race with the AIL trying
1159 	 * to checkpoint the old btree blocks into the filesystem.  If the new
1160 	 * tree is shorter than the old one, the rmapbt write verifier will
1161 	 * fail and the AIL will shut down the filesystem.
1162 	 *
1163 	 * To avoid this, save the old incore btree height values as the alt
1164 	 * height values before re-initializing the perag info from the updated
1165 	 * AGF to capture all the new values.
1166 	 */
1167 	pag->pagf_repair_rmap_level = pag->pagf_rmap_level;
1168 
1169 	/* Reinitialize with the values we just logged. */
1170 	return xrep_reinit_pagf(sc);
1171 }
1172 
1173 /* Retrieve rmapbt data for bulk load. */
1174 STATIC int
1175 xrep_rmap_get_records(
1176 	struct xfs_btree_cur	*cur,
1177 	unsigned int		idx,
1178 	struct xfs_btree_block	*block,
1179 	unsigned int		nr_wanted,
1180 	void			*priv)
1181 {
1182 	struct xrep_rmap	*rr = priv;
1183 	union xfs_btree_rec	*block_rec;
1184 	unsigned int		loaded;
1185 	int			error;
1186 
1187 	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
1188 		int		stat = 0;
1189 
1190 		error = xfs_btree_increment(rr->mcur, 0, &stat);
1191 		if (error)
1192 			return error;
1193 		if (!stat)
1194 			return -EFSCORRUPTED;
1195 
1196 		error = xfs_rmap_get_rec(rr->mcur, &cur->bc_rec.r, &stat);
1197 		if (error)
1198 			return error;
1199 		if (!stat)
1200 			return -EFSCORRUPTED;
1201 
1202 		block_rec = xfs_btree_rec_addr(cur, idx, block);
1203 		cur->bc_ops->init_rec_from_cur(cur, block_rec);
1204 	}
1205 
1206 	return loaded;
1207 }
1208 
1209 /* Feed one of the new btree blocks to the bulk loader. */
1210 STATIC int
1211 xrep_rmap_claim_block(
1212 	struct xfs_btree_cur	*cur,
1213 	union xfs_btree_ptr	*ptr,
1214 	void			*priv)
1215 {
1216 	struct xrep_rmap        *rr = priv;
1217 
1218 	return xrep_newbt_claim_block(cur, &rr->new_btree, ptr);
1219 }
1220 
1221 /* Custom allocation function for new rmap btrees. */
1222 STATIC int
1223 xrep_rmap_alloc_vextent(
1224 	struct xfs_scrub	*sc,
1225 	struct xfs_alloc_arg	*args,
1226 	xfs_fsblock_t		alloc_hint)
1227 {
1228 	int			error;
1229 
1230 	/*
1231 	 * We don't want an rmap update on the allocation, since we iteratively
1232 	 * compute the OWN_AG records /after/ allocating blocks for the records
1233 	 * that we already know we need to store.  Therefore, fix the freelist
1234 	 * with the NORMAP flag set so that we don't also try to create an rmap
1235 	 * for new AGFL blocks.
1236 	 */
1237 	error = xrep_fix_freelist(sc, XFS_ALLOC_FLAG_NORMAP);
1238 	if (error)
1239 		return error;
1240 
1241 	/*
1242 	 * If xrep_fix_freelist fixed the freelist by moving blocks from the
1243 	 * free space btrees or by removing blocks from the AGFL and queueing
1244 	 * an EFI to free the block, the transaction will be dirty.  This
1245 	 * second case is of interest to us.
1246 	 *
1247 	 * Later on, we will need to compare gaps in the new recordset against
1248 	 * the block usage of all OWN_AG owners in order to free the old
1249 	 * btree's blocks, which means that we can't have EFIs for former AGFL
1250 	 * blocks attached to the repair transaction when we commit the new
1251 	 * btree.
1252 	 *
1253 	 * xrep_newbt_alloc_blocks guarantees this for us by calling
1254 	 * xrep_defer_finish to commit anything that fix_freelist may have
1255 	 * added to the transaction.
1256 	 */
1257 	return xfs_alloc_vextent_near_bno(args, alloc_hint);
1258 }
1259 
1260 
1261 /* Count the records in this btree. */
1262 STATIC int
1263 xrep_rmap_count_records(
1264 	struct xfs_btree_cur	*cur,
1265 	unsigned long long	*nr)
1266 {
1267 	int			running = 1;
1268 	int			error;
1269 
1270 	*nr = 0;
1271 
1272 	error = xfs_btree_goto_left_edge(cur);
1273 	if (error)
1274 		return error;
1275 
1276 	while (running && !(error = xfs_btree_increment(cur, 0, &running))) {
1277 		if (running)
1278 			(*nr)++;
1279 	}
1280 
1281 	return error;
1282 }
1283 /*
1284  * Use the collected rmap information to stage a new rmap btree.  If this is
1285  * successful we'll return with the new btree root information logged to the
1286  * repair transaction but not yet committed.  This implements section (III)
1287  * above.
1288  */
1289 STATIC int
1290 xrep_rmap_build_new_tree(
1291 	struct xrep_rmap	*rr)
1292 {
1293 	struct xfs_scrub	*sc = rr->sc;
1294 	struct xfs_perag	*pag = sc->sa.pag;
1295 	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
1296 	struct xfs_btree_cur	*rmap_cur;
1297 	xfs_fsblock_t		fsbno;
1298 	int			error;
1299 
1300 	/*
1301 	 * Preserve the old rmapbt block count so that we can adjust the
1302 	 * per-AG rmapbt reservation after we commit the new btree root and
1303 	 * want to dispose of the old btree blocks.
1304 	 */
1305 	rr->old_rmapbt_fsbcount = be32_to_cpu(agf->agf_rmap_blocks);
1306 
1307 	/*
1308 	 * Prepare to construct the new btree by reserving disk space for the
1309 	 * new btree and setting up all the accounting information we'll need
1310 	 * to root the new btree while it's under construction and before we
1311 	 * attach it to the AG header.  The new blocks are accounted to the
1312 	 * rmapbt per-AG reservation, which we will adjust further after
1313 	 * committing the new btree.
1314 	 */
1315 	fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, XFS_RMAP_BLOCK(sc->mp));
1316 	xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_SKIP_UPDATE,
1317 			fsbno, XFS_AG_RESV_RMAPBT);
1318 	rr->new_btree.bload.get_records = xrep_rmap_get_records;
1319 	rr->new_btree.bload.claim_block = xrep_rmap_claim_block;
1320 	rr->new_btree.alloc_vextent = xrep_rmap_alloc_vextent;
1321 	rmap_cur = xfs_rmapbt_init_cursor(sc->mp, NULL, NULL, pag);
1322 	xfs_btree_stage_afakeroot(rmap_cur, &rr->new_btree.afake);
1323 
1324 	/*
1325 	 * Initialize @rr->new_btree, reserve space for the new rmapbt,
1326 	 * and compute OWN_AG rmaps.
1327 	 */
1328 	error = xrep_rmap_reserve_space(rr, rmap_cur);
1329 	if (error)
1330 		goto err_cur;
1331 
1332 	/*
1333 	 * Count the rmapbt records again, because the space reservation
1334 	 * for the rmapbt itself probably added more records to the btree.
1335 	 */
1336 	rr->mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL,
1337 			&rr->rmap_btree);
1338 
1339 	error = xrep_rmap_count_records(rr->mcur, &rr->nr_records);
1340 	if (error)
1341 		goto err_mcur;
1342 
1343 	/*
1344 	 * Due to btree slack factors, it's possible for a new btree to be one
1345 	 * level taller than the old btree.  Update the incore btree height so
1346 	 * that we don't trip the verifiers when writing the new btree blocks
1347 	 * to disk.
1348 	 */
1349 	pag->pagf_repair_rmap_level = rr->new_btree.bload.btree_height;
1350 
1351 	/*
1352 	 * Move the cursor to the left edge of the tree so that the first
1353 	 * increment in ->get_records positions us at the first record.
1354 	 */
1355 	error = xfs_btree_goto_left_edge(rr->mcur);
1356 	if (error)
1357 		goto err_level;
1358 
1359 	/* Add all observed rmap records. */
1360 	error = xfs_btree_bload(rmap_cur, &rr->new_btree.bload, rr);
1361 	if (error)
1362 		goto err_level;
1363 
1364 	/*
1365 	 * Install the new btree in the AG header.  After this point the old
1366 	 * btree is no longer accessible and the new tree is live.
1367 	 */
1368 	xfs_rmapbt_commit_staged_btree(rmap_cur, sc->tp, sc->sa.agf_bp);
1369 	xfs_btree_del_cursor(rmap_cur, 0);
1370 	xfs_btree_del_cursor(rr->mcur, 0);
1371 	rr->mcur = NULL;
1372 
1373 	/*
1374 	 * Now that we've written the new btree to disk, we don't need to keep
1375 	 * updating the in-memory btree.  Abort the scan to stop live updates.
1376 	 */
1377 	xchk_iscan_abort(&rr->iscan);
1378 
1379 	/*
1380 	 * The newly committed rmap recordset includes mappings for the blocks
1381 	 * that we reserved to build the new btree.  If there is excess space
1382 	 * reservation to be freed, the corresponding rmap records must also be
1383 	 * removed.
1384 	 */
1385 	rr->new_btree.oinfo = XFS_RMAP_OINFO_AG;
1386 
1387 	/* Reset the AGF counters now that we've changed the btree shape. */
1388 	error = xrep_rmap_reset_counters(rr);
1389 	if (error)
1390 		goto err_newbt;
1391 
1392 	/* Dispose of any unused blocks and the accounting information. */
1393 	error = xrep_newbt_commit(&rr->new_btree);
1394 	if (error)
1395 		return error;
1396 
1397 	return xrep_roll_ag_trans(sc);
1398 
1399 err_level:
1400 	pag->pagf_repair_rmap_level = 0;
1401 err_mcur:
1402 	xfs_btree_del_cursor(rr->mcur, error);
1403 err_cur:
1404 	xfs_btree_del_cursor(rmap_cur, error);
1405 err_newbt:
1406 	xrep_newbt_cancel(&rr->new_btree);
1407 	return error;
1408 }
1409 
1410 /* Section (IV): Reaping the old btree. */
1411 
1412 struct xrep_rmap_find_gaps {
1413 	struct xagb_bitmap	rmap_gaps;
1414 	xfs_agblock_t		next_agbno;
1415 };
1416 
1417 /* Subtract each free extent in the bnobt from the rmap gaps. */
1418 STATIC int
1419 xrep_rmap_find_freesp(
1420 	struct xfs_btree_cur		*cur,
1421 	const struct xfs_alloc_rec_incore *rec,
1422 	void				*priv)
1423 {
1424 	struct xrep_rmap_find_gaps	*rfg = priv;
1425 
1426 	return xagb_bitmap_clear(&rfg->rmap_gaps, rec->ar_startblock,
1427 			rec->ar_blockcount);
1428 }
1429 
1430 /* Record the free space we find, as part of cleaning out the btree. */
1431 STATIC int
1432 xrep_rmap_find_gaps(
1433 	struct xfs_btree_cur		*cur,
1434 	const struct xfs_rmap_irec	*rec,
1435 	void				*priv)
1436 {
1437 	struct xrep_rmap_find_gaps	*rfg = priv;
1438 	int				error;
1439 
1440 	if (rec->rm_startblock > rfg->next_agbno) {
1441 		error = xagb_bitmap_set(&rfg->rmap_gaps, rfg->next_agbno,
1442 				rec->rm_startblock - rfg->next_agbno);
1443 		if (error)
1444 			return error;
1445 	}
1446 
1447 	rfg->next_agbno = max_t(xfs_agblock_t, rfg->next_agbno,
1448 				rec->rm_startblock + rec->rm_blockcount);
1449 	return 0;
1450 }
1451 
1452 /*
1453  * Reap the old rmapbt blocks.  Now that the rmapbt is fully rebuilt, we make
1454  * a list of gaps in the rmap records and a list of the extents mentioned in
1455  * the bnobt.  Any block that's in the new rmapbt gap list but not mentioned
1456  * in the bnobt is a block from the old rmapbt and can be removed.
1457  */
1458 STATIC int
1459 xrep_rmap_remove_old_tree(
1460 	struct xrep_rmap	*rr)
1461 {
1462 	struct xrep_rmap_find_gaps rfg = {
1463 		.next_agbno	= 0,
1464 	};
1465 	struct xfs_scrub	*sc = rr->sc;
1466 	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
1467 	struct xfs_perag	*pag = sc->sa.pag;
1468 	struct xfs_btree_cur	*mcur;
1469 	xfs_agblock_t		agend;
1470 	int			error;
1471 
1472 	xagb_bitmap_init(&rfg.rmap_gaps);
1473 
1474 	/* Compute free space from the new rmapbt. */
1475 	mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree);
1476 
1477 	error = xfs_rmap_query_all(mcur, xrep_rmap_find_gaps, &rfg);
1478 	xfs_btree_del_cursor(mcur, error);
1479 	if (error)
1480 		goto out_bitmap;
1481 
1482 	/* Insert a record for space between the last rmap and EOAG. */
1483 	agend = be32_to_cpu(agf->agf_length);
1484 	if (rfg.next_agbno < agend) {
1485 		error = xagb_bitmap_set(&rfg.rmap_gaps, rfg.next_agbno,
1486 				agend - rfg.next_agbno);
1487 		if (error)
1488 			goto out_bitmap;
1489 	}
1490 
1491 	/* Compute free space from the existing bnobt. */
1492 	sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
1493 			sc->sa.pag);
1494 	error = xfs_alloc_query_all(sc->sa.bno_cur, xrep_rmap_find_freesp,
1495 			&rfg);
1496 	xfs_btree_del_cursor(sc->sa.bno_cur, error);
1497 	sc->sa.bno_cur = NULL;
1498 	if (error)
1499 		goto out_bitmap;
1500 
1501 	/*
1502 	 * Free the "free" blocks that the new rmapbt knows about but the bnobt
1503 	 * doesn't--these are the old rmapbt blocks.  Credit the old rmapbt
1504 	 * block usage count back to the per-AG rmapbt reservation (and not
1505 	 * fdblocks, since the rmap btree lives in free space) to keep the
1506 	 * reservation and free space accounting correct.
1507 	 */
1508 	error = xrep_reap_agblocks(sc, &rfg.rmap_gaps,
1509 			&XFS_RMAP_OINFO_ANY_OWNER, XFS_AG_RESV_RMAPBT);
1510 	if (error)
1511 		goto out_bitmap;
1512 
1513 	/*
1514 	 * Now that we've zapped all the old rmapbt blocks we can turn off
1515 	 * the alternate height mechanism and reset the per-AG space
1516 	 * reservation.
1517 	 */
1518 	pag->pagf_repair_rmap_level = 0;
1519 	sc->flags |= XREP_RESET_PERAG_RESV;
1520 out_bitmap:
1521 	xagb_bitmap_destroy(&rfg.rmap_gaps);
1522 	return error;
1523 }
1524 
1525 static inline bool
1526 xrep_rmapbt_want_live_update(
1527 	struct xchk_iscan		*iscan,
1528 	const struct xfs_owner_info	*oi)
1529 {
1530 	if (xchk_iscan_aborted(iscan))
1531 		return false;
1532 
1533 	/*
1534 	 * Before unlocking the AG header to perform the inode scan, we
1535 	 * recorded reverse mappings for all AG metadata except for the OWN_AG
1536 	 * metadata.  IOWs, the in-memory btree knows about the AG headers, the
1537 	 * two inode btrees, the CoW staging extents, and the refcount btrees.
1538 	 * For these types of metadata, we need to record the live updates in
1539 	 * the in-memory rmap btree.
1540 	 *
1541 	 * However, we do not scan the free space btrees or the AGFL until we
1542 	 * have re-locked the AGF and are ready to reserve space for the new
1543 	 * rmap btree, so we do not want live updates for OWN_AG metadata.
1544 	 */
1545 	if (XFS_RMAP_NON_INODE_OWNER(oi->oi_owner))
1546 		return oi->oi_owner != XFS_RMAP_OWN_AG;
1547 
1548 	/* Ignore updates to files that the scanner hasn't visited yet. */
1549 	return xchk_iscan_want_live_update(iscan, oi->oi_owner);
1550 }
1551 
1552 /*
1553  * Apply a rmapbt update from the regular filesystem into our shadow btree.
1554  * We're running from the thread that owns the AGF buffer and is generating
1555  * the update, so we must be careful about which parts of the struct xrep_rmap
1556  * that we change.
1557  */
1558 static int
1559 xrep_rmapbt_live_update(
1560 	struct notifier_block		*nb,
1561 	unsigned long			action,
1562 	void				*data)
1563 {
1564 	struct xfs_rmap_update_params	*p = data;
1565 	struct xrep_rmap		*rr;
1566 	struct xfs_mount		*mp;
1567 	struct xfs_btree_cur		*mcur;
1568 	struct xfs_trans		*tp;
1569 	void				*txcookie;
1570 	int				error;
1571 
1572 	rr = container_of(nb, struct xrep_rmap, rhook.rmap_hook.nb);
1573 	mp = rr->sc->mp;
1574 
1575 	if (!xrep_rmapbt_want_live_update(&rr->iscan, &p->oinfo))
1576 		goto out_unlock;
1577 
1578 	trace_xrep_rmap_live_update(mp, rr->sc->sa.pag->pag_agno, action, p);
1579 
1580 	error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp);
1581 	if (error)
1582 		goto out_abort;
1583 
1584 	mutex_lock(&rr->lock);
1585 	mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, tp, &rr->rmap_btree);
1586 	error = __xfs_rmap_finish_intent(mcur, action, p->startblock,
1587 			p->blockcount, &p->oinfo, p->unwritten);
1588 	xfs_btree_del_cursor(mcur, error);
1589 	if (error)
1590 		goto out_cancel;
1591 
1592 	error = xfbtree_trans_commit(&rr->rmap_btree, tp);
1593 	if (error)
1594 		goto out_cancel;
1595 
1596 	xrep_trans_cancel_hook_dummy(&txcookie, tp);
1597 	mutex_unlock(&rr->lock);
1598 	return NOTIFY_DONE;
1599 
1600 out_cancel:
1601 	xfbtree_trans_cancel(&rr->rmap_btree, tp);
1602 	xrep_trans_cancel_hook_dummy(&txcookie, tp);
1603 out_abort:
1604 	mutex_unlock(&rr->lock);
1605 	xchk_iscan_abort(&rr->iscan);
1606 out_unlock:
1607 	return NOTIFY_DONE;
1608 }
1609 
1610 /* Set up the filesystem scan components. */
1611 STATIC int
1612 xrep_rmap_setup_scan(
1613 	struct xrep_rmap	*rr)
1614 {
1615 	struct xfs_scrub	*sc = rr->sc;
1616 	int			error;
1617 
1618 	mutex_init(&rr->lock);
1619 
1620 	/* Set up in-memory rmap btree */
1621 	error = xfs_rmapbt_mem_init(sc->mp, &rr->rmap_btree, sc->xmbtp,
1622 			sc->sa.pag->pag_agno);
1623 	if (error)
1624 		goto out_mutex;
1625 
1626 	/* Retry iget every tenth of a second for up to 30 seconds. */
1627 	xchk_iscan_start(sc, 30000, 100, &rr->iscan);
1628 
1629 	/*
1630 	 * Hook into live rmap operations so that we can update our in-memory
1631 	 * btree to reflect live changes on the filesystem.  Since we drop the
1632 	 * AGF buffer to scan all the inodes, we need this piece to avoid
1633 	 * installing a stale btree.
1634 	 */
1635 	ASSERT(sc->flags & XCHK_FSGATES_RMAP);
1636 	xfs_rmap_hook_setup(&rr->rhook, xrep_rmapbt_live_update);
1637 	error = xfs_rmap_hook_add(sc->sa.pag, &rr->rhook);
1638 	if (error)
1639 		goto out_iscan;
1640 	return 0;
1641 
1642 out_iscan:
1643 	xchk_iscan_teardown(&rr->iscan);
1644 	xfbtree_destroy(&rr->rmap_btree);
1645 out_mutex:
1646 	mutex_destroy(&rr->lock);
1647 	return error;
1648 }
1649 
1650 /* Tear down scan components. */
1651 STATIC void
1652 xrep_rmap_teardown(
1653 	struct xrep_rmap	*rr)
1654 {
1655 	struct xfs_scrub	*sc = rr->sc;
1656 
1657 	xchk_iscan_abort(&rr->iscan);
1658 	xfs_rmap_hook_del(sc->sa.pag, &rr->rhook);
1659 	xchk_iscan_teardown(&rr->iscan);
1660 	xfbtree_destroy(&rr->rmap_btree);
1661 	mutex_destroy(&rr->lock);
1662 }
1663 
1664 /* Repair the rmap btree for some AG. */
1665 int
1666 xrep_rmapbt(
1667 	struct xfs_scrub	*sc)
1668 {
1669 	struct xrep_rmap	*rr = sc->buf;
1670 	int			error;
1671 
1672 	error = xrep_rmap_setup_scan(rr);
1673 	if (error)
1674 		return error;
1675 
1676 	/*
1677 	 * Collect rmaps for everything in this AG that isn't space metadata.
1678 	 * These rmaps won't change even as we try to allocate blocks.
1679 	 */
1680 	error = xrep_rmap_find_rmaps(rr);
1681 	if (error)
1682 		goto out_records;
1683 
1684 	/* Rebuild the rmap information. */
1685 	error = xrep_rmap_build_new_tree(rr);
1686 	if (error)
1687 		goto out_records;
1688 
1689 	/* Kill the old tree. */
1690 	error = xrep_rmap_remove_old_tree(rr);
1691 	if (error)
1692 		goto out_records;
1693 
1694 out_records:
1695 	xrep_rmap_teardown(rr);
1696 	return error;
1697 }
1698