xref: /linux/fs/xfs/scrub/reap.c (revision e445fba2d76369d72b497ecadf6b9787930693d9)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (C) 2022-2023 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_btree.h"
13 #include "xfs_log_format.h"
14 #include "xfs_trans.h"
15 #include "xfs_sb.h"
16 #include "xfs_inode.h"
17 #include "xfs_alloc.h"
18 #include "xfs_alloc_btree.h"
19 #include "xfs_ialloc.h"
20 #include "xfs_ialloc_btree.h"
21 #include "xfs_rmap.h"
22 #include "xfs_rmap_btree.h"
23 #include "xfs_refcount.h"
24 #include "xfs_refcount_btree.h"
25 #include "xfs_extent_busy.h"
26 #include "xfs_ag.h"
27 #include "xfs_ag_resv.h"
28 #include "xfs_quota.h"
29 #include "xfs_qm.h"
30 #include "xfs_bmap.h"
31 #include "xfs_da_format.h"
32 #include "xfs_da_btree.h"
33 #include "xfs_attr.h"
34 #include "xfs_attr_remote.h"
35 #include "xfs_defer.h"
36 #include "xfs_metafile.h"
37 #include "xfs_rtgroup.h"
38 #include "xfs_rtrmap_btree.h"
39 #include "xfs_extfree_item.h"
40 #include "xfs_rmap_item.h"
41 #include "xfs_refcount_item.h"
42 #include "xfs_buf_item.h"
43 #include "xfs_bmap_item.h"
44 #include "xfs_bmap_btree.h"
45 #include "scrub/scrub.h"
46 #include "scrub/common.h"
47 #include "scrub/trace.h"
48 #include "scrub/repair.h"
49 #include "scrub/bitmap.h"
50 #include "scrub/agb_bitmap.h"
51 #include "scrub/fsb_bitmap.h"
52 #include "scrub/rtb_bitmap.h"
53 #include "scrub/reap.h"
54 
55 /*
56  * Disposal of Blocks from Old Metadata
57  *
58  * Now that we've constructed a new btree to replace the damaged one, we want
59  * to dispose of the blocks that (we think) the old btree was using.
60  * Previously, we used the rmapbt to collect the extents (bitmap) with the
61  * rmap owner corresponding to the tree we rebuilt, collected extents for any
62  * blocks with the same rmap owner that are owned by another data structure
63  * (sublist), and subtracted sublist from bitmap.  In theory the extents
64  * remaining in bitmap are the old btree's blocks.
65  *
66  * Unfortunately, it's possible that the btree was crosslinked with other
67  * blocks on disk.  The rmap data can tell us if there are multiple owners, so
68  * if the rmapbt says there is an owner of this block other than @oinfo, then
69  * the block is crosslinked.  Remove the reverse mapping and continue.
70  *
71  * If there is one rmap record, we can free the block, which removes the
72  * reverse mapping but doesn't add the block to the free space.  Our repair
73  * strategy is to hope the other metadata objects crosslinked on this block
74  * will be rebuilt (atop different blocks), thereby removing all the cross
75  * links.
76  *
77  * If there are no rmap records at all, we also free the block.  If the btree
78  * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't
79  * supposed to be a rmap record and everything is ok.  For other btrees there
80  * had to have been an rmap entry for the block to have ended up on @bitmap,
81  * so if it's gone now there's something wrong and the fs will shut down.
82  *
83  * Note: If there are multiple rmap records with only the same rmap owner as
84  * the btree we're trying to rebuild and the block is indeed owned by another
85  * data structure with the same rmap owner, then the block will be in sublist
86  * and therefore doesn't need disposal.  If there are multiple rmap records
87  * with only the same rmap owner but the block is not owned by something with
88  * the same rmap owner, the block will be freed.
89  *
90  * The caller is responsible for locking the AG headers/inode for the entire
91  * rebuild operation so that nothing else can sneak in and change the incore
92  * state while we're not looking.  We must also invalidate any buffers
93  * associated with @bitmap.
94  */
95 
96 /* Information about reaping extents after a repair. */
97 struct xreap_state {
98 	struct xfs_scrub		*sc;
99 
100 	union {
101 		struct {
102 			/*
103 			 * For AG blocks, this is reverse mapping owner and
104 			 * metadata reservation type.
105 			 */
106 			const struct xfs_owner_info	*oinfo;
107 			enum xfs_ag_resv_type		resv;
108 		};
109 		struct {
110 			/* For file blocks, this is the inode and fork. */
111 			struct xfs_inode		*ip;
112 			int				whichfork;
113 		};
114 	};
115 
116 	/* Number of invalidated buffers logged to the current transaction. */
117 	unsigned int			nr_binval;
118 
119 	/* Maximum number of buffers we can invalidate in a single tx. */
120 	unsigned int			max_binval;
121 
122 	/* Number of deferred reaps attached to the current transaction. */
123 	unsigned int			nr_deferred;
124 
125 	/* Maximum number of intents we can reap in a single transaction. */
126 	unsigned int			max_deferred;
127 };
128 
129 /* Put a block back on the AGFL. */
130 STATIC int
xreap_put_freelist(struct xfs_scrub * sc,xfs_agblock_t agbno)131 xreap_put_freelist(
132 	struct xfs_scrub	*sc,
133 	xfs_agblock_t		agbno)
134 {
135 	struct xfs_buf		*agfl_bp;
136 	int			error;
137 
138 	/* Make sure there's space on the freelist. */
139 	error = xrep_fix_freelist(sc, 0);
140 	if (error)
141 		return error;
142 
143 	/*
144 	 * Since we're "freeing" a lost block onto the AGFL, we have to
145 	 * create an rmap for the block prior to merging it or else other
146 	 * parts will break.
147 	 */
148 	error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1,
149 			&XFS_RMAP_OINFO_AG);
150 	if (error)
151 		return error;
152 
153 	/* Put the block on the AGFL. */
154 	error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
155 	if (error)
156 		return error;
157 
158 	error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp,
159 			agfl_bp, agbno, 0);
160 	if (error)
161 		return error;
162 	xfs_extent_busy_insert(sc->tp, pag_group(sc->sa.pag), agbno, 1,
163 			XFS_EXTENT_BUSY_SKIP_DISCARD);
164 
165 	return 0;
166 }
167 
168 /* Are there any uncommitted reap operations? */
xreap_is_dirty(const struct xreap_state * rs)169 static inline bool xreap_is_dirty(const struct xreap_state *rs)
170 {
171 	return rs->nr_binval > 0 || rs->nr_deferred > 0;
172 }
173 
174 /*
175  * Decide if we need to roll the transaction to clear out the the log
176  * reservation that we allocated to buffer invalidations.
177  */
xreap_want_binval_roll(const struct xreap_state * rs)178 static inline bool xreap_want_binval_roll(const struct xreap_state *rs)
179 {
180 	return rs->nr_binval >= rs->max_binval;
181 }
182 
183 /* Reset the buffer invalidation count after rolling. */
xreap_binval_reset(struct xreap_state * rs)184 static inline void xreap_binval_reset(struct xreap_state *rs)
185 {
186 	rs->nr_binval = 0;
187 }
188 
189 /*
190  * Bump the number of invalidated buffers, and return true if we can continue,
191  * or false if we need to roll the transaction.
192  */
xreap_inc_binval(struct xreap_state * rs)193 static inline bool xreap_inc_binval(struct xreap_state *rs)
194 {
195 	rs->nr_binval++;
196 	return rs->nr_binval < rs->max_binval;
197 }
198 
199 /*
200  * Decide if we want to finish the deferred ops that are attached to the scrub
201  * transaction.  We don't want to queue huge chains of deferred ops because
202  * that can consume a lot of log space and kernel memory.  Hence we trigger a
203  * xfs_defer_finish if there are too many deferred reap operations or we've run
204  * out of space for invalidations.
205  */
xreap_want_defer_finish(const struct xreap_state * rs)206 static inline bool xreap_want_defer_finish(const struct xreap_state *rs)
207 {
208 	return rs->nr_deferred >= rs->max_deferred;
209 }
210 
211 /*
212  * Reset the defer chain length and buffer invalidation count after finishing
213  * items.
214  */
xreap_defer_finish_reset(struct xreap_state * rs)215 static inline void xreap_defer_finish_reset(struct xreap_state *rs)
216 {
217 	rs->nr_deferred = 0;
218 	rs->nr_binval = 0;
219 }
220 
221 /*
222  * Bump the number of deferred extent reaps.
223  */
xreap_inc_defer(struct xreap_state * rs)224 static inline void xreap_inc_defer(struct xreap_state *rs)
225 {
226 	rs->nr_deferred++;
227 }
228 
229 /* Force the caller to finish a deferred item chain. */
xreap_force_defer_finish(struct xreap_state * rs)230 static inline void xreap_force_defer_finish(struct xreap_state *rs)
231 {
232 	rs->nr_deferred = rs->max_deferred;
233 }
234 
235 /* Maximum number of fsblocks that we might find in a buffer to invalidate. */
236 static inline unsigned int
xrep_binval_max_fsblocks(struct xfs_mount * mp)237 xrep_binval_max_fsblocks(
238 	struct xfs_mount	*mp)
239 {
240 	/* Remote xattr values are the largest buffers that we support. */
241 	return xfs_attr3_max_rmt_blocks(mp);
242 }
243 
244 /*
245  * Compute the maximum length of a buffer cache scan (in units of sectors),
246  * given a quantity of fs blocks.
247  */
248 xfs_daddr_t
xrep_bufscan_max_sectors(struct xfs_mount * mp,xfs_extlen_t fsblocks)249 xrep_bufscan_max_sectors(
250 	struct xfs_mount	*mp,
251 	xfs_extlen_t		fsblocks)
252 {
253 	return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks,
254 				       xrep_binval_max_fsblocks(mp)));
255 }
256 
257 /*
258  * Return an incore buffer from a sector scan, or NULL if there are no buffers
259  * left to return.
260  */
261 struct xfs_buf *
xrep_bufscan_advance(struct xfs_mount * mp,struct xrep_bufscan * scan)262 xrep_bufscan_advance(
263 	struct xfs_mount	*mp,
264 	struct xrep_bufscan	*scan)
265 {
266 	scan->__sector_count += scan->daddr_step;
267 	while (scan->__sector_count <= scan->max_sectors) {
268 		struct xfs_buf	*bp = NULL;
269 		int		error;
270 
271 		error = xfs_buf_incore(mp->m_ddev_targp, scan->daddr,
272 				scan->__sector_count, XBF_LIVESCAN, &bp);
273 		if (!error)
274 			return bp;
275 
276 		scan->__sector_count += scan->daddr_step;
277 	}
278 
279 	return NULL;
280 }
281 
282 /* Try to invalidate the incore buffers for an extent that we're freeing. */
283 STATIC void
xreap_agextent_binval(struct xreap_state * rs,xfs_agblock_t agbno,xfs_extlen_t * aglenp)284 xreap_agextent_binval(
285 	struct xreap_state	*rs,
286 	xfs_agblock_t		agbno,
287 	xfs_extlen_t		*aglenp)
288 {
289 	struct xfs_scrub	*sc = rs->sc;
290 	struct xfs_perag	*pag = sc->sa.pag;
291 	struct xfs_mount	*mp = sc->mp;
292 	xfs_agblock_t		agbno_next = agbno + *aglenp;
293 	xfs_agblock_t		bno = agbno;
294 
295 	/*
296 	 * Avoid invalidating AG headers and post-EOFS blocks because we never
297 	 * own those.
298 	 */
299 	if (!xfs_verify_agbno(pag, agbno) ||
300 	    !xfs_verify_agbno(pag, agbno_next - 1))
301 		return;
302 
303 	/*
304 	 * If there are incore buffers for these blocks, invalidate them.  We
305 	 * assume that the lack of any other known owners means that the buffer
306 	 * can be locked without risk of deadlocking.  The buffer cache cannot
307 	 * detect aliasing, so employ nested loops to scan for incore buffers
308 	 * of any plausible size.
309 	 */
310 	while (bno < agbno_next) {
311 		struct xrep_bufscan	scan = {
312 			.daddr		= xfs_agbno_to_daddr(pag, bno),
313 			.max_sectors	= xrep_bufscan_max_sectors(mp,
314 							agbno_next - bno),
315 			.daddr_step	= XFS_FSB_TO_BB(mp, 1),
316 		};
317 		struct xfs_buf	*bp;
318 
319 		while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) {
320 			xfs_trans_bjoin(sc->tp, bp);
321 			xfs_trans_binval(sc->tp, bp);
322 
323 			/*
324 			 * Stop invalidating if we've hit the limit; we should
325 			 * still have enough reservation left to free however
326 			 * far we've gotten.
327 			 */
328 			if (!xreap_inc_binval(rs)) {
329 				*aglenp -= agbno_next - bno;
330 				goto out;
331 			}
332 		}
333 
334 		bno++;
335 	}
336 
337 out:
338 	trace_xreap_agextent_binval(pag_group(sc->sa.pag), agbno, *aglenp);
339 }
340 
341 /*
342  * Figure out the longest run of blocks that we can dispose of with a single
343  * call.  Cross-linked blocks should have their reverse mappings removed, but
344  * single-owner extents can be freed.  AGFL blocks can only be put back one at
345  * a time.
346  */
347 STATIC int
xreap_agextent_select(struct xreap_state * rs,xfs_agblock_t agbno,xfs_agblock_t agbno_next,bool * crosslinked,xfs_extlen_t * aglenp)348 xreap_agextent_select(
349 	struct xreap_state	*rs,
350 	xfs_agblock_t		agbno,
351 	xfs_agblock_t		agbno_next,
352 	bool			*crosslinked,
353 	xfs_extlen_t		*aglenp)
354 {
355 	struct xfs_scrub	*sc = rs->sc;
356 	struct xfs_btree_cur	*cur;
357 	xfs_agblock_t		bno = agbno + 1;
358 	xfs_extlen_t		len = 1;
359 	int			error;
360 
361 	/*
362 	 * Determine if there are any other rmap records covering the first
363 	 * block of this extent.  If so, the block is crosslinked.
364 	 */
365 	cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
366 			sc->sa.pag);
367 	error = xfs_rmap_has_other_keys(cur, agbno, 1, rs->oinfo,
368 			crosslinked);
369 	if (error)
370 		goto out_cur;
371 
372 	/* AGFL blocks can only be deal with one at a time. */
373 	if (rs->resv == XFS_AG_RESV_AGFL)
374 		goto out_found;
375 
376 	/*
377 	 * Figure out how many of the subsequent blocks have the same crosslink
378 	 * status.
379 	 */
380 	while (bno < agbno_next) {
381 		bool		also_crosslinked;
382 
383 		error = xfs_rmap_has_other_keys(cur, bno, 1, rs->oinfo,
384 				&also_crosslinked);
385 		if (error)
386 			goto out_cur;
387 
388 		if (*crosslinked != also_crosslinked)
389 			break;
390 
391 		len++;
392 		bno++;
393 	}
394 
395 out_found:
396 	*aglenp = len;
397 	trace_xreap_agextent_select(pag_group(sc->sa.pag), agbno, len,
398 			*crosslinked);
399 out_cur:
400 	xfs_btree_del_cursor(cur, error);
401 	return error;
402 }
403 
404 /*
405  * Dispose of as much of the beginning of this AG extent as possible.  The
406  * number of blocks disposed of will be returned in @aglenp.
407  */
408 STATIC int
xreap_agextent_iter(struct xreap_state * rs,xfs_agblock_t agbno,xfs_extlen_t * aglenp,bool crosslinked)409 xreap_agextent_iter(
410 	struct xreap_state	*rs,
411 	xfs_agblock_t		agbno,
412 	xfs_extlen_t		*aglenp,
413 	bool			crosslinked)
414 {
415 	struct xfs_scrub	*sc = rs->sc;
416 	xfs_fsblock_t		fsbno;
417 	int			error = 0;
418 
419 	ASSERT(rs->resv != XFS_AG_RESV_METAFILE);
420 
421 	fsbno = xfs_agbno_to_fsb(sc->sa.pag, agbno);
422 
423 	/*
424 	 * If there are other rmappings, this block is cross linked and must
425 	 * not be freed.  Remove the reverse mapping and move on.  Otherwise,
426 	 * we were the only owner of the block, so free the extent, which will
427 	 * also remove the rmap.
428 	 *
429 	 * XXX: XFS doesn't support detecting the case where a single block
430 	 * metadata structure is crosslinked with a multi-block structure
431 	 * because the buffer cache doesn't detect aliasing problems, so we
432 	 * can't fix 100% of crosslinking problems (yet).  The verifiers will
433 	 * blow on writeout, the filesystem will shut down, and the admin gets
434 	 * to run xfs_repair.
435 	 */
436 	if (crosslinked) {
437 		trace_xreap_dispose_unmap_extent(pag_group(sc->sa.pag), agbno,
438 				*aglenp);
439 
440 		if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
441 			/*
442 			 * t0: Unmapping CoW staging extents, remove the
443 			 * records from the refcountbt, which will remove the
444 			 * rmap record as well.
445 			 */
446 			xfs_refcount_free_cow_extent(sc->tp, false, fsbno,
447 					*aglenp);
448 			xreap_inc_defer(rs);
449 			return 0;
450 		}
451 
452 		/* t1: unmap crosslinked metadata blocks */
453 		xfs_rmap_free_extent(sc->tp, false, fsbno, *aglenp,
454 				rs->oinfo->oi_owner);
455 		xreap_inc_defer(rs);
456 		return 0;
457 	}
458 
459 	trace_xreap_dispose_free_extent(pag_group(sc->sa.pag), agbno, *aglenp);
460 
461 	/*
462 	 * Invalidate as many buffers as we can, starting at agbno.  If this
463 	 * function sets *aglenp to zero, the transaction is full of logged
464 	 * buffer invalidations, so we need to return early so that we can
465 	 * roll and retry.
466 	 */
467 	xreap_agextent_binval(rs, agbno, aglenp);
468 	if (*aglenp == 0) {
469 		ASSERT(xreap_want_binval_roll(rs));
470 		return 0;
471 	}
472 
473 	/*
474 	 * t2: To get rid of CoW staging extents, use deferred work items
475 	 * to remove the refcountbt records (which removes the rmap records)
476 	 * and free the extent.  We're not worried about the system going down
477 	 * here because log recovery walks the refcount btree to clean out the
478 	 * CoW staging extents.
479 	 */
480 	if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
481 		ASSERT(rs->resv == XFS_AG_RESV_NONE);
482 
483 		xfs_refcount_free_cow_extent(sc->tp, false, fsbno, *aglenp);
484 		error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, NULL,
485 				rs->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
486 		if (error)
487 			return error;
488 
489 		xreap_inc_defer(rs);
490 		return 0;
491 	}
492 
493 	/* t3: Put blocks back on the AGFL one at a time. */
494 	if (rs->resv == XFS_AG_RESV_AGFL) {
495 		ASSERT(*aglenp == 1);
496 		error = xreap_put_freelist(sc, agbno);
497 		if (error)
498 			return error;
499 
500 		xreap_force_defer_finish(rs);
501 		return 0;
502 	}
503 
504 	/*
505 	 * t4: Use deferred frees to get rid of the old btree blocks to try to
506 	 * minimize the window in which we could crash and lose the old blocks.
507 	 * Add a defer ops barrier every other extent to avoid stressing the
508 	 * system with large EFIs.
509 	 */
510 	error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo,
511 			rs->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
512 	if (error)
513 		return error;
514 
515 	xreap_inc_defer(rs);
516 	if (rs->nr_deferred % 2 == 0)
517 		xfs_defer_add_barrier(sc->tp);
518 	return 0;
519 }
520 
521 /* Configure the deferral and invalidation limits */
522 static inline void
xreap_configure_limits(struct xreap_state * rs,unsigned int fixed_overhead,unsigned int variable_overhead,unsigned int per_intent,unsigned int per_binval)523 xreap_configure_limits(
524 	struct xreap_state	*rs,
525 	unsigned int		fixed_overhead,
526 	unsigned int		variable_overhead,
527 	unsigned int		per_intent,
528 	unsigned int		per_binval)
529 {
530 	struct xfs_scrub	*sc = rs->sc;
531 	unsigned int		res = sc->tp->t_log_res - fixed_overhead;
532 
533 	/* Don't underflow the reservation */
534 	if (sc->tp->t_log_res < (fixed_overhead + variable_overhead)) {
535 		ASSERT(sc->tp->t_log_res >=
536 				(fixed_overhead + variable_overhead));
537 		xfs_force_shutdown(sc->mp, SHUTDOWN_CORRUPT_INCORE);
538 		return;
539 	}
540 
541 	rs->max_deferred = per_intent ? res / variable_overhead : 0;
542 	res -= rs->max_deferred * per_intent;
543 	rs->max_binval = per_binval ? res / per_binval : 0;
544 }
545 
546 /*
547  * Compute the maximum number of intent items that reaping can attach to the
548  * scrub transaction given the worst case log overhead of the intent items
549  * needed to reap a single per-AG space extent.  This is not for freeing CoW
550  * staging extents.
551  */
552 STATIC void
xreap_configure_agextent_limits(struct xreap_state * rs)553 xreap_configure_agextent_limits(
554 	struct xreap_state	*rs)
555 {
556 	struct xfs_scrub	*sc = rs->sc;
557 	struct xfs_mount	*mp = sc->mp;
558 
559 	/*
560 	 * In the worst case, relogging an intent item causes both an intent
561 	 * item and a done item to be attached to a transaction for each extent
562 	 * that we'd like to process.
563 	 */
564 	const unsigned int	efi = xfs_efi_log_space(1) +
565 				      xfs_efd_log_space(1);
566 	const unsigned int	rui = xfs_rui_log_space(1) +
567 				      xfs_rud_log_space();
568 
569 	/*
570 	 * Various things can happen when reaping non-CoW metadata blocks:
571 	 *
572 	 * t1: Unmapping crosslinked metadata blocks: deferred removal of rmap
573 	 * record.
574 	 *
575 	 * t3: Freeing to AGFL: roll and finish deferred items for every block.
576 	 * Limits here do not matter.
577 	 *
578 	 * t4: Freeing metadata blocks: deferred freeing of the space, which
579 	 * also removes the rmap record.
580 	 *
581 	 * For simplicity, we'll use the worst-case intents size to determine
582 	 * the maximum number of deferred extents before we have to finish the
583 	 * whole chain.  If we're trying to reap a btree larger than this size,
584 	 * a crash midway through reaping can result in leaked blocks.
585 	 */
586 	const unsigned int	t1 = rui;
587 	const unsigned int	t4 = rui + efi;
588 	const unsigned int	per_intent = max(t1, t4);
589 
590 	/*
591 	 * For each transaction in a reap chain, we must be able to take one
592 	 * step in the defer item chain, which should only consist of EFI or
593 	 * RUI items.
594 	 */
595 	const unsigned int	f1 = xfs_calc_finish_efi_reservation(mp, 1);
596 	const unsigned int	f2 = xfs_calc_finish_rui_reservation(mp, 1);
597 	const unsigned int	step_size = max(f1, f2);
598 
599 	/* Largest buffer size (in fsblocks) that can be invalidated. */
600 	const unsigned int	max_binval = xrep_binval_max_fsblocks(mp);
601 
602 	/* Maximum overhead of invalidating one buffer. */
603 	const unsigned int	per_binval =
604 		xfs_buf_inval_log_space(1, XFS_B_TO_FSBT(mp, max_binval));
605 
606 	/*
607 	 * For each transaction in a reap chain, we can delete some number of
608 	 * extents and invalidate some number of blocks.  We assume that btree
609 	 * blocks aren't usually contiguous; and that scrub likely pulled all
610 	 * the buffers into memory.  From these assumptions, set the maximum
611 	 * number of deferrals we can queue before flushing the defer chain,
612 	 * and the number of invalidations we can queue before rolling to a
613 	 * clean transaction (and possibly relogging some of the deferrals) to
614 	 * the same quantity.
615 	 */
616 	const unsigned int	variable_overhead = per_intent + per_binval;
617 
618 	xreap_configure_limits(rs, step_size, variable_overhead, per_intent,
619 			per_binval);
620 
621 	trace_xreap_agextent_limits(sc->tp, per_binval, rs->max_binval,
622 			step_size, per_intent, rs->max_deferred);
623 }
624 
625 /*
626  * Compute the maximum number of intent items that reaping can attach to the
627  * scrub transaction given the worst case log overhead of the intent items
628  * needed to reap a single CoW staging extent.  This is not for freeing
629  * metadata blocks.
630  */
631 STATIC void
xreap_configure_agcow_limits(struct xreap_state * rs)632 xreap_configure_agcow_limits(
633 	struct xreap_state	*rs)
634 {
635 	struct xfs_scrub	*sc = rs->sc;
636 	struct xfs_mount	*mp = sc->mp;
637 
638 	/*
639 	 * In the worst case, relogging an intent item causes both an intent
640 	 * item and a done item to be attached to a transaction for each extent
641 	 * that we'd like to process.
642 	 */
643 	const unsigned int	efi = xfs_efi_log_space(1) +
644 				      xfs_efd_log_space(1);
645 	const unsigned int	rui = xfs_rui_log_space(1) +
646 				      xfs_rud_log_space();
647 	const unsigned int	cui = xfs_cui_log_space(1) +
648 				      xfs_cud_log_space();
649 
650 	/*
651 	 * Various things can happen when reaping non-CoW metadata blocks:
652 	 *
653 	 * t0: Unmapping crosslinked CoW blocks: deferred removal of refcount
654 	 * record, which defers removal of rmap record
655 	 *
656 	 * t2: Freeing CoW blocks: deferred removal of refcount record, which
657 	 * defers removal of rmap record; and deferred removal of the space
658 	 *
659 	 * For simplicity, we'll use the worst-case intents size to determine
660 	 * the maximum number of deferred extents before we have to finish the
661 	 * whole chain.  If we're trying to reap a btree larger than this size,
662 	 * a crash midway through reaping can result in leaked blocks.
663 	 */
664 	const unsigned int	t0 = cui + rui;
665 	const unsigned int	t2 = cui + rui + efi;
666 	const unsigned int	per_intent = max(t0, t2);
667 
668 	/*
669 	 * For each transaction in a reap chain, we must be able to take one
670 	 * step in the defer item chain, which should only consist of CUI, EFI,
671 	 * or RUI items.
672 	 */
673 	const unsigned int	f1 = xfs_calc_finish_efi_reservation(mp, 1);
674 	const unsigned int	f2 = xfs_calc_finish_rui_reservation(mp, 1);
675 	const unsigned int	f3 = xfs_calc_finish_cui_reservation(mp, 1);
676 	const unsigned int	step_size = max3(f1, f2, f3);
677 
678 	/* Largest buffer size (in fsblocks) that can be invalidated. */
679 	const unsigned int	max_binval = xrep_binval_max_fsblocks(mp);
680 
681 	/* Overhead of invalidating one buffer */
682 	const unsigned int	per_binval =
683 		xfs_buf_inval_log_space(1, XFS_B_TO_FSBT(mp, max_binval));
684 
685 	/*
686 	 * For each transaction in a reap chain, we can delete some number of
687 	 * extents and invalidate some number of blocks.  We assume that CoW
688 	 * staging extents are usually more than 1 fsblock, and that there
689 	 * shouldn't be any buffers for those blocks.  From the assumptions,
690 	 * set the number of deferrals to use as much of the reservation as
691 	 * it can, but leave space to invalidate 1/8th that number of buffers.
692 	 */
693 	const unsigned int	variable_overhead = per_intent +
694 							(per_binval / 8);
695 
696 	xreap_configure_limits(rs, step_size, variable_overhead, per_intent,
697 			per_binval);
698 
699 	trace_xreap_agcow_limits(sc->tp, per_binval, rs->max_binval, step_size,
700 			per_intent, rs->max_deferred);
701 }
702 
703 /*
704  * Break an AG metadata extent into sub-extents by fate (crosslinked, not
705  * crosslinked), and dispose of each sub-extent separately.
706  */
707 STATIC int
xreap_agmeta_extent(uint32_t agbno,uint32_t len,void * priv)708 xreap_agmeta_extent(
709 	uint32_t		agbno,
710 	uint32_t		len,
711 	void			*priv)
712 {
713 	struct xreap_state	*rs = priv;
714 	struct xfs_scrub	*sc = rs->sc;
715 	xfs_agblock_t		agbno_next = agbno + len;
716 	int			error = 0;
717 
718 	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
719 	ASSERT(sc->ip == NULL);
720 
721 	while (agbno < agbno_next) {
722 		xfs_extlen_t	aglen;
723 		bool		crosslinked;
724 
725 		error = xreap_agextent_select(rs, agbno, agbno_next,
726 				&crosslinked, &aglen);
727 		if (error)
728 			return error;
729 
730 		error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked);
731 		if (error)
732 			return error;
733 
734 		if (xreap_want_defer_finish(rs)) {
735 			error = xrep_defer_finish(sc);
736 			if (error)
737 				return error;
738 			xreap_defer_finish_reset(rs);
739 		} else if (xreap_want_binval_roll(rs)) {
740 			error = xrep_roll_ag_trans(sc);
741 			if (error)
742 				return error;
743 			xreap_binval_reset(rs);
744 		}
745 
746 		agbno += aglen;
747 	}
748 
749 	return 0;
750 }
751 
752 /* Dispose of every block of every AG metadata extent in the bitmap. */
753 int
xrep_reap_agblocks(struct xfs_scrub * sc,struct xagb_bitmap * bitmap,const struct xfs_owner_info * oinfo,enum xfs_ag_resv_type type)754 xrep_reap_agblocks(
755 	struct xfs_scrub		*sc,
756 	struct xagb_bitmap		*bitmap,
757 	const struct xfs_owner_info	*oinfo,
758 	enum xfs_ag_resv_type		type)
759 {
760 	struct xreap_state		rs = {
761 		.sc			= sc,
762 		.oinfo			= oinfo,
763 		.resv			= type,
764 	};
765 	int				error;
766 
767 	ASSERT(xfs_has_rmapbt(sc->mp));
768 	ASSERT(sc->ip == NULL);
769 
770 	xreap_configure_agextent_limits(&rs);
771 	error = xagb_bitmap_walk(bitmap, xreap_agmeta_extent, &rs);
772 	if (error)
773 		return error;
774 
775 	if (xreap_is_dirty(&rs))
776 		return xrep_defer_finish(sc);
777 
778 	return 0;
779 }
780 
781 /*
782  * Break a file metadata extent into sub-extents by fate (crosslinked, not
783  * crosslinked), and dispose of each sub-extent separately.  The extent must
784  * not cross an AG boundary.
785  */
786 STATIC int
xreap_fsmeta_extent(uint64_t fsbno,uint64_t len,void * priv)787 xreap_fsmeta_extent(
788 	uint64_t		fsbno,
789 	uint64_t		len,
790 	void			*priv)
791 {
792 	struct xreap_state	*rs = priv;
793 	struct xfs_scrub	*sc = rs->sc;
794 	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
795 	xfs_agblock_t		agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
796 	xfs_agblock_t		agbno_next = agbno + len;
797 	int			error = 0;
798 
799 	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
800 	ASSERT(sc->ip != NULL);
801 	ASSERT(!sc->sa.pag);
802 
803 	/*
804 	 * We're reaping blocks after repairing file metadata, which means that
805 	 * we have to init the xchk_ag structure ourselves.
806 	 */
807 	sc->sa.pag = xfs_perag_get(sc->mp, agno);
808 	if (!sc->sa.pag)
809 		return -EFSCORRUPTED;
810 
811 	error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp);
812 	if (error)
813 		goto out_pag;
814 
815 	while (agbno < agbno_next) {
816 		xfs_extlen_t	aglen;
817 		bool		crosslinked;
818 
819 		error = xreap_agextent_select(rs, agbno, agbno_next,
820 				&crosslinked, &aglen);
821 		if (error)
822 			goto out_agf;
823 
824 		error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked);
825 		if (error)
826 			goto out_agf;
827 
828 		if (xreap_want_defer_finish(rs)) {
829 			/*
830 			 * Holds the AGF buffer across the deferred chain
831 			 * processing.
832 			 */
833 			error = xrep_defer_finish(sc);
834 			if (error)
835 				goto out_agf;
836 			xreap_defer_finish_reset(rs);
837 		} else if (xreap_want_binval_roll(rs)) {
838 			/*
839 			 * Hold the AGF buffer across the transaction roll so
840 			 * that we don't have to reattach it to the scrub
841 			 * context.
842 			 */
843 			xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
844 			error = xfs_trans_roll_inode(&sc->tp, sc->ip);
845 			xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
846 			if (error)
847 				goto out_agf;
848 			xreap_binval_reset(rs);
849 		}
850 
851 		agbno += aglen;
852 	}
853 
854 out_agf:
855 	xfs_trans_brelse(sc->tp, sc->sa.agf_bp);
856 	sc->sa.agf_bp = NULL;
857 out_pag:
858 	xfs_perag_put(sc->sa.pag);
859 	sc->sa.pag = NULL;
860 	return error;
861 }
862 
863 /*
864  * Dispose of every block of every fs metadata extent in the bitmap.
865  * Do not use this to dispose of the mappings in an ondisk inode fork.
866  */
867 int
xrep_reap_fsblocks(struct xfs_scrub * sc,struct xfsb_bitmap * bitmap,const struct xfs_owner_info * oinfo)868 xrep_reap_fsblocks(
869 	struct xfs_scrub		*sc,
870 	struct xfsb_bitmap		*bitmap,
871 	const struct xfs_owner_info	*oinfo)
872 {
873 	struct xreap_state		rs = {
874 		.sc			= sc,
875 		.oinfo			= oinfo,
876 		.resv			= XFS_AG_RESV_NONE,
877 	};
878 	int				error;
879 
880 	ASSERT(xfs_has_rmapbt(sc->mp));
881 	ASSERT(sc->ip != NULL);
882 
883 	if (oinfo == &XFS_RMAP_OINFO_COW)
884 		xreap_configure_agcow_limits(&rs);
885 	else
886 		xreap_configure_agextent_limits(&rs);
887 	error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs);
888 	if (error)
889 		return error;
890 
891 	if (xreap_is_dirty(&rs))
892 		return xrep_defer_finish(sc);
893 
894 	return 0;
895 }
896 
897 #ifdef CONFIG_XFS_RT
898 /*
899  * Figure out the longest run of blocks that we can dispose of with a single
900  * call.  Cross-linked blocks should have their reverse mappings removed, but
901  * single-owner extents can be freed.  Units are rt blocks, not rt extents.
902  */
903 STATIC int
xreap_rgextent_select(struct xreap_state * rs,xfs_rgblock_t rgbno,xfs_rgblock_t rgbno_next,bool * crosslinked,xfs_extlen_t * rglenp)904 xreap_rgextent_select(
905 	struct xreap_state	*rs,
906 	xfs_rgblock_t		rgbno,
907 	xfs_rgblock_t		rgbno_next,
908 	bool			*crosslinked,
909 	xfs_extlen_t		*rglenp)
910 {
911 	struct xfs_scrub	*sc = rs->sc;
912 	struct xfs_btree_cur	*cur;
913 	xfs_rgblock_t		bno = rgbno + 1;
914 	xfs_extlen_t		len = 1;
915 	int			error;
916 
917 	/*
918 	 * Determine if there are any other rmap records covering the first
919 	 * block of this extent.  If so, the block is crosslinked.
920 	 */
921 	cur = xfs_rtrmapbt_init_cursor(sc->tp, sc->sr.rtg);
922 	error = xfs_rmap_has_other_keys(cur, rgbno, 1, rs->oinfo,
923 			crosslinked);
924 	if (error)
925 		goto out_cur;
926 
927 	/*
928 	 * Figure out how many of the subsequent blocks have the same crosslink
929 	 * status.
930 	 */
931 	while (bno < rgbno_next) {
932 		bool		also_crosslinked;
933 
934 		error = xfs_rmap_has_other_keys(cur, bno, 1, rs->oinfo,
935 				&also_crosslinked);
936 		if (error)
937 			goto out_cur;
938 
939 		if (*crosslinked != also_crosslinked)
940 			break;
941 
942 		len++;
943 		bno++;
944 	}
945 
946 	*rglenp = len;
947 	trace_xreap_agextent_select(rtg_group(sc->sr.rtg), rgbno, len,
948 			*crosslinked);
949 out_cur:
950 	xfs_btree_del_cursor(cur, error);
951 	return error;
952 }
953 
954 /*
955  * Dispose of as much of the beginning of this rtgroup extent as possible.
956  * The number of blocks disposed of will be returned in @rglenp.
957  */
958 STATIC int
xreap_rgextent_iter(struct xreap_state * rs,xfs_rgblock_t rgbno,xfs_extlen_t * rglenp,bool crosslinked)959 xreap_rgextent_iter(
960 	struct xreap_state	*rs,
961 	xfs_rgblock_t		rgbno,
962 	xfs_extlen_t		*rglenp,
963 	bool			crosslinked)
964 {
965 	struct xfs_scrub	*sc = rs->sc;
966 	xfs_rtblock_t		rtbno;
967 	int			error;
968 
969 	/*
970 	 * The only caller so far is CoW fork repair, so we only know how to
971 	 * unlink or free CoW staging extents.  Here we don't have to worry
972 	 * about invalidating buffers!
973 	 */
974 	if (rs->oinfo != &XFS_RMAP_OINFO_COW) {
975 		ASSERT(rs->oinfo == &XFS_RMAP_OINFO_COW);
976 		return -EFSCORRUPTED;
977 	}
978 	ASSERT(rs->resv == XFS_AG_RESV_NONE);
979 
980 	rtbno = xfs_rgbno_to_rtb(sc->sr.rtg, rgbno);
981 
982 	/*
983 	 * t1: There are other rmappings; this block is cross linked and must
984 	 * not be freed.  Remove the forward and reverse mapping and move on.
985 	 */
986 	if (crosslinked) {
987 		trace_xreap_dispose_unmap_extent(rtg_group(sc->sr.rtg), rgbno,
988 				*rglenp);
989 
990 		xfs_refcount_free_cow_extent(sc->tp, true, rtbno, *rglenp);
991 		xreap_inc_defer(rs);
992 		return 0;
993 	}
994 
995 	trace_xreap_dispose_free_extent(rtg_group(sc->sr.rtg), rgbno, *rglenp);
996 
997 	/*
998 	 * t2: The CoW staging extent is not crosslinked.  Use deferred work
999 	 * to remove the refcountbt records (which removes the rmap records)
1000 	 * and free the extent.  We're not worried about the system going down
1001 	 * here because log recovery walks the refcount btree to clean out the
1002 	 * CoW staging extents.
1003 	 */
1004 	xfs_refcount_free_cow_extent(sc->tp, true, rtbno, *rglenp);
1005 	error = xfs_free_extent_later(sc->tp, rtbno, *rglenp, NULL,
1006 			rs->resv,
1007 			XFS_FREE_EXTENT_REALTIME |
1008 			XFS_FREE_EXTENT_SKIP_DISCARD);
1009 	if (error)
1010 		return error;
1011 
1012 	xreap_inc_defer(rs);
1013 	return 0;
1014 }
1015 
1016 /*
1017  * Compute the maximum number of intent items that reaping can attach to the
1018  * scrub transaction given the worst case log overhead of the intent items
1019  * needed to reap a single CoW staging extent.  This is not for freeing
1020  * metadata blocks.
1021  */
1022 STATIC void
xreap_configure_rgcow_limits(struct xreap_state * rs)1023 xreap_configure_rgcow_limits(
1024 	struct xreap_state	*rs)
1025 {
1026 	struct xfs_scrub	*sc = rs->sc;
1027 	struct xfs_mount	*mp = sc->mp;
1028 
1029 	/*
1030 	 * In the worst case, relogging an intent item causes both an intent
1031 	 * item and a done item to be attached to a transaction for each extent
1032 	 * that we'd like to process.
1033 	 */
1034 	const unsigned int	efi = xfs_efi_log_space(1) +
1035 				      xfs_efd_log_space(1);
1036 	const unsigned int	rui = xfs_rui_log_space(1) +
1037 				      xfs_rud_log_space();
1038 	const unsigned int	cui = xfs_cui_log_space(1) +
1039 				      xfs_cud_log_space();
1040 
1041 	/*
1042 	 * Various things can happen when reaping non-CoW metadata blocks:
1043 	 *
1044 	 * t1: Unmapping crosslinked CoW blocks: deferred removal of refcount
1045 	 * record, which defers removal of rmap record
1046 	 *
1047 	 * t2: Freeing CoW blocks: deferred removal of refcount record, which
1048 	 * defers removal of rmap record; and deferred removal of the space
1049 	 *
1050 	 * For simplicity, we'll use the worst-case intents size to determine
1051 	 * the maximum number of deferred extents before we have to finish the
1052 	 * whole chain.  If we're trying to reap a btree larger than this size,
1053 	 * a crash midway through reaping can result in leaked blocks.
1054 	 */
1055 	const unsigned int	t1 = cui + rui;
1056 	const unsigned int	t2 = cui + rui + efi;
1057 	const unsigned int	per_intent = max(t1, t2);
1058 
1059 	/*
1060 	 * For each transaction in a reap chain, we must be able to take one
1061 	 * step in the defer item chain, which should only consist of CUI, EFI,
1062 	 * or RUI items.
1063 	 */
1064 	const unsigned int	f1 = xfs_calc_finish_rt_efi_reservation(mp, 1);
1065 	const unsigned int	f2 = xfs_calc_finish_rt_rui_reservation(mp, 1);
1066 	const unsigned int	f3 = xfs_calc_finish_rt_cui_reservation(mp, 1);
1067 	const unsigned int	step_size = max3(f1, f2, f3);
1068 
1069 	/*
1070 	 * The only buffer for the rt device is the rtgroup super, so we don't
1071 	 * need to save space for buffer invalidations.
1072 	 */
1073 	xreap_configure_limits(rs, step_size, per_intent, per_intent, 0);
1074 
1075 	trace_xreap_rgcow_limits(sc->tp, 0, 0, step_size, per_intent,
1076 			rs->max_deferred);
1077 }
1078 
1079 #define XREAP_RTGLOCK_ALL	(XFS_RTGLOCK_BITMAP | \
1080 				 XFS_RTGLOCK_RMAP | \
1081 				 XFS_RTGLOCK_REFCOUNT)
1082 
1083 /*
1084  * Break a rt file metadata extent into sub-extents by fate (crosslinked, not
1085  * crosslinked), and dispose of each sub-extent separately.  The extent must
1086  * be aligned to a realtime extent.
1087  */
1088 STATIC int
xreap_rtmeta_extent(uint64_t rtbno,uint64_t len,void * priv)1089 xreap_rtmeta_extent(
1090 	uint64_t		rtbno,
1091 	uint64_t		len,
1092 	void			*priv)
1093 {
1094 	struct xreap_state	*rs = priv;
1095 	struct xfs_scrub	*sc = rs->sc;
1096 	xfs_rgblock_t		rgbno = xfs_rtb_to_rgbno(sc->mp, rtbno);
1097 	xfs_rgblock_t		rgbno_next = rgbno + len;
1098 	int			error = 0;
1099 
1100 	ASSERT(sc->ip != NULL);
1101 	ASSERT(!sc->sr.rtg);
1102 
1103 	/*
1104 	 * We're reaping blocks after repairing file metadata, which means that
1105 	 * we have to init the xchk_ag structure ourselves.
1106 	 */
1107 	sc->sr.rtg = xfs_rtgroup_get(sc->mp, xfs_rtb_to_rgno(sc->mp, rtbno));
1108 	if (!sc->sr.rtg)
1109 		return -EFSCORRUPTED;
1110 
1111 	xfs_rtgroup_lock(sc->sr.rtg, XREAP_RTGLOCK_ALL);
1112 
1113 	while (rgbno < rgbno_next) {
1114 		xfs_extlen_t	rglen;
1115 		bool		crosslinked;
1116 
1117 		error = xreap_rgextent_select(rs, rgbno, rgbno_next,
1118 				&crosslinked, &rglen);
1119 		if (error)
1120 			goto out_unlock;
1121 
1122 		error = xreap_rgextent_iter(rs, rgbno, &rglen, crosslinked);
1123 		if (error)
1124 			goto out_unlock;
1125 
1126 		if (xreap_want_defer_finish(rs)) {
1127 			error = xfs_defer_finish(&sc->tp);
1128 			if (error)
1129 				goto out_unlock;
1130 			xreap_defer_finish_reset(rs);
1131 		} else if (xreap_want_binval_roll(rs)) {
1132 			error = xfs_trans_roll_inode(&sc->tp, sc->ip);
1133 			if (error)
1134 				goto out_unlock;
1135 			xreap_binval_reset(rs);
1136 		}
1137 
1138 		rgbno += rglen;
1139 	}
1140 
1141 out_unlock:
1142 	xfs_rtgroup_unlock(sc->sr.rtg, XREAP_RTGLOCK_ALL);
1143 	xfs_rtgroup_put(sc->sr.rtg);
1144 	sc->sr.rtg = NULL;
1145 	return error;
1146 }
1147 
1148 /*
1149  * Dispose of every block of every rt metadata extent in the bitmap.
1150  * Do not use this to dispose of the mappings in an ondisk inode fork.
1151  */
1152 int
xrep_reap_rtblocks(struct xfs_scrub * sc,struct xrtb_bitmap * bitmap,const struct xfs_owner_info * oinfo)1153 xrep_reap_rtblocks(
1154 	struct xfs_scrub		*sc,
1155 	struct xrtb_bitmap		*bitmap,
1156 	const struct xfs_owner_info	*oinfo)
1157 {
1158 	struct xreap_state		rs = {
1159 		.sc			= sc,
1160 		.oinfo			= oinfo,
1161 		.resv			= XFS_AG_RESV_NONE,
1162 	};
1163 	int				error;
1164 
1165 	ASSERT(xfs_has_rmapbt(sc->mp));
1166 	ASSERT(sc->ip != NULL);
1167 	ASSERT(oinfo == &XFS_RMAP_OINFO_COW);
1168 
1169 	xreap_configure_rgcow_limits(&rs);
1170 	error = xrtb_bitmap_walk(bitmap, xreap_rtmeta_extent, &rs);
1171 	if (error)
1172 		return error;
1173 
1174 	if (xreap_is_dirty(&rs))
1175 		return xrep_defer_finish(sc);
1176 
1177 	return 0;
1178 }
1179 #endif /* CONFIG_XFS_RT */
1180 
1181 /*
1182  * Dispose of every block of an old metadata btree that used to be rooted in a
1183  * metadata directory file.
1184  */
1185 int
xrep_reap_metadir_fsblocks(struct xfs_scrub * sc,struct xfsb_bitmap * bitmap)1186 xrep_reap_metadir_fsblocks(
1187 	struct xfs_scrub		*sc,
1188 	struct xfsb_bitmap		*bitmap)
1189 {
1190 	/*
1191 	 * Reap old metadir btree blocks with XFS_AG_RESV_NONE because the old
1192 	 * blocks are no longer mapped by the inode, and inode metadata space
1193 	 * reservations can only account freed space to the i_nblocks.
1194 	 */
1195 	struct xfs_owner_info		oinfo;
1196 	struct xreap_state		rs = {
1197 		.sc			= sc,
1198 		.oinfo			= &oinfo,
1199 		.resv			= XFS_AG_RESV_NONE,
1200 	};
1201 	int				error;
1202 
1203 	ASSERT(xfs_has_rmapbt(sc->mp));
1204 	ASSERT(sc->ip != NULL);
1205 	ASSERT(xfs_is_metadir_inode(sc->ip));
1206 
1207 	xreap_configure_agextent_limits(&rs);
1208 	xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK);
1209 	error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs);
1210 	if (error)
1211 		return error;
1212 
1213 	if (xreap_is_dirty(&rs)) {
1214 		error = xrep_defer_finish(sc);
1215 		if (error)
1216 			return error;
1217 	}
1218 
1219 	return xrep_reset_metafile_resv(sc);
1220 }
1221 
1222 /*
1223  * Metadata files are not supposed to share blocks with anything else.
1224  * If blocks are shared, we remove the reverse mapping (thus reducing the
1225  * crosslink factor); if blocks are not shared, we also need to free them.
1226  *
1227  * This first step determines the longest subset of the passed-in imap
1228  * (starting at its beginning) that is either crosslinked or not crosslinked.
1229  * The blockcount will be adjust down as needed.
1230  */
1231 STATIC int
xreap_bmapi_select(struct xreap_state * rs,struct xfs_bmbt_irec * imap,bool * crosslinked)1232 xreap_bmapi_select(
1233 	struct xreap_state	*rs,
1234 	struct xfs_bmbt_irec	*imap,
1235 	bool			*crosslinked)
1236 {
1237 	struct xfs_owner_info	oinfo;
1238 	struct xfs_scrub	*sc = rs->sc;
1239 	struct xfs_btree_cur	*cur;
1240 	xfs_filblks_t		len = 1;
1241 	xfs_agblock_t		bno;
1242 	xfs_agblock_t		agbno;
1243 	xfs_agblock_t		agbno_next;
1244 	int			error;
1245 
1246 	agbno = XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock);
1247 	agbno_next = agbno + imap->br_blockcount;
1248 
1249 	cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
1250 			sc->sa.pag);
1251 
1252 	xfs_rmap_ino_owner(&oinfo, rs->ip->i_ino, rs->whichfork,
1253 			imap->br_startoff);
1254 	error = xfs_rmap_has_other_keys(cur, agbno, 1, &oinfo, crosslinked);
1255 	if (error)
1256 		goto out_cur;
1257 
1258 	bno = agbno + 1;
1259 	while (bno < agbno_next) {
1260 		bool		also_crosslinked;
1261 
1262 		oinfo.oi_offset++;
1263 		error = xfs_rmap_has_other_keys(cur, bno, 1, &oinfo,
1264 				&also_crosslinked);
1265 		if (error)
1266 			goto out_cur;
1267 
1268 		if (also_crosslinked != *crosslinked)
1269 			break;
1270 
1271 		len++;
1272 		bno++;
1273 	}
1274 
1275 	imap->br_blockcount = len;
1276 	trace_xreap_bmapi_select(pag_group(sc->sa.pag), agbno, len,
1277 			*crosslinked);
1278 out_cur:
1279 	xfs_btree_del_cursor(cur, error);
1280 	return error;
1281 }
1282 
1283 /*
1284  * Decide if this buffer can be joined to a transaction.  This is true for most
1285  * buffers, but there are two cases that we want to catch: large remote xattr
1286  * value buffers are not logged and can overflow the buffer log item dirty
1287  * bitmap size; and oversized cached buffers if things have really gone
1288  * haywire.
1289  */
1290 static inline bool
xreap_buf_loggable(const struct xfs_buf * bp)1291 xreap_buf_loggable(
1292 	const struct xfs_buf	*bp)
1293 {
1294 	int			i;
1295 
1296 	for (i = 0; i < bp->b_map_count; i++) {
1297 		int		chunks;
1298 		int		map_size;
1299 
1300 		chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
1301 				XFS_BLF_CHUNK);
1302 		map_size = DIV_ROUND_UP(chunks, NBWORD);
1303 		if (map_size > XFS_BLF_DATAMAP_SIZE)
1304 			return false;
1305 	}
1306 
1307 	return true;
1308 }
1309 
1310 /*
1311  * Invalidate any buffers for this file mapping.  The @imap blockcount may be
1312  * adjusted downward if we need to roll the transaction.
1313  */
1314 STATIC int
xreap_bmapi_binval(struct xreap_state * rs,struct xfs_bmbt_irec * imap)1315 xreap_bmapi_binval(
1316 	struct xreap_state	*rs,
1317 	struct xfs_bmbt_irec	*imap)
1318 {
1319 	struct xfs_scrub	*sc = rs->sc;
1320 	struct xfs_mount	*mp = sc->mp;
1321 	struct xfs_perag	*pag = sc->sa.pag;
1322 	int			bmap_flags = xfs_bmapi_aflag(rs->whichfork);
1323 	xfs_fileoff_t		off;
1324 	xfs_fileoff_t		max_off;
1325 	xfs_extlen_t		scan_blocks;
1326 	xfs_agblock_t		bno;
1327 	xfs_agblock_t		agbno;
1328 	xfs_agblock_t		agbno_next;
1329 	int			error;
1330 
1331 	/*
1332 	 * Avoid invalidating AG headers and post-EOFS blocks because we never
1333 	 * own those.
1334 	 */
1335 	agbno = bno = XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock);
1336 	agbno_next = agbno + imap->br_blockcount;
1337 	if (!xfs_verify_agbno(pag, agbno) ||
1338 	    !xfs_verify_agbno(pag, agbno_next - 1))
1339 		return 0;
1340 
1341 	/*
1342 	 * Buffers for file blocks can span multiple contiguous mappings.  This
1343 	 * means that for each block in the mapping, there could exist an
1344 	 * xfs_buf indexed by that block with any length up to the maximum
1345 	 * buffer size (remote xattr values) or to the next hole in the fork.
1346 	 * To set up our binval scan, first we need to figure out the location
1347 	 * of the next hole.
1348 	 */
1349 	off = imap->br_startoff + imap->br_blockcount;
1350 	max_off = off + xfs_attr3_max_rmt_blocks(mp);
1351 	while (off < max_off) {
1352 		struct xfs_bmbt_irec	hmap;
1353 		int			nhmaps = 1;
1354 
1355 		error = xfs_bmapi_read(rs->ip, off, max_off - off, &hmap,
1356 				&nhmaps, bmap_flags);
1357 		if (error)
1358 			return error;
1359 		if (nhmaps != 1 || hmap.br_startblock == DELAYSTARTBLOCK) {
1360 			ASSERT(0);
1361 			return -EFSCORRUPTED;
1362 		}
1363 
1364 		if (!xfs_bmap_is_real_extent(&hmap))
1365 			break;
1366 
1367 		off = hmap.br_startoff + hmap.br_blockcount;
1368 	}
1369 	scan_blocks = off - imap->br_startoff;
1370 
1371 	trace_xreap_bmapi_binval_scan(sc, imap, scan_blocks);
1372 
1373 	/*
1374 	 * If there are incore buffers for these blocks, invalidate them.  If
1375 	 * we can't (try)lock the buffer we assume it's owned by someone else
1376 	 * and leave it alone.  The buffer cache cannot detect aliasing, so
1377 	 * employ nested loops to detect incore buffers of any plausible size.
1378 	 */
1379 	while (bno < agbno_next) {
1380 		struct xrep_bufscan	scan = {
1381 			.daddr		= xfs_agbno_to_daddr(pag, bno),
1382 			.max_sectors	= xrep_bufscan_max_sectors(mp,
1383 								scan_blocks),
1384 			.daddr_step	= XFS_FSB_TO_BB(mp, 1),
1385 		};
1386 		struct xfs_buf		*bp;
1387 
1388 		while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) {
1389 			if (xreap_buf_loggable(bp)) {
1390 				xfs_trans_bjoin(sc->tp, bp);
1391 				xfs_trans_binval(sc->tp, bp);
1392 			} else {
1393 				xfs_buf_stale(bp);
1394 				xfs_buf_relse(bp);
1395 			}
1396 
1397 			/*
1398 			 * Stop invalidating if we've hit the limit; we should
1399 			 * still have enough reservation left to free however
1400 			 * far we've gotten.
1401 			 */
1402 			if (!xreap_inc_binval(rs)) {
1403 				imap->br_blockcount = agbno_next - bno;
1404 				goto out;
1405 			}
1406 		}
1407 
1408 		bno++;
1409 		scan_blocks--;
1410 	}
1411 
1412 out:
1413 	trace_xreap_bmapi_binval(pag_group(sc->sa.pag), agbno,
1414 			imap->br_blockcount);
1415 	return 0;
1416 }
1417 
1418 /*
1419  * Dispose of as much of the beginning of this file fork mapping as possible.
1420  * The number of blocks disposed of is returned in @imap->br_blockcount.
1421  */
1422 STATIC int
xrep_reap_bmapi_iter(struct xreap_state * rs,struct xfs_bmbt_irec * imap,bool crosslinked)1423 xrep_reap_bmapi_iter(
1424 	struct xreap_state		*rs,
1425 	struct xfs_bmbt_irec		*imap,
1426 	bool				crosslinked)
1427 {
1428 	struct xfs_scrub		*sc = rs->sc;
1429 	int				error;
1430 
1431 	if (crosslinked) {
1432 		/*
1433 		 * If there are other rmappings, this block is cross linked and
1434 		 * must not be freed.  Remove the reverse mapping, leave the
1435 		 * buffer cache in its possibly confused state, and move on.
1436 		 * We don't want to risk discarding valid data buffers from
1437 		 * anybody else who thinks they own the block, even though that
1438 		 * runs the risk of stale buffer warnings in the future.
1439 		 */
1440 		trace_xreap_dispose_unmap_extent(pag_group(sc->sa.pag),
1441 				XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock),
1442 				imap->br_blockcount);
1443 
1444 		/*
1445 		 * t0: Schedule removal of the mapping from the fork.  We use
1446 		 * deferred log intents in this function to control the exact
1447 		 * sequence of metadata updates.
1448 		 */
1449 		xfs_bmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap);
1450 		xfs_trans_mod_dquot_byino(sc->tp, rs->ip, XFS_TRANS_DQ_BCOUNT,
1451 				-(int64_t)imap->br_blockcount);
1452 		xfs_rmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap);
1453 		return 0;
1454 	}
1455 
1456 	/*
1457 	 * If the block is not crosslinked, we can invalidate all the incore
1458 	 * buffers for the extent, and then free the extent.  This is a bit of
1459 	 * a mess since we don't detect discontiguous buffers that are indexed
1460 	 * by a block starting before the first block of the extent but overlap
1461 	 * anyway.
1462 	 */
1463 	trace_xreap_dispose_free_extent(pag_group(sc->sa.pag),
1464 			XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock),
1465 			imap->br_blockcount);
1466 
1467 	/*
1468 	 * Invalidate as many buffers as we can, starting at the beginning of
1469 	 * this mapping.  If this function sets blockcount to zero, the
1470 	 * transaction is full of logged buffer invalidations, so we need to
1471 	 * return early so that we can roll and retry.
1472 	 */
1473 	error = xreap_bmapi_binval(rs, imap);
1474 	if (error || imap->br_blockcount == 0)
1475 		return error;
1476 
1477 	/*
1478 	 * t1: Schedule removal of the mapping from the fork.  We use deferred
1479 	 * work in this function to control the exact sequence of metadata
1480 	 * updates.
1481 	 */
1482 	xfs_bmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap);
1483 	xfs_trans_mod_dquot_byino(sc->tp, rs->ip, XFS_TRANS_DQ_BCOUNT,
1484 			-(int64_t)imap->br_blockcount);
1485 	return xfs_free_extent_later(sc->tp, imap->br_startblock,
1486 			imap->br_blockcount, NULL, XFS_AG_RESV_NONE,
1487 			XFS_FREE_EXTENT_SKIP_DISCARD);
1488 }
1489 
1490 /* Compute the maximum mapcount of a file buffer. */
1491 static unsigned int
xreap_bmapi_binval_mapcount(struct xfs_scrub * sc)1492 xreap_bmapi_binval_mapcount(
1493 	struct xfs_scrub	*sc)
1494 {
1495 	/* directory blocks can span multiple fsblocks and be discontiguous */
1496 	if (sc->sm->sm_type == XFS_SCRUB_TYPE_DIR)
1497 		return sc->mp->m_dir_geo->fsbcount;
1498 
1499 	/* all other file xattr/symlink blocks must be contiguous */
1500 	return 1;
1501 }
1502 
1503 /* Compute the maximum block size of a file buffer. */
1504 static unsigned int
xreap_bmapi_binval_blocksize(struct xfs_scrub * sc)1505 xreap_bmapi_binval_blocksize(
1506 	struct xfs_scrub	*sc)
1507 {
1508 	switch (sc->sm->sm_type) {
1509 	case XFS_SCRUB_TYPE_DIR:
1510 		return sc->mp->m_dir_geo->blksize;
1511 	case XFS_SCRUB_TYPE_XATTR:
1512 	case XFS_SCRUB_TYPE_PARENT:
1513 		/*
1514 		 * The xattr structure itself consists of single fsblocks, but
1515 		 * there could be remote xattr blocks to invalidate.
1516 		 */
1517 		return XFS_XATTR_SIZE_MAX;
1518 	}
1519 
1520 	/* everything else is a single block */
1521 	return sc->mp->m_sb.sb_blocksize;
1522 }
1523 
1524 /*
1525  * Compute the maximum number of buffer invalidations that we can do while
1526  * reaping a single extent from a file fork.
1527  */
1528 STATIC void
xreap_configure_bmapi_limits(struct xreap_state * rs)1529 xreap_configure_bmapi_limits(
1530 	struct xreap_state	*rs)
1531 {
1532 	struct xfs_scrub	*sc = rs->sc;
1533 	struct xfs_mount	*mp = sc->mp;
1534 
1535 	/* overhead of invalidating a buffer */
1536 	const unsigned int	per_binval =
1537 		xfs_buf_inval_log_space(xreap_bmapi_binval_mapcount(sc),
1538 					    xreap_bmapi_binval_blocksize(sc));
1539 
1540 	/*
1541 	 * In the worst case, relogging an intent item causes both an intent
1542 	 * item and a done item to be attached to a transaction for each extent
1543 	 * that we'd like to process.
1544 	 */
1545 	const unsigned int	efi = xfs_efi_log_space(1) +
1546 				      xfs_efd_log_space(1);
1547 	const unsigned int	rui = xfs_rui_log_space(1) +
1548 				      xfs_rud_log_space();
1549 	const unsigned int	bui = xfs_bui_log_space(1) +
1550 				      xfs_bud_log_space();
1551 
1552 	/*
1553 	 * t1: Unmapping crosslinked file data blocks: one bmap deletion,
1554 	 * possibly an EFI for underfilled bmbt blocks, and an rmap deletion.
1555 	 *
1556 	 * t2: Freeing freeing file data blocks: one bmap deletion, possibly an
1557 	 * EFI for underfilled bmbt blocks, and another EFI for the space
1558 	 * itself.
1559 	 */
1560 	const unsigned int	t1 = (bui + efi) + rui;
1561 	const unsigned int	t2 = (bui + efi) + efi;
1562 	const unsigned int	per_intent = max(t1, t2);
1563 
1564 	/*
1565 	 * For each transaction in a reap chain, we must be able to take one
1566 	 * step in the defer item chain, which should only consist of CUI, EFI,
1567 	 * or RUI items.
1568 	 */
1569 	const unsigned int	f1 = xfs_calc_finish_efi_reservation(mp, 1);
1570 	const unsigned int	f2 = xfs_calc_finish_rui_reservation(mp, 1);
1571 	const unsigned int	f3 = xfs_calc_finish_bui_reservation(mp, 1);
1572 	const unsigned int	step_size = max3(f1, f2, f3);
1573 
1574 	/*
1575 	 * Each call to xreap_ifork_extent starts with a clean transaction and
1576 	 * operates on a single mapping by creating a chain of log intent items
1577 	 * for that mapping.  We need to leave enough reservation in the
1578 	 * transaction to log btree buffer and inode updates for each step in
1579 	 * the chain, and to relog the log intents.
1580 	 */
1581 	const unsigned int	per_extent_res = per_intent + step_size;
1582 
1583 	xreap_configure_limits(rs, per_extent_res, per_binval, 0, per_binval);
1584 
1585 	trace_xreap_bmapi_limits(sc->tp, per_binval, rs->max_binval,
1586 			step_size, per_intent, 1);
1587 }
1588 
1589 /*
1590  * Dispose of as much of this file extent as we can.  Upon successful return,
1591  * the imap will reflect the mapping that was removed from the fork.
1592  */
1593 STATIC int
xreap_ifork_extent(struct xreap_state * rs,struct xfs_bmbt_irec * imap)1594 xreap_ifork_extent(
1595 	struct xreap_state		*rs,
1596 	struct xfs_bmbt_irec		*imap)
1597 {
1598 	struct xfs_scrub		*sc = rs->sc;
1599 	xfs_agnumber_t			agno;
1600 	bool				crosslinked;
1601 	int				error;
1602 
1603 	ASSERT(sc->sa.pag == NULL);
1604 
1605 	trace_xreap_ifork_extent(sc, rs->ip, rs->whichfork, imap);
1606 
1607 	agno = XFS_FSB_TO_AGNO(sc->mp, imap->br_startblock);
1608 	sc->sa.pag = xfs_perag_get(sc->mp, agno);
1609 	if (!sc->sa.pag)
1610 		return -EFSCORRUPTED;
1611 
1612 	error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp);
1613 	if (error)
1614 		goto out_pag;
1615 
1616 	/*
1617 	 * Decide the fate of the blocks at the beginning of the mapping, then
1618 	 * update the mapping to use it with the unmap calls.
1619 	 */
1620 	error = xreap_bmapi_select(rs, imap, &crosslinked);
1621 	if (error)
1622 		goto out_agf;
1623 
1624 	error = xrep_reap_bmapi_iter(rs, imap, crosslinked);
1625 	if (error)
1626 		goto out_agf;
1627 
1628 out_agf:
1629 	xfs_trans_brelse(sc->tp, sc->sa.agf_bp);
1630 	sc->sa.agf_bp = NULL;
1631 out_pag:
1632 	xfs_perag_put(sc->sa.pag);
1633 	sc->sa.pag = NULL;
1634 	return error;
1635 }
1636 
1637 /*
1638  * Dispose of each block mapped to the given fork of the given file.  Callers
1639  * must hold ILOCK_EXCL, and ip can only be sc->ip or sc->tempip.  The fork
1640  * must not have any delalloc reservations.
1641  */
1642 int
xrep_reap_ifork(struct xfs_scrub * sc,struct xfs_inode * ip,int whichfork)1643 xrep_reap_ifork(
1644 	struct xfs_scrub	*sc,
1645 	struct xfs_inode	*ip,
1646 	int			whichfork)
1647 {
1648 	struct xreap_state	rs = {
1649 		.sc		= sc,
1650 		.ip		= ip,
1651 		.whichfork	= whichfork,
1652 	};
1653 	xfs_fileoff_t		off = 0;
1654 	int			bmap_flags = xfs_bmapi_aflag(whichfork);
1655 	int			error;
1656 
1657 	ASSERT(xfs_has_rmapbt(sc->mp));
1658 	ASSERT(ip == sc->ip || ip == sc->tempip);
1659 	ASSERT(whichfork == XFS_ATTR_FORK || !XFS_IS_REALTIME_INODE(ip));
1660 
1661 	xreap_configure_bmapi_limits(&rs);
1662 	while (off < XFS_MAX_FILEOFF) {
1663 		struct xfs_bmbt_irec	imap;
1664 		int			nimaps = 1;
1665 
1666 		/* Read the next extent, skip past holes and delalloc. */
1667 		error = xfs_bmapi_read(ip, off, XFS_MAX_FILEOFF - off, &imap,
1668 				&nimaps, bmap_flags);
1669 		if (error)
1670 			return error;
1671 		if (nimaps != 1 || imap.br_startblock == DELAYSTARTBLOCK) {
1672 			ASSERT(0);
1673 			return -EFSCORRUPTED;
1674 		}
1675 
1676 		/*
1677 		 * If this is a real space mapping, reap as much of it as we
1678 		 * can in a single transaction.
1679 		 */
1680 		if (xfs_bmap_is_real_extent(&imap)) {
1681 			error = xreap_ifork_extent(&rs, &imap);
1682 			if (error)
1683 				return error;
1684 
1685 			error = xfs_defer_finish(&sc->tp);
1686 			if (error)
1687 				return error;
1688 			xreap_defer_finish_reset(&rs);
1689 		}
1690 
1691 		off = imap.br_startoff + imap.br_blockcount;
1692 	}
1693 
1694 	return 0;
1695 }
1696