xref: /linux/fs/xfs/scrub/reap.c (revision 9c93c0b44be36fd5267fb79ae33453f989fbe909)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (C) 2022-2023 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_btree.h"
13 #include "xfs_log_format.h"
14 #include "xfs_trans.h"
15 #include "xfs_sb.h"
16 #include "xfs_inode.h"
17 #include "xfs_alloc.h"
18 #include "xfs_alloc_btree.h"
19 #include "xfs_ialloc.h"
20 #include "xfs_ialloc_btree.h"
21 #include "xfs_rmap.h"
22 #include "xfs_rmap_btree.h"
23 #include "xfs_refcount.h"
24 #include "xfs_refcount_btree.h"
25 #include "xfs_extent_busy.h"
26 #include "xfs_ag.h"
27 #include "xfs_ag_resv.h"
28 #include "xfs_quota.h"
29 #include "xfs_qm.h"
30 #include "xfs_bmap.h"
31 #include "xfs_da_format.h"
32 #include "xfs_da_btree.h"
33 #include "xfs_attr.h"
34 #include "xfs_attr_remote.h"
35 #include "xfs_defer.h"
36 #include "scrub/scrub.h"
37 #include "scrub/common.h"
38 #include "scrub/trace.h"
39 #include "scrub/repair.h"
40 #include "scrub/bitmap.h"
41 #include "scrub/agb_bitmap.h"
42 #include "scrub/fsb_bitmap.h"
43 #include "scrub/reap.h"
44 
45 /*
46  * Disposal of Blocks from Old Metadata
47  *
48  * Now that we've constructed a new btree to replace the damaged one, we want
49  * to dispose of the blocks that (we think) the old btree was using.
50  * Previously, we used the rmapbt to collect the extents (bitmap) with the
51  * rmap owner corresponding to the tree we rebuilt, collected extents for any
52  * blocks with the same rmap owner that are owned by another data structure
53  * (sublist), and subtracted sublist from bitmap.  In theory the extents
54  * remaining in bitmap are the old btree's blocks.
55  *
56  * Unfortunately, it's possible that the btree was crosslinked with other
57  * blocks on disk.  The rmap data can tell us if there are multiple owners, so
58  * if the rmapbt says there is an owner of this block other than @oinfo, then
59  * the block is crosslinked.  Remove the reverse mapping and continue.
60  *
61  * If there is one rmap record, we can free the block, which removes the
62  * reverse mapping but doesn't add the block to the free space.  Our repair
63  * strategy is to hope the other metadata objects crosslinked on this block
64  * will be rebuilt (atop different blocks), thereby removing all the cross
65  * links.
66  *
67  * If there are no rmap records at all, we also free the block.  If the btree
68  * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't
69  * supposed to be a rmap record and everything is ok.  For other btrees there
70  * had to have been an rmap entry for the block to have ended up on @bitmap,
71  * so if it's gone now there's something wrong and the fs will shut down.
72  *
73  * Note: If there are multiple rmap records with only the same rmap owner as
74  * the btree we're trying to rebuild and the block is indeed owned by another
75  * data structure with the same rmap owner, then the block will be in sublist
76  * and therefore doesn't need disposal.  If there are multiple rmap records
77  * with only the same rmap owner but the block is not owned by something with
78  * the same rmap owner, the block will be freed.
79  *
80  * The caller is responsible for locking the AG headers/inode for the entire
81  * rebuild operation so that nothing else can sneak in and change the incore
82  * state while we're not looking.  We must also invalidate any buffers
83  * associated with @bitmap.
84  */
85 
86 /* Information about reaping extents after a repair. */
87 struct xreap_state {
88 	struct xfs_scrub		*sc;
89 
90 	/* Reverse mapping owner and metadata reservation type. */
91 	const struct xfs_owner_info	*oinfo;
92 	enum xfs_ag_resv_type		resv;
93 
94 	/* If true, roll the transaction before reaping the next extent. */
95 	bool				force_roll;
96 
97 	/* Number of deferred reaps attached to the current transaction. */
98 	unsigned int			deferred;
99 
100 	/* Number of invalidated buffers logged to the current transaction. */
101 	unsigned int			invalidated;
102 
103 	/* Number of deferred reaps queued during the whole reap sequence. */
104 	unsigned long long		total_deferred;
105 };
106 
107 /* Put a block back on the AGFL. */
108 STATIC int
109 xreap_put_freelist(
110 	struct xfs_scrub	*sc,
111 	xfs_agblock_t		agbno)
112 {
113 	struct xfs_buf		*agfl_bp;
114 	int			error;
115 
116 	/* Make sure there's space on the freelist. */
117 	error = xrep_fix_freelist(sc, 0);
118 	if (error)
119 		return error;
120 
121 	/*
122 	 * Since we're "freeing" a lost block onto the AGFL, we have to
123 	 * create an rmap for the block prior to merging it or else other
124 	 * parts will break.
125 	 */
126 	error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1,
127 			&XFS_RMAP_OINFO_AG);
128 	if (error)
129 		return error;
130 
131 	/* Put the block on the AGFL. */
132 	error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
133 	if (error)
134 		return error;
135 
136 	error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp,
137 			agfl_bp, agbno, 0);
138 	if (error)
139 		return error;
140 	xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1,
141 			XFS_EXTENT_BUSY_SKIP_DISCARD);
142 
143 	return 0;
144 }
145 
146 /* Are there any uncommitted reap operations? */
147 static inline bool xreap_dirty(const struct xreap_state *rs)
148 {
149 	if (rs->force_roll)
150 		return true;
151 	if (rs->deferred)
152 		return true;
153 	if (rs->invalidated)
154 		return true;
155 	if (rs->total_deferred)
156 		return true;
157 	return false;
158 }
159 
160 #define XREAP_MAX_BINVAL	(2048)
161 
162 /*
163  * Decide if we want to roll the transaction after reaping an extent.  We don't
164  * want to overrun the transaction reservation, so we prohibit more than
165  * 128 EFIs per transaction.  For the same reason, we limit the number
166  * of buffer invalidations to 2048.
167  */
168 static inline bool xreap_want_roll(const struct xreap_state *rs)
169 {
170 	if (rs->force_roll)
171 		return true;
172 	if (rs->deferred > XREP_MAX_ITRUNCATE_EFIS)
173 		return true;
174 	if (rs->invalidated > XREAP_MAX_BINVAL)
175 		return true;
176 	return false;
177 }
178 
179 static inline void xreap_reset(struct xreap_state *rs)
180 {
181 	rs->total_deferred += rs->deferred;
182 	rs->deferred = 0;
183 	rs->invalidated = 0;
184 	rs->force_roll = false;
185 }
186 
187 #define XREAP_MAX_DEFER_CHAIN		(2048)
188 
189 /*
190  * Decide if we want to finish the deferred ops that are attached to the scrub
191  * transaction.  We don't want to queue huge chains of deferred ops because
192  * that can consume a lot of log space and kernel memory.  Hence we trigger a
193  * xfs_defer_finish if there are more than 2048 deferred reap operations or the
194  * caller did some real work.
195  */
196 static inline bool
197 xreap_want_defer_finish(const struct xreap_state *rs)
198 {
199 	if (rs->force_roll)
200 		return true;
201 	if (rs->total_deferred > XREAP_MAX_DEFER_CHAIN)
202 		return true;
203 	return false;
204 }
205 
206 static inline void xreap_defer_finish_reset(struct xreap_state *rs)
207 {
208 	rs->total_deferred = 0;
209 	rs->deferred = 0;
210 	rs->invalidated = 0;
211 	rs->force_roll = false;
212 }
213 
214 /* Try to invalidate the incore buffers for an extent that we're freeing. */
215 STATIC void
216 xreap_agextent_binval(
217 	struct xreap_state	*rs,
218 	xfs_agblock_t		agbno,
219 	xfs_extlen_t		*aglenp)
220 {
221 	struct xfs_scrub	*sc = rs->sc;
222 	struct xfs_perag	*pag = sc->sa.pag;
223 	struct xfs_mount	*mp = sc->mp;
224 	xfs_agnumber_t		agno = sc->sa.pag->pag_agno;
225 	xfs_agblock_t		agbno_next = agbno + *aglenp;
226 	xfs_agblock_t		bno = agbno;
227 
228 	/*
229 	 * Avoid invalidating AG headers and post-EOFS blocks because we never
230 	 * own those.
231 	 */
232 	if (!xfs_verify_agbno(pag, agbno) ||
233 	    !xfs_verify_agbno(pag, agbno_next - 1))
234 		return;
235 
236 	/*
237 	 * If there are incore buffers for these blocks, invalidate them.  We
238 	 * assume that the lack of any other known owners means that the buffer
239 	 * can be locked without risk of deadlocking.  The buffer cache cannot
240 	 * detect aliasing, so employ nested loops to scan for incore buffers
241 	 * of any plausible size.
242 	 */
243 	while (bno < agbno_next) {
244 		xfs_agblock_t	fsbcount;
245 		xfs_agblock_t	max_fsbs;
246 
247 		/*
248 		 * Max buffer size is the max remote xattr buffer size, which
249 		 * is one fs block larger than 64k.
250 		 */
251 		max_fsbs = min_t(xfs_agblock_t, agbno_next - bno,
252 				xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX));
253 
254 		for (fsbcount = 1; fsbcount <= max_fsbs; fsbcount++) {
255 			struct xfs_buf	*bp = NULL;
256 			xfs_daddr_t	daddr;
257 			int		error;
258 
259 			daddr = XFS_AGB_TO_DADDR(mp, agno, bno);
260 			error = xfs_buf_incore(mp->m_ddev_targp, daddr,
261 					XFS_FSB_TO_BB(mp, fsbcount),
262 					XBF_LIVESCAN, &bp);
263 			if (error)
264 				continue;
265 
266 			xfs_trans_bjoin(sc->tp, bp);
267 			xfs_trans_binval(sc->tp, bp);
268 			rs->invalidated++;
269 
270 			/*
271 			 * Stop invalidating if we've hit the limit; we should
272 			 * still have enough reservation left to free however
273 			 * far we've gotten.
274 			 */
275 			if (rs->invalidated > XREAP_MAX_BINVAL) {
276 				*aglenp -= agbno_next - bno;
277 				goto out;
278 			}
279 		}
280 
281 		bno++;
282 	}
283 
284 out:
285 	trace_xreap_agextent_binval(sc->sa.pag, agbno, *aglenp);
286 }
287 
288 /*
289  * Figure out the longest run of blocks that we can dispose of with a single
290  * call.  Cross-linked blocks should have their reverse mappings removed, but
291  * single-owner extents can be freed.  AGFL blocks can only be put back one at
292  * a time.
293  */
294 STATIC int
295 xreap_agextent_select(
296 	struct xreap_state	*rs,
297 	xfs_agblock_t		agbno,
298 	xfs_agblock_t		agbno_next,
299 	bool			*crosslinked,
300 	xfs_extlen_t		*aglenp)
301 {
302 	struct xfs_scrub	*sc = rs->sc;
303 	struct xfs_btree_cur	*cur;
304 	xfs_agblock_t		bno = agbno + 1;
305 	xfs_extlen_t		len = 1;
306 	int			error;
307 
308 	/*
309 	 * Determine if there are any other rmap records covering the first
310 	 * block of this extent.  If so, the block is crosslinked.
311 	 */
312 	cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
313 			sc->sa.pag);
314 	error = xfs_rmap_has_other_keys(cur, agbno, 1, rs->oinfo,
315 			crosslinked);
316 	if (error)
317 		goto out_cur;
318 
319 	/* AGFL blocks can only be deal with one at a time. */
320 	if (rs->resv == XFS_AG_RESV_AGFL)
321 		goto out_found;
322 
323 	/*
324 	 * Figure out how many of the subsequent blocks have the same crosslink
325 	 * status.
326 	 */
327 	while (bno < agbno_next) {
328 		bool		also_crosslinked;
329 
330 		error = xfs_rmap_has_other_keys(cur, bno, 1, rs->oinfo,
331 				&also_crosslinked);
332 		if (error)
333 			goto out_cur;
334 
335 		if (*crosslinked != also_crosslinked)
336 			break;
337 
338 		len++;
339 		bno++;
340 	}
341 
342 out_found:
343 	*aglenp = len;
344 	trace_xreap_agextent_select(sc->sa.pag, agbno, len, *crosslinked);
345 out_cur:
346 	xfs_btree_del_cursor(cur, error);
347 	return error;
348 }
349 
350 /*
351  * Dispose of as much of the beginning of this AG extent as possible.  The
352  * number of blocks disposed of will be returned in @aglenp.
353  */
354 STATIC int
355 xreap_agextent_iter(
356 	struct xreap_state	*rs,
357 	xfs_agblock_t		agbno,
358 	xfs_extlen_t		*aglenp,
359 	bool			crosslinked)
360 {
361 	struct xfs_scrub	*sc = rs->sc;
362 	xfs_fsblock_t		fsbno;
363 	int			error = 0;
364 
365 	fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, agbno);
366 
367 	/*
368 	 * If there are other rmappings, this block is cross linked and must
369 	 * not be freed.  Remove the reverse mapping and move on.  Otherwise,
370 	 * we were the only owner of the block, so free the extent, which will
371 	 * also remove the rmap.
372 	 *
373 	 * XXX: XFS doesn't support detecting the case where a single block
374 	 * metadata structure is crosslinked with a multi-block structure
375 	 * because the buffer cache doesn't detect aliasing problems, so we
376 	 * can't fix 100% of crosslinking problems (yet).  The verifiers will
377 	 * blow on writeout, the filesystem will shut down, and the admin gets
378 	 * to run xfs_repair.
379 	 */
380 	if (crosslinked) {
381 		trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp);
382 
383 		rs->force_roll = true;
384 
385 		if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
386 			/*
387 			 * If we're unmapping CoW staging extents, remove the
388 			 * records from the refcountbt, which will remove the
389 			 * rmap record as well.
390 			 */
391 			xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
392 			return 0;
393 		}
394 
395 		return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno,
396 				*aglenp, rs->oinfo);
397 	}
398 
399 	trace_xreap_dispose_free_extent(sc->sa.pag, agbno, *aglenp);
400 
401 	/*
402 	 * Invalidate as many buffers as we can, starting at agbno.  If this
403 	 * function sets *aglenp to zero, the transaction is full of logged
404 	 * buffer invalidations, so we need to return early so that we can
405 	 * roll and retry.
406 	 */
407 	xreap_agextent_binval(rs, agbno, aglenp);
408 	if (*aglenp == 0) {
409 		ASSERT(xreap_want_roll(rs));
410 		return 0;
411 	}
412 
413 	/*
414 	 * If we're getting rid of CoW staging extents, use deferred work items
415 	 * to remove the refcountbt records (which removes the rmap records)
416 	 * and free the extent.  We're not worried about the system going down
417 	 * here because log recovery walks the refcount btree to clean out the
418 	 * CoW staging extents.
419 	 */
420 	if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
421 		ASSERT(rs->resv == XFS_AG_RESV_NONE);
422 
423 		xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
424 		error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, NULL,
425 				rs->resv, true);
426 		if (error)
427 			return error;
428 
429 		rs->force_roll = true;
430 		return 0;
431 	}
432 
433 	/* Put blocks back on the AGFL one at a time. */
434 	if (rs->resv == XFS_AG_RESV_AGFL) {
435 		ASSERT(*aglenp == 1);
436 		error = xreap_put_freelist(sc, agbno);
437 		if (error)
438 			return error;
439 
440 		rs->force_roll = true;
441 		return 0;
442 	}
443 
444 	/*
445 	 * Use deferred frees to get rid of the old btree blocks to try to
446 	 * minimize the window in which we could crash and lose the old blocks.
447 	 * Add a defer ops barrier every other extent to avoid stressing the
448 	 * system with large EFIs.
449 	 */
450 	error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo,
451 			rs->resv, true);
452 	if (error)
453 		return error;
454 
455 	rs->deferred++;
456 	if (rs->deferred % 2 == 0)
457 		xfs_defer_add_barrier(sc->tp);
458 	return 0;
459 }
460 
461 /*
462  * Break an AG metadata extent into sub-extents by fate (crosslinked, not
463  * crosslinked), and dispose of each sub-extent separately.
464  */
465 STATIC int
466 xreap_agmeta_extent(
467 	uint32_t		agbno,
468 	uint32_t		len,
469 	void			*priv)
470 {
471 	struct xreap_state	*rs = priv;
472 	struct xfs_scrub	*sc = rs->sc;
473 	xfs_agblock_t		agbno_next = agbno + len;
474 	int			error = 0;
475 
476 	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
477 	ASSERT(sc->ip == NULL);
478 
479 	while (agbno < agbno_next) {
480 		xfs_extlen_t	aglen;
481 		bool		crosslinked;
482 
483 		error = xreap_agextent_select(rs, agbno, agbno_next,
484 				&crosslinked, &aglen);
485 		if (error)
486 			return error;
487 
488 		error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked);
489 		if (error)
490 			return error;
491 
492 		if (xreap_want_defer_finish(rs)) {
493 			error = xrep_defer_finish(sc);
494 			if (error)
495 				return error;
496 			xreap_defer_finish_reset(rs);
497 		} else if (xreap_want_roll(rs)) {
498 			error = xrep_roll_ag_trans(sc);
499 			if (error)
500 				return error;
501 			xreap_reset(rs);
502 		}
503 
504 		agbno += aglen;
505 	}
506 
507 	return 0;
508 }
509 
510 /* Dispose of every block of every AG metadata extent in the bitmap. */
511 int
512 xrep_reap_agblocks(
513 	struct xfs_scrub		*sc,
514 	struct xagb_bitmap		*bitmap,
515 	const struct xfs_owner_info	*oinfo,
516 	enum xfs_ag_resv_type		type)
517 {
518 	struct xreap_state		rs = {
519 		.sc			= sc,
520 		.oinfo			= oinfo,
521 		.resv			= type,
522 	};
523 	int				error;
524 
525 	ASSERT(xfs_has_rmapbt(sc->mp));
526 	ASSERT(sc->ip == NULL);
527 
528 	error = xagb_bitmap_walk(bitmap, xreap_agmeta_extent, &rs);
529 	if (error)
530 		return error;
531 
532 	if (xreap_dirty(&rs))
533 		return xrep_defer_finish(sc);
534 
535 	return 0;
536 }
537 
538 /*
539  * Break a file metadata extent into sub-extents by fate (crosslinked, not
540  * crosslinked), and dispose of each sub-extent separately.  The extent must
541  * not cross an AG boundary.
542  */
543 STATIC int
544 xreap_fsmeta_extent(
545 	uint64_t		fsbno,
546 	uint64_t		len,
547 	void			*priv)
548 {
549 	struct xreap_state	*rs = priv;
550 	struct xfs_scrub	*sc = rs->sc;
551 	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
552 	xfs_agblock_t		agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
553 	xfs_agblock_t		agbno_next = agbno + len;
554 	int			error = 0;
555 
556 	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
557 	ASSERT(sc->ip != NULL);
558 	ASSERT(!sc->sa.pag);
559 
560 	/*
561 	 * We're reaping blocks after repairing file metadata, which means that
562 	 * we have to init the xchk_ag structure ourselves.
563 	 */
564 	sc->sa.pag = xfs_perag_get(sc->mp, agno);
565 	if (!sc->sa.pag)
566 		return -EFSCORRUPTED;
567 
568 	error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp);
569 	if (error)
570 		goto out_pag;
571 
572 	while (agbno < agbno_next) {
573 		xfs_extlen_t	aglen;
574 		bool		crosslinked;
575 
576 		error = xreap_agextent_select(rs, agbno, agbno_next,
577 				&crosslinked, &aglen);
578 		if (error)
579 			goto out_agf;
580 
581 		error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked);
582 		if (error)
583 			goto out_agf;
584 
585 		if (xreap_want_defer_finish(rs)) {
586 			/*
587 			 * Holds the AGF buffer across the deferred chain
588 			 * processing.
589 			 */
590 			error = xrep_defer_finish(sc);
591 			if (error)
592 				goto out_agf;
593 			xreap_defer_finish_reset(rs);
594 		} else if (xreap_want_roll(rs)) {
595 			/*
596 			 * Hold the AGF buffer across the transaction roll so
597 			 * that we don't have to reattach it to the scrub
598 			 * context.
599 			 */
600 			xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
601 			error = xfs_trans_roll_inode(&sc->tp, sc->ip);
602 			xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
603 			if (error)
604 				goto out_agf;
605 			xreap_reset(rs);
606 		}
607 
608 		agbno += aglen;
609 	}
610 
611 out_agf:
612 	xfs_trans_brelse(sc->tp, sc->sa.agf_bp);
613 	sc->sa.agf_bp = NULL;
614 out_pag:
615 	xfs_perag_put(sc->sa.pag);
616 	sc->sa.pag = NULL;
617 	return error;
618 }
619 
620 /*
621  * Dispose of every block of every fs metadata extent in the bitmap.
622  * Do not use this to dispose of the mappings in an ondisk inode fork.
623  */
624 int
625 xrep_reap_fsblocks(
626 	struct xfs_scrub		*sc,
627 	struct xfsb_bitmap		*bitmap,
628 	const struct xfs_owner_info	*oinfo)
629 {
630 	struct xreap_state		rs = {
631 		.sc			= sc,
632 		.oinfo			= oinfo,
633 		.resv			= XFS_AG_RESV_NONE,
634 	};
635 	int				error;
636 
637 	ASSERT(xfs_has_rmapbt(sc->mp));
638 	ASSERT(sc->ip != NULL);
639 
640 	error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs);
641 	if (error)
642 		return error;
643 
644 	if (xreap_dirty(&rs))
645 		return xrep_defer_finish(sc);
646 
647 	return 0;
648 }
649