xref: /linux/fs/xfs/scrub/cow_repair.c (revision eb01fe7abbe2d0b38824d2a93fdb4cc3eaf2ccc1)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (C) 2022-2023 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_defer.h"
13 #include "xfs_btree.h"
14 #include "xfs_log_format.h"
15 #include "xfs_trans.h"
16 #include "xfs_inode.h"
17 #include "xfs_inode_fork.h"
18 #include "xfs_alloc.h"
19 #include "xfs_bmap.h"
20 #include "xfs_rmap.h"
21 #include "xfs_refcount.h"
22 #include "xfs_quota.h"
23 #include "xfs_ialloc.h"
24 #include "xfs_ag.h"
25 #include "xfs_error.h"
26 #include "xfs_errortag.h"
27 #include "xfs_icache.h"
28 #include "xfs_refcount_btree.h"
29 #include "scrub/xfs_scrub.h"
30 #include "scrub/scrub.h"
31 #include "scrub/common.h"
32 #include "scrub/trace.h"
33 #include "scrub/repair.h"
34 #include "scrub/bitmap.h"
35 #include "scrub/off_bitmap.h"
36 #include "scrub/fsb_bitmap.h"
37 #include "scrub/reap.h"
38 
39 /*
40  * CoW Fork Mapping Repair
41  * =======================
42  *
43  * Although CoW staging extents are owned by incore CoW inode forks, on disk
44  * they are owned by the refcount btree.  The ondisk metadata does not record
45  * any ownership information, which limits what we can do to repair the
46  * mappings in the CoW fork.  At most, we can replace ifork mappings that lack
47  * an entry in the refcount btree or are described by a reverse mapping record
48  * whose owner is not OWN_COW.
49  *
50  * Replacing extents is also tricky -- we can't touch written CoW fork extents
51  * since they are undergoing writeback, and delalloc extents do not require
52  * repair since they only exist incore.  Hence the most we can do is find the
53  * bad parts of unwritten mappings, allocate a replacement set of blocks, and
54  * replace the incore mapping.  We use the regular reaping process to unmap
55  * or free the discarded blocks, as appropriate.
56  */
/* State carried through a single CoW fork repair of one inode. */
struct xrep_cow {
	/* Scrub context that this repair runs under. */
	struct xfs_scrub	*sc;

	/*
	 * Bitmap of file offset ranges that need replacing.  Populated while
	 * scanning the CoW fork and then walked to perform the replacements.
	 */
	struct xoff_bitmap	bad_fileoffs;

	/*
	 * Bitmap of fsblocks that were removed from the CoW fork.  These are
	 * handed to the reaper once all replacements have been made.
	 */
	struct xfsb_bitmap	old_cowfork_fsblocks;

	/* CoW fork mappings used to scan for bad CoW staging extents. */
	struct xfs_bmbt_irec	irec;

	/* AG block number (refcount btree address) of irec.br_startblock */
	unsigned int		irec_startbno;

	/*
	 * AG block number of the next refcount record we expect to see while
	 * walking the CoW staging records that should cover irec.
	 */
	unsigned int		next_bno;
};
75 
/* CoW staging extent, as returned by the replacement allocator. */
struct xrep_cow_extent {
	/* Filesystem block number of the first block of the extent. */
	xfs_fsblock_t		fsbno;
	/* Length of the extent, in blocks. */
	xfs_extlen_t		len;
};
81 
82 /*
83  * Mark the part of the file range that corresponds to the given physical
84  * space.  Caller must ensure that the physical range is within xc->irec.
85  */
86 STATIC int
87 xrep_cow_mark_file_range(
88 	struct xrep_cow		*xc,
89 	xfs_fsblock_t		startblock,
90 	xfs_filblks_t		blockcount)
91 {
92 	xfs_fileoff_t		startoff;
93 
94 	startoff = xc->irec.br_startoff +
95 				(startblock - xc->irec.br_startblock);
96 
97 	trace_xrep_cow_mark_file_range(xc->sc->ip, startblock, startoff,
98 			blockcount);
99 
100 	return xoff_bitmap_set(&xc->bad_fileoffs, startoff, blockcount);
101 }
102 
103 /*
104  * Trim @src to fit within the CoW fork mapping being examined, and put the
105  * result in @dst.
106  */
107 static inline void
108 xrep_cow_trim_refcount(
109 	struct xrep_cow			*xc,
110 	struct xfs_refcount_irec	*dst,
111 	const struct xfs_refcount_irec	*src)
112 {
113 	unsigned int			adj;
114 
115 	memcpy(dst, src, sizeof(*dst));
116 
117 	if (dst->rc_startblock < xc->irec_startbno) {
118 		adj = xc->irec_startbno - dst->rc_startblock;
119 		dst->rc_blockcount -= adj;
120 		dst->rc_startblock += adj;
121 	}
122 
123 	if (dst->rc_startblock + dst->rc_blockcount >
124 	    xc->irec_startbno + xc->irec.br_blockcount) {
125 		adj = (dst->rc_startblock + dst->rc_blockcount) -
126 		      (xc->irec_startbno + xc->irec.br_blockcount);
127 		dst->rc_blockcount -= adj;
128 	}
129 }
130 
131 /* Mark any shared CoW staging extents. */
132 STATIC int
133 xrep_cow_mark_shared_staging(
134 	struct xfs_btree_cur		*cur,
135 	const struct xfs_refcount_irec	*rec,
136 	void				*priv)
137 {
138 	struct xrep_cow			*xc = priv;
139 	struct xfs_refcount_irec	rrec;
140 	xfs_fsblock_t			fsbno;
141 
142 	if (!xfs_refcount_check_domain(rec) ||
143 	    rec->rc_domain != XFS_REFC_DOMAIN_SHARED)
144 		return -EFSCORRUPTED;
145 
146 	xrep_cow_trim_refcount(xc, &rrec, rec);
147 
148 	fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno,
149 			rrec.rc_startblock);
150 	return xrep_cow_mark_file_range(xc, fsbno, rrec.rc_blockcount);
151 }
152 
153 /*
154  * Mark any portion of the CoW fork file offset range where there is not a CoW
155  * staging extent record in the refcountbt, and keep a record of where we did
156  * find correct refcountbt records.  Staging records are always cleaned out at
157  * mount time, so any two inodes trying to map the same staging area would have
158  * already taken the fs down due to refcount btree verifier errors.  Hence this
159  * inode should be the sole creator of the staging extent records ondisk.
160  */
161 STATIC int
162 xrep_cow_mark_missing_staging(
163 	struct xfs_btree_cur		*cur,
164 	const struct xfs_refcount_irec	*rec,
165 	void				*priv)
166 {
167 	struct xrep_cow			*xc = priv;
168 	struct xfs_refcount_irec	rrec;
169 	int				error;
170 
171 	if (!xfs_refcount_check_domain(rec) ||
172 	    rec->rc_domain != XFS_REFC_DOMAIN_COW)
173 		return -EFSCORRUPTED;
174 
175 	xrep_cow_trim_refcount(xc, &rrec, rec);
176 
177 	if (xc->next_bno >= rrec.rc_startblock)
178 		goto next;
179 
180 	error = xrep_cow_mark_file_range(xc,
181 			XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno,
182 				       xc->next_bno),
183 			rrec.rc_startblock - xc->next_bno);
184 	if (error)
185 		return error;
186 
187 next:
188 	xc->next_bno = rrec.rc_startblock + rrec.rc_blockcount;
189 	return 0;
190 }
191 
192 /*
193  * Mark any area that does not correspond to a CoW staging rmap.  These are
194  * cross-linked areas that must be avoided.
195  */
196 STATIC int
197 xrep_cow_mark_missing_staging_rmap(
198 	struct xfs_btree_cur		*cur,
199 	const struct xfs_rmap_irec	*rec,
200 	void				*priv)
201 {
202 	struct xrep_cow			*xc = priv;
203 	xfs_fsblock_t			fsbno;
204 	xfs_agblock_t			rec_bno;
205 	xfs_extlen_t			rec_len;
206 	unsigned int			adj;
207 
208 	if (rec->rm_owner == XFS_RMAP_OWN_COW)
209 		return 0;
210 
211 	rec_bno = rec->rm_startblock;
212 	rec_len = rec->rm_blockcount;
213 	if (rec_bno < xc->irec_startbno) {
214 		adj = xc->irec_startbno - rec_bno;
215 		rec_len -= adj;
216 		rec_bno += adj;
217 	}
218 
219 	if (rec_bno + rec_len > xc->irec_startbno + xc->irec.br_blockcount) {
220 		adj = (rec_bno + rec_len) -
221 		      (xc->irec_startbno + xc->irec.br_blockcount);
222 		rec_len -= adj;
223 	}
224 
225 	fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, rec_bno);
226 	return xrep_cow_mark_file_range(xc, fsbno, rec_len);
227 }
228 
229 /*
230  * Find any part of the CoW fork mapping that isn't a single-owner CoW staging
231  * extent and mark the corresponding part of the file range in the bitmap.
232  */
233 STATIC int
234 xrep_cow_find_bad(
235 	struct xrep_cow			*xc)
236 {
237 	struct xfs_refcount_irec	rc_low = { 0 };
238 	struct xfs_refcount_irec	rc_high = { 0 };
239 	struct xfs_rmap_irec		rm_low = { 0 };
240 	struct xfs_rmap_irec		rm_high = { 0 };
241 	struct xfs_perag		*pag;
242 	struct xfs_scrub		*sc = xc->sc;
243 	xfs_agnumber_t			agno;
244 	int				error;
245 
246 	agno = XFS_FSB_TO_AGNO(sc->mp, xc->irec.br_startblock);
247 	xc->irec_startbno = XFS_FSB_TO_AGBNO(sc->mp, xc->irec.br_startblock);
248 
249 	pag = xfs_perag_get(sc->mp, agno);
250 	if (!pag)
251 		return -EFSCORRUPTED;
252 
253 	error = xrep_ag_init(sc, pag, &sc->sa);
254 	if (error)
255 		goto out_pag;
256 
257 	/* Mark any CoW fork extents that are shared. */
258 	rc_low.rc_startblock = xc->irec_startbno;
259 	rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
260 	rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_SHARED;
261 	error = xfs_refcount_query_range(sc->sa.refc_cur, &rc_low, &rc_high,
262 			xrep_cow_mark_shared_staging, xc);
263 	if (error)
264 		goto out_sa;
265 
266 	/* Make sure there are CoW staging extents for the whole mapping. */
267 	rc_low.rc_startblock = xc->irec_startbno;
268 	rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
269 	rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_COW;
270 	xc->next_bno = xc->irec_startbno;
271 	error = xfs_refcount_query_range(sc->sa.refc_cur, &rc_low, &rc_high,
272 			xrep_cow_mark_missing_staging, xc);
273 	if (error)
274 		goto out_sa;
275 
276 	if (xc->next_bno < xc->irec_startbno + xc->irec.br_blockcount) {
277 		error = xrep_cow_mark_file_range(xc,
278 				XFS_AGB_TO_FSB(sc->mp, pag->pag_agno,
279 					       xc->next_bno),
280 				xc->irec_startbno + xc->irec.br_blockcount -
281 				xc->next_bno);
282 		if (error)
283 			goto out_sa;
284 	}
285 
286 	/* Mark any area has an rmap that isn't a COW staging extent. */
287 	rm_low.rm_startblock = xc->irec_startbno;
288 	memset(&rm_high, 0xFF, sizeof(rm_high));
289 	rm_high.rm_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
290 	error = xfs_rmap_query_range(sc->sa.rmap_cur, &rm_low, &rm_high,
291 			xrep_cow_mark_missing_staging_rmap, xc);
292 	if (error)
293 		goto out_sa;
294 
295 	/*
296 	 * If userspace is forcing us to rebuild the CoW fork or someone turned
297 	 * on the debugging knob, replace everything in the CoW fork.
298 	 */
299 	if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) ||
300 	    XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) {
301 		error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock,
302 				xc->irec.br_blockcount);
303 		if (error)
304 			return error;
305 	}
306 
307 out_sa:
308 	xchk_ag_free(sc, &sc->sa);
309 out_pag:
310 	xfs_perag_put(pag);
311 	return 0;
312 }
313 
314 /*
315  * Allocate a replacement CoW staging extent of up to the given number of
316  * blocks, and fill out the mapping.
317  */
318 STATIC int
319 xrep_cow_alloc(
320 	struct xfs_scrub	*sc,
321 	xfs_extlen_t		maxlen,
322 	struct xrep_cow_extent	*repl)
323 {
324 	struct xfs_alloc_arg	args = {
325 		.tp		= sc->tp,
326 		.mp		= sc->mp,
327 		.oinfo		= XFS_RMAP_OINFO_SKIP_UPDATE,
328 		.minlen		= 1,
329 		.maxlen		= maxlen,
330 		.prod		= 1,
331 		.resv		= XFS_AG_RESV_NONE,
332 		.datatype	= XFS_ALLOC_USERDATA,
333 	};
334 	int			error;
335 
336 	error = xfs_trans_reserve_more(sc->tp, maxlen, 0);
337 	if (error)
338 		return error;
339 
340 	error = xfs_alloc_vextent_start_ag(&args,
341 			XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino));
342 	if (error)
343 		return error;
344 	if (args.fsbno == NULLFSBLOCK)
345 		return -ENOSPC;
346 
347 	xfs_refcount_alloc_cow_extent(sc->tp, args.fsbno, args.len);
348 
349 	repl->fsbno = args.fsbno;
350 	repl->len = args.len;
351 	return 0;
352 }
353 
354 /*
355  * Look up the current CoW fork mapping so that we only allocate enough to
356  * replace a single mapping.  If we don't find a mapping that covers the start
357  * of the file range, or we find a delalloc or written extent, something is
358  * seriously wrong, since we didn't drop the ILOCK.
359  */
360 static inline int
361 xrep_cow_find_mapping(
362 	struct xrep_cow		*xc,
363 	struct xfs_iext_cursor	*icur,
364 	xfs_fileoff_t		startoff,
365 	struct xfs_bmbt_irec	*got)
366 {
367 	struct xfs_inode	*ip = xc->sc->ip;
368 	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
369 
370 	if (!xfs_iext_lookup_extent(ip, ifp, startoff, icur, got))
371 		goto bad;
372 
373 	if (got->br_startoff > startoff)
374 		goto bad;
375 
376 	if (got->br_blockcount == 0)
377 		goto bad;
378 
379 	if (isnullstartblock(got->br_startblock))
380 		goto bad;
381 
382 	if (xfs_bmap_is_written_extent(got))
383 		goto bad;
384 
385 	return 0;
386 bad:
387 	ASSERT(0);
388 	return -EFSCORRUPTED;
389 }
390 
/*
 * Bit flags, presumably naming which end of a mapping is being replaced.
 * NOTE(review): neither flag is referenced in the visible part of this file;
 * confirm whether they are still needed.
 */
#define REPLACE_LEFT_SIDE	(1U << 0)
#define REPLACE_RIGHT_SIDE	(1U << 1)
393 
394 /*
395  * Given a CoW fork mapping @got and a replacement mapping @repl, remap the
396  * beginning of @got with the space described by @rep.
397  */
398 static inline void
399 xrep_cow_replace_mapping(
400 	struct xfs_inode		*ip,
401 	struct xfs_iext_cursor		*icur,
402 	const struct xfs_bmbt_irec	*got,
403 	const struct xrep_cow_extent	*repl)
404 {
405 	struct xfs_bmbt_irec		new = *got; /* struct copy */
406 
407 	ASSERT(repl->len > 0);
408 	ASSERT(!isnullstartblock(got->br_startblock));
409 
410 	trace_xrep_cow_replace_mapping(ip, got, repl->fsbno, repl->len);
411 
412 	if (got->br_blockcount == repl->len) {
413 		/*
414 		 * The new extent is a complete replacement for the existing
415 		 * extent.  Update the COW fork record.
416 		 */
417 		new.br_startblock = repl->fsbno;
418 		xfs_iext_update_extent(ip, BMAP_COWFORK, icur, &new);
419 		return;
420 	}
421 
422 	/*
423 	 * The new extent can replace the beginning of the COW fork record.
424 	 * Move the left side of @got upwards, then insert the new record.
425 	 */
426 	new.br_startoff += repl->len;
427 	new.br_startblock += repl->len;
428 	new.br_blockcount -= repl->len;
429 	xfs_iext_update_extent(ip, BMAP_COWFORK, icur, &new);
430 
431 	new.br_startoff = got->br_startoff;
432 	new.br_startblock = repl->fsbno;
433 	new.br_blockcount = repl->len;
434 	xfs_iext_insert(ip, icur, &new, BMAP_COWFORK);
435 }
436 
/*
 * Replace the unwritten CoW staging extent backing the given file range with a
 * new space extent that isn't as problematic.
 *
 * At most one CoW fork mapping is handled per call.  On entry, *@blockcount
 * is the number of blocks the caller would like replaced starting at
 * @startoff; on successful return it is the number of blocks that were
 * actually replaced, which may be smaller.  Returns 0 or a negative errno.
 */
STATIC int
xrep_cow_replace_range(
	struct xrep_cow		*xc,
	xfs_fileoff_t		startoff,
	xfs_extlen_t		*blockcount)
{
	struct xfs_iext_cursor	icur;
	struct xrep_cow_extent	repl;
	struct xfs_bmbt_irec	got;
	struct xfs_scrub	*sc = xc->sc;
	xfs_fileoff_t		nextoff;
	xfs_extlen_t		alloc_len;
	int			error;

	/*
	 * Put the existing CoW fork mapping in @got.  If @got ends before the
	 * requested range does, clamp the range so that we only replace one
	 * extent mapping at a time.
	 */
	error = xrep_cow_find_mapping(xc, &icur, startoff, &got);
	if (error)
		return error;
	nextoff = min(startoff + *blockcount,
		      got.br_startoff + got.br_blockcount);

	/*
	 * Allocate a replacement extent.  If we don't fill all the blocks,
	 * shorten the quantity that will be deleted in this step.
	 */
	alloc_len = min_t(xfs_fileoff_t, XFS_MAX_BMBT_EXTLEN,
			  nextoff - startoff);
	error = xrep_cow_alloc(sc, alloc_len, &repl);
	if (error)
		return error;

	/*
	 * Replace the old mapping with the new one, and commit the metadata
	 * changes made so far.
	 *
	 * NOTE(review): xrep_cow_replace_mapping() remaps the first repl.len
	 * blocks of @got, and the bitmap update below likewise starts at
	 * got.br_startblock, whereas xrep_cow_find_mapping() also accepts a
	 * @got that begins before @startoff.  Confirm that callers can never
	 * pass a @startoff in the middle of a mapping.
	 */
	xrep_cow_replace_mapping(sc->ip, &icur, &got, &repl);

	xfs_inode_set_cowblocks_tag(sc->ip);
	error = xfs_defer_finish(&sc->tp);
	if (error)
		return error;

	/* Note the old CoW staging extents; we'll reap them all later. */
	error = xfsb_bitmap_set(&xc->old_cowfork_fsblocks, got.br_startblock,
			repl.len);
	if (error)
		return error;

	/* Tell the caller how far we actually got. */
	*blockcount = repl.len;
	return 0;
}
495 
496 /*
497  * Replace a bad part of an unwritten CoW staging extent with a fresh delalloc
498  * reservation.
499  */
500 STATIC int
501 xrep_cow_replace(
502 	uint64_t		startoff,
503 	uint64_t		blockcount,
504 	void			*priv)
505 {
506 	struct xrep_cow		*xc = priv;
507 	int			error = 0;
508 
509 	while (blockcount > 0) {
510 		xfs_extlen_t	len = min_t(xfs_filblks_t, blockcount,
511 					    XFS_MAX_BMBT_EXTLEN);
512 
513 		error = xrep_cow_replace_range(xc, startoff, &len);
514 		if (error)
515 			break;
516 
517 		blockcount -= len;
518 		startoff += len;
519 	}
520 
521 	return error;
522 }
523 
524 /*
525  * Repair an inode's CoW fork.  The CoW fork is an in-core structure, so
526  * there's no btree to rebuid.  Instead, we replace any mappings that are
527  * cross-linked or lack ondisk CoW fork records in the refcount btree.
528  */
529 int
530 xrep_bmap_cow(
531 	struct xfs_scrub	*sc)
532 {
533 	struct xrep_cow		*xc;
534 	struct xfs_iext_cursor	icur;
535 	struct xfs_ifork	*ifp = xfs_ifork_ptr(sc->ip, XFS_COW_FORK);
536 	int			error;
537 
538 	if (!xfs_has_rmapbt(sc->mp) || !xfs_has_reflink(sc->mp))
539 		return -EOPNOTSUPP;
540 
541 	if (!ifp)
542 		return 0;
543 
544 	/* realtime files aren't supported yet */
545 	if (XFS_IS_REALTIME_INODE(sc->ip))
546 		return -EOPNOTSUPP;
547 
548 	/*
549 	 * If we're somehow not in extents format, then reinitialize it to
550 	 * an empty extent mapping fork and exit.
551 	 */
552 	if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) {
553 		ifp->if_format = XFS_DINODE_FMT_EXTENTS;
554 		ifp->if_nextents = 0;
555 		return 0;
556 	}
557 
558 	xc = kzalloc(sizeof(struct xrep_cow), XCHK_GFP_FLAGS);
559 	if (!xc)
560 		return -ENOMEM;
561 
562 	xfs_trans_ijoin(sc->tp, sc->ip, 0);
563 
564 	xc->sc = sc;
565 	xoff_bitmap_init(&xc->bad_fileoffs);
566 	xfsb_bitmap_init(&xc->old_cowfork_fsblocks);
567 
568 	for_each_xfs_iext(ifp, &icur, &xc->irec) {
569 		if (xchk_should_terminate(sc, &error))
570 			goto out_bitmap;
571 
572 		/*
573 		 * delalloc reservations only exist incore, so there is no
574 		 * ondisk metadata that we can examine.  Hence we leave them
575 		 * alone.
576 		 */
577 		if (isnullstartblock(xc->irec.br_startblock))
578 			continue;
579 
580 		/*
581 		 * COW fork extents are only in the written state if writeback
582 		 * is actively writing to disk.  We cannot restart the write
583 		 * at a different disk address since we've already issued the
584 		 * IO, so we leave these alone and hope for the best.
585 		 */
586 		if (xfs_bmap_is_written_extent(&xc->irec))
587 			continue;
588 
589 		error = xrep_cow_find_bad(xc);
590 		if (error)
591 			goto out_bitmap;
592 	}
593 
594 	/* Replace any bad unwritten mappings with fresh reservations. */
595 	error = xoff_bitmap_walk(&xc->bad_fileoffs, xrep_cow_replace, xc);
596 	if (error)
597 		goto out_bitmap;
598 
599 	/*
600 	 * Reap as many of the old CoW blocks as we can.  They are owned ondisk
601 	 * by the refcount btree, not the inode, so it is correct to treat them
602 	 * like inode metadata.
603 	 */
604 	error = xrep_reap_fsblocks(sc, &xc->old_cowfork_fsblocks,
605 			&XFS_RMAP_OINFO_COW);
606 	if (error)
607 		goto out_bitmap;
608 
609 out_bitmap:
610 	xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks);
611 	xoff_bitmap_destroy(&xc->bad_fileoffs);
612 	kfree(xc);
613 	return error;
614 }
615