xref: /linux/fs/xfs/scrub/tempfile.c (revision c8b90d40d5bba8e6fba457b8a7c10d3c0d467e37)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (c) 2021-2024 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_log_format.h"
13 #include "xfs_trans.h"
14 #include "xfs_inode.h"
15 #include "xfs_ialloc.h"
16 #include "xfs_quota.h"
17 #include "xfs_bmap.h"
18 #include "xfs_bmap_btree.h"
19 #include "xfs_trans_space.h"
20 #include "xfs_dir2.h"
21 #include "xfs_exchrange.h"
22 #include "xfs_exchmaps.h"
23 #include "xfs_defer.h"
24 #include "xfs_symlink_remote.h"
25 #include "xfs_metafile.h"
26 #include "scrub/scrub.h"
27 #include "scrub/common.h"
28 #include "scrub/repair.h"
29 #include "scrub/trace.h"
30 #include "scrub/tempfile.h"
31 #include "scrub/tempexch.h"
32 #include "scrub/xfile.h"
33 
34 /*
35  * Create a temporary file for reconstructing metadata, with the intention of
36  * atomically exchanging the temporary file's contents with the file that's
37  * being repaired.
38  */
39 int
40 xrep_tempfile_create(
41 	struct xfs_scrub	*sc,
42 	uint16_t		mode)
43 {
44 	struct xfs_icreate_args	args = {
45 		.pip		= sc->mp->m_rootip,
46 		.mode		= mode,
47 		.flags		= XFS_ICREATE_TMPFILE | XFS_ICREATE_UNLINKABLE,
48 	};
49 	struct xfs_mount	*mp = sc->mp;
50 	struct xfs_trans	*tp = NULL;
51 	struct xfs_dquot	*udqp;
52 	struct xfs_dquot	*gdqp;
53 	struct xfs_dquot	*pdqp;
54 	struct xfs_trans_res	*tres;
55 	struct xfs_inode	*dp = mp->m_rootip;
56 	xfs_ino_t		ino;
57 	unsigned int		resblks;
58 	bool			is_dir = S_ISDIR(mode);
59 	int			error;
60 
61 	if (xfs_is_shutdown(mp))
62 		return -EIO;
63 	if (xfs_is_readonly(mp))
64 		return -EROFS;
65 
66 	ASSERT(sc->tp == NULL);
67 	ASSERT(sc->tempip == NULL);
68 
69 	/*
70 	 * Make sure that we have allocated dquot(s) on disk.  The temporary
71 	 * inode should be completely root owned so that we don't fail due to
72 	 * quota limits.
73 	 */
74 	error = xfs_icreate_dqalloc(&args, &udqp, &gdqp, &pdqp);
75 	if (error)
76 		return error;
77 
78 	if (is_dir) {
79 		resblks = xfs_mkdir_space_res(mp, 0);
80 		tres = &M_RES(mp)->tr_mkdir;
81 	} else {
82 		resblks = XFS_IALLOC_SPACE_RES(mp);
83 		tres = &M_RES(mp)->tr_create_tmpfile;
84 	}
85 
86 	error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
87 			&tp);
88 	if (error)
89 		goto out_release_dquots;
90 
91 	/* Allocate inode, set up directory. */
92 	error = xfs_dialloc(&tp, &args, &ino);
93 	if (error)
94 		goto out_trans_cancel;
95 	error = xfs_icreate(tp, ino, &args, &sc->tempip);
96 	if (error)
97 		goto out_trans_cancel;
98 
99 	/* We don't touch file data, so drop the realtime flags. */
100 	sc->tempip->i_diflags &= ~(XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT);
101 	xfs_trans_log_inode(tp, sc->tempip, XFS_ILOG_CORE);
102 
103 	/*
104 	 * Mark our temporary file as private so that LSMs and the ACL code
105 	 * don't try to add their own metadata or reason about these files.
106 	 * The file should never be exposed to userspace.
107 	 */
108 	VFS_I(sc->tempip)->i_flags |= S_PRIVATE;
109 	VFS_I(sc->tempip)->i_opflags &= ~IOP_XATTR;
110 
111 	if (is_dir) {
112 		error = xfs_dir_init(tp, sc->tempip, dp);
113 		if (error)
114 			goto out_trans_cancel;
115 	} else if (S_ISLNK(VFS_I(sc->tempip)->i_mode)) {
116 		/*
117 		 * Initialize the temporary symlink with a meaningless target
118 		 * that won't trip the verifiers.  Repair must rewrite the
119 		 * target with meaningful content before swapping with the file
120 		 * being repaired.  A single-byte target will not write a
121 		 * remote target block, so the owner is irrelevant.
122 		 */
123 		error = xfs_symlink_write_target(tp, sc->tempip,
124 				sc->tempip->i_ino, ".", 1, 0, 0);
125 		if (error)
126 			goto out_trans_cancel;
127 	}
128 
129 	/*
130 	 * Attach the dquot(s) to the inodes and modify them incore.
131 	 * These ids of the inode couldn't have changed since the new
132 	 * inode has been locked ever since it was created.
133 	 */
134 	xfs_qm_vop_create_dqattach(tp, sc->tempip, udqp, gdqp, pdqp);
135 
136 	/*
137 	 * Put our temp file on the unlinked list so it's purged automatically.
138 	 * All file-based metadata being reconstructed using this file must be
139 	 * atomically exchanged with the original file because the contents
140 	 * here will be purged when the inode is dropped or log recovery cleans
141 	 * out the unlinked list.
142 	 */
143 	error = xfs_iunlink(tp, sc->tempip);
144 	if (error)
145 		goto out_trans_cancel;
146 
147 	error = xfs_trans_commit(tp);
148 	if (error)
149 		goto out_release_inode;
150 
151 	trace_xrep_tempfile_create(sc);
152 
153 	xfs_qm_dqrele(udqp);
154 	xfs_qm_dqrele(gdqp);
155 	xfs_qm_dqrele(pdqp);
156 
157 	/* Finish setting up the incore / vfs context. */
158 	xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL);
159 	xfs_setup_iops(sc->tempip);
160 	xfs_finish_inode_setup(sc->tempip);
161 
162 	sc->temp_ilock_flags = 0;
163 	return error;
164 
165 out_trans_cancel:
166 	xfs_trans_cancel(tp);
167 out_release_inode:
168 	/*
169 	 * Wait until after the current transaction is aborted to finish the
170 	 * setup of the inode and release the inode.  This prevents recursive
171 	 * transactions and deadlocks from xfs_inactive.
172 	 */
173 	if (sc->tempip) {
174 		xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL);
175 		xfs_finish_inode_setup(sc->tempip);
176 		xchk_irele(sc, sc->tempip);
177 	}
178 out_release_dquots:
179 	xfs_qm_dqrele(udqp);
180 	xfs_qm_dqrele(gdqp);
181 	xfs_qm_dqrele(pdqp);
182 
183 	return error;
184 }
185 
186 /*
187  * Temporary files have to be created before we even know which inode we're
188  * going to scrub, so we assume that they will be part of the regular directory
189  * tree.  If it turns out that we're actually scrubbing a file from the
190  * metadata directory tree, we have to subtract the temp file from the root
191  * dquots and detach the dquots.
192  */
193 int
194 xrep_tempfile_adjust_directory_tree(
195 	struct xfs_scrub	*sc)
196 {
197 	int			error;
198 
199 	if (!sc->tempip)
200 		return 0;
201 
202 	ASSERT(sc->tp == NULL);
203 	ASSERT(!xfs_is_metadir_inode(sc->tempip));
204 
205 	if (!sc->ip || !xfs_is_metadir_inode(sc->ip))
206 		return 0;
207 
208 	xfs_ilock(sc->tempip, XFS_IOLOCK_EXCL);
209 	sc->temp_ilock_flags |= XFS_IOLOCK_EXCL;
210 
211 	error = xchk_trans_alloc(sc, 0);
212 	if (error)
213 		goto out_iolock;
214 
215 	xrep_tempfile_ilock(sc);
216 	xfs_trans_ijoin(sc->tp, sc->tempip, 0);
217 
218 	/* Metadir files are not accounted in quota, so drop icount */
219 	xfs_trans_mod_dquot_byino(sc->tp, sc->tempip, XFS_TRANS_DQ_ICOUNT, -1L);
220 	xfs_metafile_set_iflag(sc->tp, sc->tempip, XFS_METAFILE_UNKNOWN);
221 
222 	error = xrep_trans_commit(sc);
223 	if (error)
224 		goto out_ilock;
225 
226 	xfs_qm_dqdetach(sc->tempip);
227 out_ilock:
228 	xrep_tempfile_iunlock(sc);
229 out_iolock:
230 	xrep_tempfile_iounlock(sc);
231 	return error;
232 }
233 
234 /*
235  * Remove this temporary file from the metadata directory tree so that it can
236  * be inactivated the normal way.
237  */
238 STATIC int
239 xrep_tempfile_remove_metadir(
240 	struct xfs_scrub	*sc)
241 {
242 	int			error;
243 
244 	if (!sc->tempip || !xfs_is_metadir_inode(sc->tempip))
245 		return 0;
246 
247 	ASSERT(sc->tp == NULL);
248 
249 	xfs_ilock(sc->tempip, XFS_IOLOCK_EXCL);
250 	sc->temp_ilock_flags |= XFS_IOLOCK_EXCL;
251 
252 	error = xchk_trans_alloc(sc, 0);
253 	if (error)
254 		goto out_iolock;
255 
256 	xrep_tempfile_ilock(sc);
257 	xfs_trans_ijoin(sc->tp, sc->tempip, 0);
258 
259 	xfs_metafile_clear_iflag(sc->tp, sc->tempip);
260 
261 	/* Non-metadir files are accounted in quota, so bump bcount/icount */
262 	error = xfs_qm_dqattach_locked(sc->tempip, false);
263 	if (error)
264 		goto out_cancel;
265 
266 	xfs_trans_mod_dquot_byino(sc->tp, sc->tempip, XFS_TRANS_DQ_ICOUNT, 1L);
267 	xfs_trans_mod_dquot_byino(sc->tp, sc->tempip, XFS_TRANS_DQ_BCOUNT,
268 			sc->tempip->i_nblocks);
269 	error = xrep_trans_commit(sc);
270 	goto out_ilock;
271 
272 out_cancel:
273 	xchk_trans_cancel(sc);
274 out_ilock:
275 	xrep_tempfile_iunlock(sc);
276 out_iolock:
277 	xrep_tempfile_iounlock(sc);
278 	return error;
279 }
280 
281 /* Take IOLOCK_EXCL on the temporary file, maybe. */
282 bool
283 xrep_tempfile_iolock_nowait(
284 	struct xfs_scrub	*sc)
285 {
286 	if (xfs_ilock_nowait(sc->tempip, XFS_IOLOCK_EXCL)) {
287 		sc->temp_ilock_flags |= XFS_IOLOCK_EXCL;
288 		return true;
289 	}
290 
291 	return false;
292 }
293 
/*
 * Take the temporary file's IOLOCK while holding a different inode's IOLOCK.
 * In theory nobody else should hold the tempfile's IOLOCK, but we use trylock
 * to avoid deadlocks and lockdep complaints.
 *
 * Keeps retrying the trylock, calling delay(1) between attempts, until it
 * succeeds or xchk_should_terminate() tells us to stop.  Returns 0 with the
 * lock held, or the error set by xchk_should_terminate() without it.
 */
int
xrep_tempfile_iolock_polled(
	struct xfs_scrub	*sc)
{
	int			error = 0;

	for (;;) {
		if (xrep_tempfile_iolock_nowait(sc))
			return 0;
		if (xchk_should_terminate(sc, &error))
			return error;
		delay(1);
	}
}
313 
314 /* Release IOLOCK_EXCL on the temporary file. */
315 void
316 xrep_tempfile_iounlock(
317 	struct xfs_scrub	*sc)
318 {
319 	xfs_iunlock(sc->tempip, XFS_IOLOCK_EXCL);
320 	sc->temp_ilock_flags &= ~XFS_IOLOCK_EXCL;
321 }
322 
323 /* Prepare the temporary file for metadata updates by grabbing ILOCK_EXCL. */
324 void
325 xrep_tempfile_ilock(
326 	struct xfs_scrub	*sc)
327 {
328 	sc->temp_ilock_flags |= XFS_ILOCK_EXCL;
329 	xfs_ilock(sc->tempip, XFS_ILOCK_EXCL);
330 }
331 
332 /* Try to grab ILOCK_EXCL on the temporary file. */
333 bool
334 xrep_tempfile_ilock_nowait(
335 	struct xfs_scrub	*sc)
336 {
337 	if (xfs_ilock_nowait(sc->tempip, XFS_ILOCK_EXCL)) {
338 		sc->temp_ilock_flags |= XFS_ILOCK_EXCL;
339 		return true;
340 	}
341 
342 	return false;
343 }
344 
345 /* Unlock ILOCK_EXCL on the temporary file after an update. */
346 void
347 xrep_tempfile_iunlock(
348 	struct xfs_scrub	*sc)
349 {
350 	xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL);
351 	sc->temp_ilock_flags &= ~XFS_ILOCK_EXCL;
352 }
353 
354 /*
355  * Begin the process of making changes to both the file being scrubbed and
356  * the temporary file by taking ILOCK_EXCL on both.
357  */
358 void
359 xrep_tempfile_ilock_both(
360 	struct xfs_scrub	*sc)
361 {
362 	xfs_lock_two_inodes(sc->ip, XFS_ILOCK_EXCL, sc->tempip, XFS_ILOCK_EXCL);
363 	sc->ilock_flags |= XFS_ILOCK_EXCL;
364 	sc->temp_ilock_flags |= XFS_ILOCK_EXCL;
365 }
366 
367 /* Unlock ILOCK_EXCL on both files. */
368 void
369 xrep_tempfile_iunlock_both(
370 	struct xfs_scrub	*sc)
371 {
372 	xrep_tempfile_iunlock(sc);
373 	xchk_iunlock(sc, XFS_ILOCK_EXCL);
374 }
375 
376 /* Release the temporary file. */
377 void
378 xrep_tempfile_rele(
379 	struct xfs_scrub	*sc)
380 {
381 	if (!sc->tempip)
382 		return;
383 
384 	if (sc->temp_ilock_flags) {
385 		xfs_iunlock(sc->tempip, sc->temp_ilock_flags);
386 		sc->temp_ilock_flags = 0;
387 	}
388 
389 	xrep_tempfile_remove_metadir(sc);
390 	xchk_irele(sc, sc->tempip);
391 	sc->tempip = NULL;
392 }
393 
394 /*
395  * Make sure that the given range of the data fork of the temporary file is
396  * mapped to written blocks.  The caller must ensure that both inodes are
397  * joined to the transaction.
398  */
399 int
400 xrep_tempfile_prealloc(
401 	struct xfs_scrub	*sc,
402 	xfs_fileoff_t		off,
403 	xfs_filblks_t		len)
404 {
405 	struct xfs_bmbt_irec	map;
406 	xfs_fileoff_t		end = off + len;
407 	int			error;
408 
409 	ASSERT(sc->tempip != NULL);
410 	ASSERT(!XFS_NOT_DQATTACHED(sc->mp, sc->tempip));
411 
412 	for (; off < end; off = map.br_startoff + map.br_blockcount) {
413 		int		nmaps = 1;
414 
415 		/*
416 		 * If we have a real extent mapping this block then we're
417 		 * in ok shape.
418 		 */
419 		error = xfs_bmapi_read(sc->tempip, off, end - off, &map, &nmaps,
420 				XFS_DATA_FORK);
421 		if (error)
422 			return error;
423 		if (nmaps == 0) {
424 			ASSERT(nmaps != 0);
425 			return -EFSCORRUPTED;
426 		}
427 
428 		if (xfs_bmap_is_written_extent(&map))
429 			continue;
430 
431 		/*
432 		 * If we find a delalloc reservation then something is very
433 		 * very wrong.  Bail out.
434 		 */
435 		if (map.br_startblock == DELAYSTARTBLOCK)
436 			return -EFSCORRUPTED;
437 
438 		/*
439 		 * Make sure this block has a real zeroed extent allocated to
440 		 * it.
441 		 */
442 		nmaps = 1;
443 		error = xfs_bmapi_write(sc->tp, sc->tempip, off, end - off,
444 				XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO, 0, &map,
445 				&nmaps);
446 		if (error)
447 			return error;
448 		if (nmaps != 1)
449 			return -EFSCORRUPTED;
450 
451 		trace_xrep_tempfile_prealloc(sc, XFS_DATA_FORK, &map);
452 
453 		/* Commit new extent and all deferred work. */
454 		error = xfs_defer_finish(&sc->tp);
455 		if (error)
456 			return error;
457 	}
458 
459 	return 0;
460 }
461 
462 /*
463  * Write data to each block of a file.  The given range of the tempfile's data
464  * fork must already be populated with written extents.
465  */
466 int
467 xrep_tempfile_copyin(
468 	struct xfs_scrub	*sc,
469 	xfs_fileoff_t		off,
470 	xfs_filblks_t		len,
471 	xrep_tempfile_copyin_fn	prep_fn,
472 	void			*data)
473 {
474 	LIST_HEAD(buffers_list);
475 	struct xfs_mount	*mp = sc->mp;
476 	struct xfs_buf		*bp;
477 	xfs_fileoff_t		flush_mask;
478 	xfs_fileoff_t		end = off + len;
479 	loff_t			pos = XFS_FSB_TO_B(mp, off);
480 	int			error = 0;
481 
482 	ASSERT(S_ISREG(VFS_I(sc->tempip)->i_mode));
483 
484 	/* Flush buffers to disk every 512K */
485 	flush_mask = XFS_B_TO_FSBT(mp, (1U << 19)) - 1;
486 
487 	for (; off < end; off++, pos += mp->m_sb.sb_blocksize) {
488 		struct xfs_bmbt_irec	map;
489 		int			nmaps = 1;
490 
491 		/* Read block mapping for this file block. */
492 		error = xfs_bmapi_read(sc->tempip, off, 1, &map, &nmaps, 0);
493 		if (error)
494 			goto out_err;
495 		if (nmaps == 0 || !xfs_bmap_is_written_extent(&map)) {
496 			error = -EFSCORRUPTED;
497 			goto out_err;
498 		}
499 
500 		/* Get the metadata buffer for this offset in the file. */
501 		error = xfs_trans_get_buf(sc->tp, mp->m_ddev_targp,
502 				XFS_FSB_TO_DADDR(mp, map.br_startblock),
503 				mp->m_bsize, 0, &bp);
504 		if (error)
505 			goto out_err;
506 
507 		trace_xrep_tempfile_copyin(sc, XFS_DATA_FORK, &map);
508 
509 		/* Read in a block's worth of data from the xfile. */
510 		error = prep_fn(sc, bp, data);
511 		if (error) {
512 			xfs_trans_brelse(sc->tp, bp);
513 			goto out_err;
514 		}
515 
516 		/* Queue buffer, and flush if we have too much dirty data. */
517 		xfs_buf_delwri_queue_here(bp, &buffers_list);
518 		xfs_trans_brelse(sc->tp, bp);
519 
520 		if (!(off & flush_mask)) {
521 			error = xfs_buf_delwri_submit(&buffers_list);
522 			if (error)
523 				goto out_err;
524 		}
525 	}
526 
527 	/*
528 	 * Write the new blocks to disk.  If the ordered list isn't empty after
529 	 * that, then something went wrong and we have to fail.  This should
530 	 * never happen, but we'll check anyway.
531 	 */
532 	error = xfs_buf_delwri_submit(&buffers_list);
533 	if (error)
534 		goto out_err;
535 
536 	if (!list_empty(&buffers_list)) {
537 		ASSERT(list_empty(&buffers_list));
538 		error = -EIO;
539 		goto out_err;
540 	}
541 
542 	return 0;
543 
544 out_err:
545 	xfs_buf_delwri_cancel(&buffers_list);
546 	return error;
547 }
548 
549 /*
550  * Set the temporary file's size.  Caller must join the tempfile to the scrub
551  * transaction and is responsible for adjusting block mappings as needed.
552  */
553 int
554 xrep_tempfile_set_isize(
555 	struct xfs_scrub	*sc,
556 	unsigned long long	isize)
557 {
558 	if (sc->tempip->i_disk_size == isize)
559 		return 0;
560 
561 	sc->tempip->i_disk_size = isize;
562 	i_size_write(VFS_I(sc->tempip), isize);
563 	return xrep_tempfile_roll_trans(sc);
564 }
565 
566 /*
567  * Roll a repair transaction involving the temporary file.  Caller must join
568  * both the temporary file and the file being scrubbed to the transaction.
569  * This function return with both inodes joined to a new scrub transaction,
570  * or the usual negative errno.
571  */
572 int
573 xrep_tempfile_roll_trans(
574 	struct xfs_scrub	*sc)
575 {
576 	int			error;
577 
578 	xfs_trans_log_inode(sc->tp, sc->tempip, XFS_ILOG_CORE);
579 	error = xrep_roll_trans(sc);
580 	if (error)
581 		return error;
582 
583 	xfs_trans_ijoin(sc->tp, sc->tempip, 0);
584 	return 0;
585 }
586 
587 /*
588  * Fill out the mapping exchange request in preparation for atomically
589  * committing the contents of a metadata file that we've rebuilt in the temp
590  * file.
591  */
592 STATIC int
593 xrep_tempexch_prep_request(
594 	struct xfs_scrub	*sc,
595 	int			whichfork,
596 	struct xrep_tempexch	*tx)
597 {
598 	struct xfs_exchmaps_req	*req = &tx->req;
599 
600 	memset(tx, 0, sizeof(struct xrep_tempexch));
601 
602 	/* COW forks don't exist on disk. */
603 	if (whichfork == XFS_COW_FORK) {
604 		ASSERT(0);
605 		return -EINVAL;
606 	}
607 
608 	/* Both files should have the relevant forks. */
609 	if (!xfs_ifork_ptr(sc->ip, whichfork) ||
610 	    !xfs_ifork_ptr(sc->tempip, whichfork)) {
611 		ASSERT(xfs_ifork_ptr(sc->ip, whichfork) != NULL);
612 		ASSERT(xfs_ifork_ptr(sc->tempip, whichfork) != NULL);
613 		return -EINVAL;
614 	}
615 
616 	/* Exchange all mappings in both forks. */
617 	req->ip1 = sc->tempip;
618 	req->ip2 = sc->ip;
619 	req->startoff1 = 0;
620 	req->startoff2 = 0;
621 	switch (whichfork) {
622 	case XFS_ATTR_FORK:
623 		req->flags |= XFS_EXCHMAPS_ATTR_FORK;
624 		break;
625 	case XFS_DATA_FORK:
626 		/* Always exchange sizes when exchanging data fork mappings. */
627 		req->flags |= XFS_EXCHMAPS_SET_SIZES;
628 		break;
629 	}
630 	req->blockcount = XFS_MAX_FILEOFF;
631 
632 	return 0;
633 }
634 
635 /*
636  * Fill out the mapping exchange resource estimation structures in preparation
637  * for exchanging the contents of a metadata file that we've rebuilt in the
638  * temp file.  Caller must hold IOLOCK_EXCL but not ILOCK_EXCL on both files.
639  */
640 STATIC int
641 xrep_tempexch_estimate(
642 	struct xfs_scrub	*sc,
643 	struct xrep_tempexch	*tx)
644 {
645 	struct xfs_exchmaps_req	*req = &tx->req;
646 	struct xfs_ifork	*ifp;
647 	struct xfs_ifork	*tifp;
648 	int			whichfork = xfs_exchmaps_reqfork(req);
649 	int			state = 0;
650 
651 	/*
652 	 * The exchmaps code only knows how to exchange file fork space
653 	 * mappings.  Any fork data in local format must be promoted to a
654 	 * single block before the exchange can take place.
655 	 */
656 	ifp = xfs_ifork_ptr(sc->ip, whichfork);
657 	if (ifp->if_format == XFS_DINODE_FMT_LOCAL)
658 		state |= 1;
659 
660 	tifp = xfs_ifork_ptr(sc->tempip, whichfork);
661 	if (tifp->if_format == XFS_DINODE_FMT_LOCAL)
662 		state |= 2;
663 
664 	switch (state) {
665 	case 0:
666 		/* Both files have mapped extents; use the regular estimate. */
667 		return xfs_exchrange_estimate(req);
668 	case 1:
669 		/*
670 		 * The file being repaired is in local format, but the temp
671 		 * file has mapped extents.  To perform the exchange, the file
672 		 * being repaired must have its shorform data converted to an
673 		 * ondisk block so that the forks will be in extents format.
674 		 * We need one resblk for the conversion; the number of
675 		 * exchanges is (worst case) the temporary file's extent count
676 		 * plus the block we converted.
677 		 */
678 		req->ip1_bcount = sc->tempip->i_nblocks;
679 		req->ip2_bcount = 1;
680 		req->nr_exchanges = 1 + tifp->if_nextents;
681 		req->resblks = 1;
682 		break;
683 	case 2:
684 		/*
685 		 * The temporary file is in local format, but the file being
686 		 * repaired has mapped extents.  To perform the exchange, the
687 		 * temp file must have its shortform data converted to an
688 		 * ondisk block, and the fork changed to extents format.  We
689 		 * need one resblk for the conversion; the number of exchanges
690 		 * is (worst case) the extent count of the file being repaired
691 		 * plus the block we converted.
692 		 */
693 		req->ip1_bcount = 1;
694 		req->ip2_bcount = sc->ip->i_nblocks;
695 		req->nr_exchanges = 1 + ifp->if_nextents;
696 		req->resblks = 1;
697 		break;
698 	case 3:
699 		/*
700 		 * Both forks are in local format.  To perform the exchange,
701 		 * both files must have their shortform data converted to
702 		 * fsblocks, and both forks must be converted to extents
703 		 * format.  We need two resblks for the two conversions, and
704 		 * the number of exchanges is 1 since there's only one block at
705 		 * fileoff 0.  Presumably, the caller could not exchange the
706 		 * two inode fork areas directly.
707 		 */
708 		req->ip1_bcount = 1;
709 		req->ip2_bcount = 1;
710 		req->nr_exchanges = 1;
711 		req->resblks = 2;
712 		break;
713 	}
714 
715 	return xfs_exchmaps_estimate_overhead(req);
716 }
717 
718 /*
719  * Obtain a quota reservation to make sure we don't hit EDQUOT.  We can skip
720  * this if quota enforcement is disabled or if both inodes' dquots are the
721  * same.  The qretry structure must be initialized to zeroes before the first
722  * call to this function.
723  */
724 STATIC int
725 xrep_tempexch_reserve_quota(
726 	struct xfs_scrub		*sc,
727 	const struct xrep_tempexch	*tx)
728 {
729 	struct xfs_trans		*tp = sc->tp;
730 	const struct xfs_exchmaps_req	*req = &tx->req;
731 	int64_t				ddelta, rdelta;
732 	int				error;
733 
734 	/*
735 	 * Don't bother with a quota reservation if we're not enforcing them
736 	 * or the two inodes have the same dquots.
737 	 */
738 	if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 ||
739 	    (req->ip1->i_udquot == req->ip2->i_udquot &&
740 	     req->ip1->i_gdquot == req->ip2->i_gdquot &&
741 	     req->ip1->i_pdquot == req->ip2->i_pdquot))
742 		return 0;
743 
744 	/*
745 	 * Quota reservation for each file comes from two sources.  First, we
746 	 * need to account for any net gain in mapped blocks during the
747 	 * exchange.  Second, we need reservation for the gross gain in mapped
748 	 * blocks so that we don't trip over any quota block reservation
749 	 * assertions.  We must reserve the gross gain because the quota code
750 	 * subtracts from bcount the number of blocks that we unmap; it does
751 	 * not add that quantity back to the quota block reservation.
752 	 */
753 	ddelta = max_t(int64_t, 0, req->ip2_bcount - req->ip1_bcount);
754 	rdelta = max_t(int64_t, 0, req->ip2_rtbcount - req->ip1_rtbcount);
755 	error = xfs_trans_reserve_quota_nblks(tp, req->ip1,
756 			ddelta + req->ip1_bcount, rdelta + req->ip1_rtbcount,
757 			true);
758 	if (error)
759 		return error;
760 
761 	ddelta = max_t(int64_t, 0, req->ip1_bcount - req->ip2_bcount);
762 	rdelta = max_t(int64_t, 0, req->ip1_rtbcount - req->ip2_rtbcount);
763 	return xfs_trans_reserve_quota_nblks(tp, req->ip2,
764 			ddelta + req->ip2_bcount, rdelta + req->ip2_rtbcount,
765 			true);
766 }
767 
768 /*
769  * Prepare an existing transaction for an atomic file contents exchange.
770  *
771  * This function fills out the mapping exchange request and resource estimation
772  * structures in preparation for exchanging the contents of a metadata file
773  * that has been rebuilt in the temp file.  Next, it reserves space and quota
774  * for the transaction.
775  *
776  * The caller must hold ILOCK_EXCL of the scrub target file and the temporary
777  * file.  The caller must join both inodes to the transaction with no unlock
778  * flags, and is responsible for dropping both ILOCKs when appropriate.  Only
779  * use this when those ILOCKs cannot be dropped.
780  */
781 int
782 xrep_tempexch_trans_reserve(
783 	struct xfs_scrub	*sc,
784 	int			whichfork,
785 	struct xrep_tempexch	*tx)
786 {
787 	int			error;
788 
789 	ASSERT(sc->tp != NULL);
790 	xfs_assert_ilocked(sc->ip, XFS_ILOCK_EXCL);
791 	xfs_assert_ilocked(sc->tempip, XFS_ILOCK_EXCL);
792 
793 	error = xrep_tempexch_prep_request(sc, whichfork, tx);
794 	if (error)
795 		return error;
796 
797 	error = xfs_exchmaps_estimate(&tx->req);
798 	if (error)
799 		return error;
800 
801 	error = xfs_trans_reserve_more(sc->tp, tx->req.resblks, 0);
802 	if (error)
803 		return error;
804 
805 	return xrep_tempexch_reserve_quota(sc, tx);
806 }
807 
808 /*
809  * Create a new transaction for a file contents exchange.
810  *
811  * This function fills out the mapping excahange request and resource
812  * estimation structures in preparation for exchanging the contents of a
813  * metadata file that has been rebuilt in the temp file.  Next, it reserves
814  * space, takes ILOCK_EXCL of both inodes, joins them to the transaction and
815  * reserves quota for the transaction.
816  *
817  * The caller is responsible for dropping both ILOCKs when appropriate.
818  */
819 int
820 xrep_tempexch_trans_alloc(
821 	struct xfs_scrub	*sc,
822 	int			whichfork,
823 	struct xrep_tempexch	*tx)
824 {
825 	unsigned int		flags = 0;
826 	int			error;
827 
828 	ASSERT(sc->tp == NULL);
829 	ASSERT(xfs_has_exchange_range(sc->mp));
830 
831 	error = xrep_tempexch_prep_request(sc, whichfork, tx);
832 	if (error)
833 		return error;
834 
835 	error = xrep_tempexch_estimate(sc, tx);
836 	if (error)
837 		return error;
838 
839 	if (xfs_has_lazysbcount(sc->mp))
840 		flags |= XFS_TRANS_RES_FDBLKS;
841 
842 	error = xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
843 			tx->req.resblks, 0, flags, &sc->tp);
844 	if (error)
845 		return error;
846 
847 	sc->temp_ilock_flags |= XFS_ILOCK_EXCL;
848 	sc->ilock_flags |= XFS_ILOCK_EXCL;
849 	xfs_exchrange_ilock(sc->tp, sc->ip, sc->tempip);
850 
851 	return xrep_tempexch_reserve_quota(sc, tx);
852 }
853 
854 /*
855  * Exchange file mappings (and hence file contents) between the file being
856  * repaired and the temporary file.  Returns with both inodes locked and joined
857  * to a clean scrub transaction.
858  */
859 int
860 xrep_tempexch_contents(
861 	struct xfs_scrub	*sc,
862 	struct xrep_tempexch	*tx)
863 {
864 	int			error;
865 
866 	ASSERT(xfs_has_exchange_range(sc->mp));
867 
868 	xfs_exchange_mappings(sc->tp, &tx->req);
869 	error = xfs_defer_finish(&sc->tp);
870 	if (error)
871 		return error;
872 
873 	/*
874 	 * If we exchanged the ondisk sizes of two metadata files, we must
875 	 * exchanged the incore sizes as well.
876 	 */
877 	if (tx->req.flags & XFS_EXCHMAPS_SET_SIZES) {
878 		loff_t	temp;
879 
880 		temp = i_size_read(VFS_I(sc->ip));
881 		i_size_write(VFS_I(sc->ip), i_size_read(VFS_I(sc->tempip)));
882 		i_size_write(VFS_I(sc->tempip), temp);
883 	}
884 
885 	return 0;
886 }
887 
888 /*
889  * Write local format data from one of the temporary file's forks into the same
890  * fork of file being repaired, and exchange the file sizes, if appropriate.
891  * Caller must ensure that the file being repaired has enough fork space to
892  * hold all the bytes.
893  */
894 void
895 xrep_tempfile_copyout_local(
896 	struct xfs_scrub	*sc,
897 	int			whichfork)
898 {
899 	struct xfs_ifork	*temp_ifp;
900 	struct xfs_ifork	*ifp;
901 	unsigned int		ilog_flags = XFS_ILOG_CORE;
902 
903 	temp_ifp = xfs_ifork_ptr(sc->tempip, whichfork);
904 	ifp = xfs_ifork_ptr(sc->ip, whichfork);
905 
906 	ASSERT(temp_ifp != NULL);
907 	ASSERT(ifp != NULL);
908 	ASSERT(temp_ifp->if_format == XFS_DINODE_FMT_LOCAL);
909 	ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
910 
911 	switch (whichfork) {
912 	case XFS_DATA_FORK:
913 		ASSERT(sc->tempip->i_disk_size <=
914 					xfs_inode_data_fork_size(sc->ip));
915 		break;
916 	case XFS_ATTR_FORK:
917 		ASSERT(sc->tempip->i_forkoff >= sc->ip->i_forkoff);
918 		break;
919 	default:
920 		ASSERT(0);
921 		return;
922 	}
923 
924 	/* Recreate @sc->ip's incore fork (ifp) with data from temp_ifp. */
925 	xfs_idestroy_fork(ifp);
926 	xfs_init_local_fork(sc->ip, whichfork, temp_ifp->if_data,
927 			temp_ifp->if_bytes);
928 
929 	if (whichfork == XFS_DATA_FORK) {
930 		i_size_write(VFS_I(sc->ip), i_size_read(VFS_I(sc->tempip)));
931 		sc->ip->i_disk_size = sc->tempip->i_disk_size;
932 	}
933 
934 	ilog_flags |= xfs_ilog_fdata(whichfork);
935 	xfs_trans_log_inode(sc->tp, sc->ip, ilog_flags);
936 }
937 
938 /* Decide if a given XFS inode is a temporary file for a repair. */
939 bool
940 xrep_is_tempfile(
941 	const struct xfs_inode	*ip)
942 {
943 	const struct inode	*inode = &ip->i_vnode;
944 	struct xfs_mount	*mp = ip->i_mount;
945 
946 	/*
947 	 * Files in the metadata directory tree also have S_PRIVATE set and
948 	 * IOP_XATTR unset, so we must distinguish them separately.
949 	 */
950 	if (xfs_has_metadir(mp) && (ip->i_diflags2 & XFS_DIFLAG2_METADATA))
951 		return false;
952 
953 	if (IS_PRIVATE(inode) && !(inode->i_opflags & IOP_XATTR))
954 		return true;
955 
956 	return false;
957 }
958