xref: /linux/fs/xfs/scrub/tempfile.c (revision 8cbd01ba9c38eb16f3a572300da486ac544519b7)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (c) 2021-2024 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_log_format.h"
13 #include "xfs_trans.h"
14 #include "xfs_inode.h"
15 #include "xfs_ialloc.h"
16 #include "xfs_quota.h"
17 #include "xfs_bmap.h"
18 #include "xfs_bmap_btree.h"
19 #include "xfs_trans_space.h"
20 #include "xfs_dir2.h"
21 #include "xfs_exchrange.h"
22 #include "xfs_exchmaps.h"
23 #include "xfs_defer.h"
24 #include "xfs_symlink_remote.h"
25 #include "xfs_metafile.h"
26 #include "scrub/scrub.h"
27 #include "scrub/common.h"
28 #include "scrub/repair.h"
29 #include "scrub/trace.h"
30 #include "scrub/tempfile.h"
31 #include "scrub/tempexch.h"
32 #include "scrub/xfile.h"
33 
34 /*
35  * Create a temporary file for reconstructing metadata, with the intention of
36  * atomically exchanging the temporary file's contents with the file that's
37  * being repaired.
38  */
39 int
40 xrep_tempfile_create(
41 	struct xfs_scrub	*sc,
42 	uint16_t		mode)
43 {
44 	struct xfs_icreate_args	args = {
45 		.pip		= sc->mp->m_rootip,
46 		.mode		= mode,
47 		.flags		= XFS_ICREATE_TMPFILE | XFS_ICREATE_UNLINKABLE,
48 	};
49 	struct xfs_mount	*mp = sc->mp;
50 	struct xfs_trans	*tp = NULL;
51 	struct xfs_dquot	*udqp;
52 	struct xfs_dquot	*gdqp;
53 	struct xfs_dquot	*pdqp;
54 	struct xfs_trans_res	*tres;
55 	struct xfs_inode	*dp = mp->m_rootip;
56 	xfs_ino_t		ino;
57 	unsigned int		resblks;
58 	bool			is_dir = S_ISDIR(mode);
59 	int			error;
60 
61 	if (xfs_is_shutdown(mp))
62 		return -EIO;
63 	if (xfs_is_readonly(mp))
64 		return -EROFS;
65 
66 	ASSERT(sc->tp == NULL);
67 	ASSERT(sc->tempip == NULL);
68 
69 	/*
70 	 * Make sure that we have allocated dquot(s) on disk.  The temporary
71 	 * inode should be completely root owned so that we don't fail due to
72 	 * quota limits.
73 	 */
74 	error = xfs_icreate_dqalloc(&args, &udqp, &gdqp, &pdqp);
75 	if (error)
76 		return error;
77 
78 	if (is_dir) {
79 		resblks = xfs_mkdir_space_res(mp, 0);
80 		tres = &M_RES(mp)->tr_mkdir;
81 	} else {
82 		resblks = XFS_IALLOC_SPACE_RES(mp);
83 		tres = &M_RES(mp)->tr_create_tmpfile;
84 	}
85 
86 	error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
87 			&tp);
88 	if (error)
89 		goto out_release_dquots;
90 
91 	/* Allocate inode, set up directory. */
92 	error = xfs_dialloc(&tp, &args, &ino);
93 	if (error)
94 		goto out_trans_cancel;
95 	error = xfs_icreate(tp, ino, &args, &sc->tempip);
96 	if (error)
97 		goto out_trans_cancel;
98 
99 	/* We don't touch file data, so drop the realtime flags. */
100 	sc->tempip->i_diflags &= ~(XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT);
101 	xfs_trans_log_inode(tp, sc->tempip, XFS_ILOG_CORE);
102 
103 	/*
104 	 * Mark our temporary file as private so that LSMs and the ACL code
105 	 * don't try to add their own metadata or reason about these files.
106 	 * The file should never be exposed to userspace.
107 	 */
108 	VFS_I(sc->tempip)->i_flags |= S_PRIVATE;
109 	VFS_I(sc->tempip)->i_opflags &= ~IOP_XATTR;
110 
111 	if (is_dir) {
112 		error = xfs_dir_init(tp, sc->tempip, dp);
113 		if (error)
114 			goto out_trans_cancel;
115 	} else if (S_ISLNK(VFS_I(sc->tempip)->i_mode)) {
116 		/*
117 		 * Initialize the temporary symlink with a meaningless target
118 		 * that won't trip the verifiers.  Repair must rewrite the
119 		 * target with meaningful content before swapping with the file
120 		 * being repaired.  A single-byte target will not write a
121 		 * remote target block, so the owner is irrelevant.
122 		 */
123 		error = xfs_symlink_write_target(tp, sc->tempip,
124 				sc->tempip->i_ino, ".", 1, 0, 0);
125 		if (error)
126 			goto out_trans_cancel;
127 	}
128 
129 	/*
130 	 * Attach the dquot(s) to the inodes and modify them incore.
131 	 * The ids of the inode cannot have changed since the new inode
132 	 * has been locked ever since it was created.
133 	 */
134 	xfs_qm_vop_create_dqattach(tp, sc->tempip, udqp, gdqp, pdqp);
135 
136 	/*
137 	 * Put our temp file on the unlinked list so it's purged automatically.
138 	 * All file-based metadata being reconstructed using this file must be
139 	 * atomically exchanged with the original file because the contents
140 	 * here will be purged when the inode is dropped or log recovery cleans
141 	 * out the unlinked list.
142 	 */
143 	error = xfs_iunlink(tp, sc->tempip);
144 	if (error)
145 		goto out_trans_cancel;
146 
147 	error = xfs_trans_commit(tp);
148 	if (error)
149 		goto out_release_inode;
150 
151 	trace_xrep_tempfile_create(sc);
152 
153 	xfs_qm_dqrele(udqp);
154 	xfs_qm_dqrele(gdqp);
155 	xfs_qm_dqrele(pdqp);
156 
157 	/* Finish setting up the incore / vfs context. */
158 	xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL);
159 	xfs_setup_iops(sc->tempip);
160 	xfs_finish_inode_setup(sc->tempip);
161 
162 	sc->temp_ilock_flags = 0;
163 	return error;
164 
165 out_trans_cancel:
166 	xfs_trans_cancel(tp);
167 out_release_inode:
168 	/*
169 	 * Wait until after the current transaction is aborted to finish the
170 	 * setup of the inode and release the inode.  This prevents recursive
171 	 * transactions and deadlocks from xfs_inactive.
172 	 */
173 	if (sc->tempip) {
174 		xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL);
175 		xfs_finish_inode_setup(sc->tempip);
176 		xchk_irele(sc, sc->tempip);
177 	}
178 out_release_dquots:
179 	xfs_qm_dqrele(udqp);
180 	xfs_qm_dqrele(gdqp);
181 	xfs_qm_dqrele(pdqp);
182 
183 	return error;
184 }
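/*
 * Editor's note: the sketch below is illustrative only and is not part of
 * this file.  It shows, roughly, how a repair function might drive the
 * tempfile helpers defined here; xrep_example_repair() and the elided
 * rebuild steps are hypothetical, and error handling is simplified.
 */
#if 0
STATIC int
xrep_example_repair(
	struct xfs_scrub	*sc)
{
	int			error;

	/* Create an unlinked, root-owned temp file (normally done at setup). */
	error = xrep_tempfile_create(sc, S_IFREG);
	if (error)
		return error;

	/* ...stage the rebuilt contents in the temporary file... */

	/* ...atomically exchange them with the file being repaired... */

	/* Release the temp file (normally done at scrub teardown). */
	xrep_tempfile_rele(sc);
	return 0;
}
#endif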
185 
186 /*
187  * Move sc->tempip from the regular directory tree to the metadata directory
188  * tree if sc->ip is part of the metadata directory tree and tempip has an
189  * eligible file mode.
190  *
191  * Temporary files have to be created before we even know which inode we're
192  * going to scrub, so we assume that they will be part of the regular directory
193  * tree.  If it turns out that we're actually scrubbing a file from the
194  * metadata directory tree, we have to subtract the temp file from the root
195  * dquots and detach the dquots prior to setting the METADATA iflag.  However,
196  * the scrub setup functions grab sc->ip and create sc->tempip before we
197  * actually get around to checking if the file mode is the right type for the
198  * scrubber.
199  */
200 int
201 xrep_tempfile_adjust_directory_tree(
202 	struct xfs_scrub	*sc)
203 {
204 	int			error;
205 
206 	if (!sc->tempip)
207 		return 0;
208 
209 	ASSERT(sc->tp == NULL);
210 	ASSERT(!xfs_is_metadir_inode(sc->tempip));
211 
212 	if (!sc->ip || !xfs_is_metadir_inode(sc->ip))
213 		return 0;
214 	if (!S_ISDIR(VFS_I(sc->tempip)->i_mode) &&
215 	    !S_ISREG(VFS_I(sc->tempip)->i_mode))
216 		return 0;
217 
218 	xfs_ilock(sc->tempip, XFS_IOLOCK_EXCL);
219 	sc->temp_ilock_flags |= XFS_IOLOCK_EXCL;
220 
221 	error = xchk_trans_alloc(sc, 0);
222 	if (error)
223 		goto out_iolock;
224 
225 	xrep_tempfile_ilock(sc);
226 	xfs_trans_ijoin(sc->tp, sc->tempip, 0);
227 
228 	/* Metadir files are not accounted in quota, so drop icount */
229 	xfs_trans_mod_dquot_byino(sc->tp, sc->tempip, XFS_TRANS_DQ_ICOUNT, -1L);
230 	xfs_metafile_set_iflag(sc->tp, sc->tempip, XFS_METAFILE_UNKNOWN);
231 
232 	error = xrep_trans_commit(sc);
233 	if (error)
234 		goto out_ilock;
235 
236 	xfs_iflags_set(sc->tempip, XFS_IRECOVERY);
237 	xfs_qm_dqdetach(sc->tempip);
238 out_ilock:
239 	xrep_tempfile_iunlock(sc);
240 out_iolock:
241 	xrep_tempfile_iounlock(sc);
242 	return error;
243 }
244 
245 /*
246  * Remove this temporary file from the metadata directory tree so that it can
247  * be inactivated the normal way.
248  */
249 STATIC int
250 xrep_tempfile_remove_metadir(
251 	struct xfs_scrub	*sc)
252 {
253 	int			error;
254 
255 	if (!sc->tempip || !xfs_is_metadir_inode(sc->tempip))
256 		return 0;
257 
258 	ASSERT(sc->tp == NULL);
259 
260 	xfs_iflags_clear(sc->tempip, XFS_IRECOVERY);
261 
262 	xfs_ilock(sc->tempip, XFS_IOLOCK_EXCL);
263 	sc->temp_ilock_flags |= XFS_IOLOCK_EXCL;
264 
265 	error = xchk_trans_alloc(sc, 0);
266 	if (error)
267 		goto out_iolock;
268 
269 	xrep_tempfile_ilock(sc);
270 	xfs_trans_ijoin(sc->tp, sc->tempip, 0);
271 
272 	xfs_metafile_clear_iflag(sc->tp, sc->tempip);
273 
274 	/* Non-metadir files are accounted in quota, so bump bcount/icount */
275 	error = xfs_qm_dqattach_locked(sc->tempip, false);
276 	if (error)
277 		goto out_cancel;
278 
279 	xfs_trans_mod_dquot_byino(sc->tp, sc->tempip, XFS_TRANS_DQ_ICOUNT, 1L);
280 	xfs_trans_mod_dquot_byino(sc->tp, sc->tempip, XFS_TRANS_DQ_BCOUNT,
281 			sc->tempip->i_nblocks);
282 	error = xrep_trans_commit(sc);
283 	goto out_ilock;
284 
285 out_cancel:
286 	xchk_trans_cancel(sc);
287 out_ilock:
288 	xrep_tempfile_iunlock(sc);
289 out_iolock:
290 	xrep_tempfile_iounlock(sc);
291 	return error;
292 }
293 
294 /* Take IOLOCK_EXCL on the temporary file, maybe. */
295 bool
296 xrep_tempfile_iolock_nowait(
297 	struct xfs_scrub	*sc)
298 {
299 	if (xfs_ilock_nowait(sc->tempip, XFS_IOLOCK_EXCL)) {
300 		sc->temp_ilock_flags |= XFS_IOLOCK_EXCL;
301 		return true;
302 	}
303 
304 	return false;
305 }
306 
307 /*
308  * Take the temporary file's IOLOCK while holding a different inode's IOLOCK.
309  * In theory nobody else should hold the tempfile's IOLOCK, but we use trylock
310  * to avoid deadlocks and lockdep complaints.
311  */
312 int
313 xrep_tempfile_iolock_polled(
314 	struct xfs_scrub	*sc)
315 {
316 	int			error = 0;
317 
318 	while (!xrep_tempfile_iolock_nowait(sc)) {
319 		if (xchk_should_terminate(sc, &error))
320 			return error;
321 		delay(1);
322 	}
323 
324 	return 0;
325 }
326 
327 /* Release IOLOCK_EXCL on the temporary file. */
328 void
329 xrep_tempfile_iounlock(
330 	struct xfs_scrub	*sc)
331 {
332 	xfs_iunlock(sc->tempip, XFS_IOLOCK_EXCL);
333 	sc->temp_ilock_flags &= ~XFS_IOLOCK_EXCL;
334 }
335 
336 /* Prepare the temporary file for metadata updates by grabbing ILOCK_EXCL. */
337 void
338 xrep_tempfile_ilock(
339 	struct xfs_scrub	*sc)
340 {
341 	sc->temp_ilock_flags |= XFS_ILOCK_EXCL;
342 	xfs_ilock(sc->tempip, XFS_ILOCK_EXCL);
343 }
344 
345 /* Try to grab ILOCK_EXCL on the temporary file. */
346 bool
347 xrep_tempfile_ilock_nowait(
348 	struct xfs_scrub	*sc)
349 {
350 	if (xfs_ilock_nowait(sc->tempip, XFS_ILOCK_EXCL)) {
351 		sc->temp_ilock_flags |= XFS_ILOCK_EXCL;
352 		return true;
353 	}
354 
355 	return false;
356 }
357 
358 /* Unlock ILOCK_EXCL on the temporary file after an update. */
359 void
360 xrep_tempfile_iunlock(
361 	struct xfs_scrub	*sc)
362 {
363 	xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL);
364 	sc->temp_ilock_flags &= ~XFS_ILOCK_EXCL;
365 }
366 
367 /*
368  * Begin the process of making changes to both the file being scrubbed and
369  * the temporary file by taking ILOCK_EXCL on both.
370  */
371 void
372 xrep_tempfile_ilock_both(
373 	struct xfs_scrub	*sc)
374 {
375 	xfs_lock_two_inodes(sc->ip, XFS_ILOCK_EXCL, sc->tempip, XFS_ILOCK_EXCL);
376 	sc->ilock_flags |= XFS_ILOCK_EXCL;
377 	sc->temp_ilock_flags |= XFS_ILOCK_EXCL;
378 }
379 
380 /* Unlock ILOCK_EXCL on both files. */
381 void
382 xrep_tempfile_iunlock_both(
383 	struct xfs_scrub	*sc)
384 {
385 	xrep_tempfile_iunlock(sc);
386 	xchk_iunlock(sc, XFS_ILOCK_EXCL);
387 }
388 
389 /* Release the temporary file. */
390 void
391 xrep_tempfile_rele(
392 	struct xfs_scrub	*sc)
393 {
394 	if (!sc->tempip)
395 		return;
396 
397 	if (sc->temp_ilock_flags) {
398 		xfs_iunlock(sc->tempip, sc->temp_ilock_flags);
399 		sc->temp_ilock_flags = 0;
400 	}
401 
402 	xrep_tempfile_remove_metadir(sc);
403 	xchk_irele(sc, sc->tempip);
404 	sc->tempip = NULL;
405 }
406 
407 /*
408  * Make sure that the given range of the data fork of the temporary file is
409  * mapped to written blocks.  The caller must ensure that both inodes are
410  * joined to the transaction.
411  */
412 int
413 xrep_tempfile_prealloc(
414 	struct xfs_scrub	*sc,
415 	xfs_fileoff_t		off,
416 	xfs_filblks_t		len)
417 {
418 	struct xfs_bmbt_irec	map;
419 	xfs_fileoff_t		end = off + len;
420 	int			error;
421 
422 	ASSERT(sc->tempip != NULL);
423 	ASSERT(!XFS_NOT_DQATTACHED(sc->mp, sc->tempip));
424 
425 	for (; off < end; off = map.br_startoff + map.br_blockcount) {
426 		int		nmaps = 1;
427 
428 		/*
429 		 * If we have a real extent mapping this block then we're
430 		 * in ok shape.
431 		 */
432 		error = xfs_bmapi_read(sc->tempip, off, end - off, &map, &nmaps,
433 				XFS_DATA_FORK);
434 		if (error)
435 			return error;
436 		if (nmaps == 0) {
437 			ASSERT(nmaps != 0);
438 			return -EFSCORRUPTED;
439 		}
440 
441 		if (xfs_bmap_is_written_extent(&map))
442 			continue;
443 
444 		/*
445 		 * If we find a delalloc reservation then something is very
446 		 * very wrong.  Bail out.
447 		 */
448 		if (map.br_startblock == DELAYSTARTBLOCK)
449 			return -EFSCORRUPTED;
450 
451 		/*
452 		 * Make sure this block has a real zeroed extent allocated to
453 		 * it.
454 		 */
455 		nmaps = 1;
456 		error = xfs_bmapi_write(sc->tp, sc->tempip, off, end - off,
457 				XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO, 0, &map,
458 				&nmaps);
459 		if (error)
460 			return error;
461 		if (nmaps != 1)
462 			return -EFSCORRUPTED;
463 
464 		trace_xrep_tempfile_prealloc(sc, XFS_DATA_FORK, &map);
465 
466 		/* Commit new extent and all deferred work. */
467 		error = xfs_defer_finish(&sc->tp);
468 		if (error)
469 			return error;
470 	}
471 
472 	return 0;
473 }
474 
475 /*
476  * Write data to each block of a file.  The given range of the tempfile's data
477  * fork must already be populated with written extents.
478  */
479 int
480 xrep_tempfile_copyin(
481 	struct xfs_scrub	*sc,
482 	xfs_fileoff_t		off,
483 	xfs_filblks_t		len,
484 	xrep_tempfile_copyin_fn	prep_fn,
485 	void			*data)
486 {
487 	LIST_HEAD(buffers_list);
488 	struct xfs_mount	*mp = sc->mp;
489 	struct xfs_buf		*bp;
490 	xfs_fileoff_t		flush_mask;
491 	xfs_fileoff_t		end = off + len;
492 	loff_t			pos = XFS_FSB_TO_B(mp, off);
493 	int			error = 0;
494 
495 	ASSERT(S_ISREG(VFS_I(sc->tempip)->i_mode));
496 
497 	/* Flush buffers to disk every 512K */
498 	flush_mask = XFS_B_TO_FSBT(mp, (1U << 19)) - 1;
499 
500 	for (; off < end; off++, pos += mp->m_sb.sb_blocksize) {
501 		struct xfs_bmbt_irec	map;
502 		int			nmaps = 1;
503 
504 		/* Read block mapping for this file block. */
505 		error = xfs_bmapi_read(sc->tempip, off, 1, &map, &nmaps, 0);
506 		if (error)
507 			goto out_err;
508 		if (nmaps == 0 || !xfs_bmap_is_written_extent(&map)) {
509 			error = -EFSCORRUPTED;
510 			goto out_err;
511 		}
512 
513 		/* Get the metadata buffer for this offset in the file. */
514 		error = xfs_trans_get_buf(sc->tp, mp->m_ddev_targp,
515 				XFS_FSB_TO_DADDR(mp, map.br_startblock),
516 				mp->m_bsize, 0, &bp);
517 		if (error)
518 			goto out_err;
519 
520 		trace_xrep_tempfile_copyin(sc, XFS_DATA_FORK, &map);
521 
522 		/* Read in a block's worth of data from the xfile. */
523 		error = prep_fn(sc, bp, data);
524 		if (error) {
525 			xfs_trans_brelse(sc->tp, bp);
526 			goto out_err;
527 		}
528 
529 		/* Queue buffer, and flush if we have too much dirty data. */
530 		xfs_buf_delwri_queue_here(bp, &buffers_list);
531 		xfs_trans_brelse(sc->tp, bp);
532 
533 		if (!(off & flush_mask)) {
534 			error = xfs_buf_delwri_submit(&buffers_list);
535 			if (error)
536 				goto out_err;
537 		}
538 	}
539 
540 	/*
541 	 * Write the new blocks to disk.  If the ordered list isn't empty after
542 	 * that, then something went wrong and we have to fail.  This should
543 	 * never happen, but we'll check anyway.
544 	 */
545 	error = xfs_buf_delwri_submit(&buffers_list);
546 	if (error)
547 		goto out_err;
548 
549 	if (!list_empty(&buffers_list)) {
550 		ASSERT(list_empty(&buffers_list));
551 		error = -EIO;
552 		goto out_err;
553 	}
554 
555 	return 0;
556 
557 out_err:
558 	xfs_buf_delwri_cancel(&buffers_list);
559 	return error;
560 }
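/*
 * Editor's note: the callback sketch below is illustrative only and is not
 * part of this file.  It shows one way an xrep_tempfile_copyin_fn might fill
 * each temporary file block from staged contents held in an xfile.  The
 * struct and function names are hypothetical; xfile_load() is assumed to
 * behave as declared in scrub/xfile.h.
 */
#if 0
struct xrep_example_copyin {
	struct xfile		*xfile;	/* staged file contents */
	loff_t			pos;	/* next byte to copy out */
};

STATIC int
xrep_example_prep_block(
	struct xfs_scrub	*sc,
	struct xfs_buf		*bp,
	void			*data)
{
	struct xrep_example_copyin *xc = data;
	size_t			count = BBTOB(bp->b_length);
	int			error;

	/* Copy one block's worth of staged data into the buffer. */
	error = xfile_load(xc->xfile, bp->b_addr, count, xc->pos);
	if (error)
		return error;

	xc->pos += count;
	return 0;
}
#endif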
561 
562 /*
563  * Set the temporary file's size.  Caller must join the tempfile to the scrub
564  * transaction and is responsible for adjusting block mappings as needed.
565  */
566 int
567 xrep_tempfile_set_isize(
568 	struct xfs_scrub	*sc,
569 	unsigned long long	isize)
570 {
571 	if (sc->tempip->i_disk_size == isize)
572 		return 0;
573 
574 	sc->tempip->i_disk_size = isize;
575 	i_size_write(VFS_I(sc->tempip), isize);
576 	return xrep_tempfile_roll_trans(sc);
577 }
578 
579 /*
580  * Roll a repair transaction involving the temporary file.  Caller must join
581  * both the temporary file and the file being scrubbed to the transaction.
582  * This function returns with both inodes joined to a new scrub transaction,
583  * or the usual negative errno.
584  */
585 int
586 xrep_tempfile_roll_trans(
587 	struct xfs_scrub	*sc)
588 {
589 	int			error;
590 
591 	xfs_trans_log_inode(sc->tp, sc->tempip, XFS_ILOG_CORE);
592 	error = xrep_roll_trans(sc);
593 	if (error)
594 		return error;
595 
596 	xfs_trans_ijoin(sc->tp, sc->tempip, 0);
597 	return 0;
598 }
599 
600 /*
601  * Fill out the mapping exchange request in preparation for atomically
602  * committing the contents of a metadata file that we've rebuilt in the temp
603  * file.
604  */
605 STATIC int
606 xrep_tempexch_prep_request(
607 	struct xfs_scrub	*sc,
608 	int			whichfork,
609 	struct xrep_tempexch	*tx)
610 {
611 	struct xfs_exchmaps_req	*req = &tx->req;
612 
613 	memset(tx, 0, sizeof(struct xrep_tempexch));
614 
615 	/* COW forks don't exist on disk. */
616 	if (whichfork == XFS_COW_FORK) {
617 		ASSERT(0);
618 		return -EINVAL;
619 	}
620 
621 	/* Both files should have the relevant forks. */
622 	if (!xfs_ifork_ptr(sc->ip, whichfork) ||
623 	    !xfs_ifork_ptr(sc->tempip, whichfork)) {
624 		ASSERT(xfs_ifork_ptr(sc->ip, whichfork) != NULL);
625 		ASSERT(xfs_ifork_ptr(sc->tempip, whichfork) != NULL);
626 		return -EINVAL;
627 	}
628 
629 	/* Exchange all mappings in both forks. */
630 	req->ip1 = sc->tempip;
631 	req->ip2 = sc->ip;
632 	req->startoff1 = 0;
633 	req->startoff2 = 0;
634 	switch (whichfork) {
635 	case XFS_ATTR_FORK:
636 		req->flags |= XFS_EXCHMAPS_ATTR_FORK;
637 		break;
638 	case XFS_DATA_FORK:
639 		/* Always exchange sizes when exchanging data fork mappings. */
640 		req->flags |= XFS_EXCHMAPS_SET_SIZES;
641 		break;
642 	}
643 	req->blockcount = XFS_MAX_FILEOFF;
644 
645 	return 0;
646 }
647 
648 /*
649  * Fill out the mapping exchange resource estimation structures in preparation
650  * for exchanging the contents of a metadata file that we've rebuilt in the
651  * temp file.  Caller must hold IOLOCK_EXCL but not ILOCK_EXCL on both files.
652  */
653 STATIC int
654 xrep_tempexch_estimate(
655 	struct xfs_scrub	*sc,
656 	struct xrep_tempexch	*tx)
657 {
658 	struct xfs_exchmaps_req	*req = &tx->req;
659 	struct xfs_ifork	*ifp;
660 	struct xfs_ifork	*tifp;
661 	int			whichfork = xfs_exchmaps_reqfork(req);
662 	int			state = 0;
663 
664 	/*
665 	 * The exchmaps code only knows how to exchange file fork space
666 	 * mappings.  Any fork data in local format must be promoted to a
667 	 * single block before the exchange can take place.
668 	 */
669 	ifp = xfs_ifork_ptr(sc->ip, whichfork);
670 	if (ifp->if_format == XFS_DINODE_FMT_LOCAL)
671 		state |= 1;
672 
673 	tifp = xfs_ifork_ptr(sc->tempip, whichfork);
674 	if (tifp->if_format == XFS_DINODE_FMT_LOCAL)
675 		state |= 2;
676 
677 	switch (state) {
678 	case 0:
679 		/* Both files have mapped extents; use the regular estimate. */
680 		return xfs_exchrange_estimate(req);
681 	case 1:
682 		/*
683 		 * The file being repaired is in local format, but the temp
684 		 * file has mapped extents.  To perform the exchange, the file
685 		 * being repaired must have its shortform data converted to an
686 		 * ondisk block so that the forks will be in extents format.
687 		 * We need one resblk for the conversion; the number of
688 		 * exchanges is (worst case) the temporary file's extent count
689 		 * plus the block we converted.
690 		 */
691 		req->ip1_bcount = sc->tempip->i_nblocks;
692 		req->ip2_bcount = 1;
693 		req->nr_exchanges = 1 + tifp->if_nextents;
694 		req->resblks = 1;
695 		break;
696 	case 2:
697 		/*
698 		 * The temporary file is in local format, but the file being
699 		 * repaired has mapped extents.  To perform the exchange, the
700 		 * temp file must have its shortform data converted to an
701 		 * ondisk block, and the fork changed to extents format.  We
702 		 * need one resblk for the conversion; the number of exchanges
703 		 * is (worst case) the extent count of the file being repaired
704 		 * plus the block we converted.
705 		 */
706 		req->ip1_bcount = 1;
707 		req->ip2_bcount = sc->ip->i_nblocks;
708 		req->nr_exchanges = 1 + ifp->if_nextents;
709 		req->resblks = 1;
710 		break;
711 	case 3:
712 		/*
713 		 * Both forks are in local format.  To perform the exchange,
714 		 * both files must have their shortform data converted to
715 		 * fsblocks, and both forks must be converted to extents
716 		 * format.  We need two resblks for the two conversions, and
717 		 * the number of exchanges is 1 since there's only one block at
718 		 * fileoff 0.  Presumably, the caller could not exchange the
719 		 * two inode fork areas directly.
720 		 */
721 		req->ip1_bcount = 1;
722 		req->ip2_bcount = 1;
723 		req->nr_exchanges = 1;
724 		req->resblks = 2;
725 		break;
726 	}
727 
728 	return xfs_exchmaps_estimate_overhead(req);
729 }
730 
731 /*
732  * Obtain a quota reservation to make sure we don't hit EDQUOT.  We can skip
733  * this if quota enforcement is disabled or if both inodes' dquots are the
734  * same.  The qretry structure must be initialized to zeroes before the first
735  * call to this function.
736  */
737 STATIC int
738 xrep_tempexch_reserve_quota(
739 	struct xfs_scrub		*sc,
740 	const struct xrep_tempexch	*tx)
741 {
742 	struct xfs_trans		*tp = sc->tp;
743 	const struct xfs_exchmaps_req	*req = &tx->req;
744 	int64_t				ddelta, rdelta;
745 	int				error;
746 
747 	/*
748 	 * Don't bother with a quota reservation if we're not enforcing them
749 	 * or the two inodes have the same dquots.
750 	 */
751 	if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 ||
752 	    (req->ip1->i_udquot == req->ip2->i_udquot &&
753 	     req->ip1->i_gdquot == req->ip2->i_gdquot &&
754 	     req->ip1->i_pdquot == req->ip2->i_pdquot))
755 		return 0;
756 
757 	/*
758 	 * Quota reservation for each file comes from two sources.  First, we
759 	 * need to account for any net gain in mapped blocks during the
760 	 * exchange.  Second, we need reservation for the gross gain in mapped
761 	 * blocks so that we don't trip over any quota block reservation
762 	 * assertions.  We must reserve the gross gain because the quota code
763 	 * subtracts from bcount the number of blocks that we unmap; it does
764 	 * not add that quantity back to the quota block reservation.
765 	 */
766 	ddelta = max_t(int64_t, 0, req->ip2_bcount - req->ip1_bcount);
767 	rdelta = max_t(int64_t, 0, req->ip2_rtbcount - req->ip1_rtbcount);
768 	error = xfs_trans_reserve_quota_nblks(tp, req->ip1,
769 			ddelta + req->ip1_bcount, rdelta + req->ip1_rtbcount,
770 			true);
771 	if (error)
772 		return error;
773 
774 	ddelta = max_t(int64_t, 0, req->ip1_bcount - req->ip2_bcount);
775 	rdelta = max_t(int64_t, 0, req->ip1_rtbcount - req->ip2_rtbcount);
776 	return xfs_trans_reserve_quota_nblks(tp, req->ip2,
777 			ddelta + req->ip2_bcount, rdelta + req->ip2_rtbcount,
778 			true);
779 }
780 
781 /*
782  * Prepare an existing transaction for an atomic file contents exchange.
783  *
784  * This function fills out the mapping exchange request and resource estimation
785  * structures in preparation for exchanging the contents of a metadata file
786  * that has been rebuilt in the temp file.  Next, it reserves space and quota
787  * for the transaction.
788  *
789  * The caller must hold ILOCK_EXCL of the scrub target file and the temporary
790  * file.  The caller must join both inodes to the transaction with no unlock
791  * flags, and is responsible for dropping both ILOCKs when appropriate.  Only
792  * use this when those ILOCKs cannot be dropped.
793  */
794 int
795 xrep_tempexch_trans_reserve(
796 	struct xfs_scrub	*sc,
797 	int			whichfork,
798 	struct xrep_tempexch	*tx)
799 {
800 	int			error;
801 
802 	ASSERT(sc->tp != NULL);
803 	xfs_assert_ilocked(sc->ip, XFS_ILOCK_EXCL);
804 	xfs_assert_ilocked(sc->tempip, XFS_ILOCK_EXCL);
805 
806 	error = xrep_tempexch_prep_request(sc, whichfork, tx);
807 	if (error)
808 		return error;
809 
810 	error = xfs_exchmaps_estimate(&tx->req);
811 	if (error)
812 		return error;
813 
814 	error = xfs_trans_reserve_more(sc->tp, tx->req.resblks, 0);
815 	if (error)
816 		return error;
817 
818 	return xrep_tempexch_reserve_quota(sc, tx);
819 }
820 
821 /*
822  * Create a new transaction for a file contents exchange.
823  *
824  * This function fills out the mapping exchange request and resource
825  * estimation structures in preparation for exchanging the contents of a
826  * metadata file that has been rebuilt in the temp file.  Next, it reserves
827  * space, takes ILOCK_EXCL of both inodes, joins them to the transaction and
828  * reserves quota for the transaction.
829  *
830  * The caller is responsible for dropping both ILOCKs when appropriate.
831  */
832 int
833 xrep_tempexch_trans_alloc(
834 	struct xfs_scrub	*sc,
835 	int			whichfork,
836 	struct xrep_tempexch	*tx)
837 {
838 	unsigned int		flags = 0;
839 	int			error;
840 
841 	ASSERT(sc->tp == NULL);
842 	ASSERT(xfs_has_exchange_range(sc->mp));
843 
844 	error = xrep_tempexch_prep_request(sc, whichfork, tx);
845 	if (error)
846 		return error;
847 
848 	error = xrep_tempexch_estimate(sc, tx);
849 	if (error)
850 		return error;
851 
852 	if (xfs_has_lazysbcount(sc->mp))
853 		flags |= XFS_TRANS_RES_FDBLKS;
854 
855 	error = xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
856 			tx->req.resblks, 0, flags, &sc->tp);
857 	if (error)
858 		return error;
859 
860 	sc->temp_ilock_flags |= XFS_ILOCK_EXCL;
861 	sc->ilock_flags |= XFS_ILOCK_EXCL;
862 	xfs_exchrange_ilock(sc->tp, sc->ip, sc->tempip);
863 
864 	return xrep_tempexch_reserve_quota(sc, tx);
865 }
866 
867 /*
868  * Exchange file mappings (and hence file contents) between the file being
869  * repaired and the temporary file.  Returns with both inodes locked and joined
870  * to a clean scrub transaction.
871  */
872 int
873 xrep_tempexch_contents(
874 	struct xfs_scrub	*sc,
875 	struct xrep_tempexch	*tx)
876 {
877 	int			error;
878 
879 	ASSERT(xfs_has_exchange_range(sc->mp));
880 
881 	xfs_exchange_mappings(sc->tp, &tx->req);
882 	error = xfs_defer_finish(&sc->tp);
883 	if (error)
884 		return error;
885 
886 	/*
887 	 * If we exchanged the ondisk sizes of two metadata files, we must
888 	 * exchange the incore sizes as well.
889 	 */
890 	if (tx->req.flags & XFS_EXCHMAPS_SET_SIZES) {
891 		loff_t	temp;
892 
893 		temp = i_size_read(VFS_I(sc->ip));
894 		i_size_write(VFS_I(sc->ip), i_size_read(VFS_I(sc->tempip)));
895 		i_size_write(VFS_I(sc->tempip), temp);
896 	}
897 
898 	return 0;
899 }
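/*
 * Editor's note: the sketch below is illustrative only and is not part of
 * this file.  It shows, roughly, how a repair function might commit rebuilt
 * data fork mappings using the exchange helpers above; xrep_example_exchange()
 * is hypothetical and assumes the temp file has already been populated.
 */
#if 0
STATIC int
xrep_example_exchange(
	struct xfs_scrub	*sc)
{
	struct xrep_tempexch	tx;
	int			error;

	/* Allocate a transaction and take ILOCK_EXCL on both files. */
	error = xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &tx);
	if (error)
		return error;

	/* Exchange the rebuilt mappings with the file being repaired. */
	error = xrep_tempexch_contents(sc, &tx);
	if (error)
		return error;

	/* Commit; scrub teardown releases the locks and the transaction. */
	return xrep_trans_commit(sc);
}
#endif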
900 
901 /*
902  * Write local format data from one of the temporary file's forks into the same
903  * fork of the file being repaired, and exchange the file sizes, if appropriate.
904  * Caller must ensure that the file being repaired has enough fork space to
905  * hold all the bytes.
906  */
907 void
908 xrep_tempfile_copyout_local(
909 	struct xfs_scrub	*sc,
910 	int			whichfork)
911 {
912 	struct xfs_ifork	*temp_ifp;
913 	struct xfs_ifork	*ifp;
914 	unsigned int		ilog_flags = XFS_ILOG_CORE;
915 
916 	temp_ifp = xfs_ifork_ptr(sc->tempip, whichfork);
917 	ifp = xfs_ifork_ptr(sc->ip, whichfork);
918 
919 	ASSERT(temp_ifp != NULL);
920 	ASSERT(ifp != NULL);
921 	ASSERT(temp_ifp->if_format == XFS_DINODE_FMT_LOCAL);
922 	ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
923 
924 	switch (whichfork) {
925 	case XFS_DATA_FORK:
926 		ASSERT(sc->tempip->i_disk_size <=
927 					xfs_inode_data_fork_size(sc->ip));
928 		break;
929 	case XFS_ATTR_FORK:
930 		ASSERT(sc->tempip->i_forkoff >= sc->ip->i_forkoff);
931 		break;
932 	default:
933 		ASSERT(0);
934 		return;
935 	}
936 
937 	/* Recreate @sc->ip's incore fork (ifp) with data from temp_ifp. */
938 	xfs_idestroy_fork(ifp);
939 	xfs_init_local_fork(sc->ip, whichfork, temp_ifp->if_data,
940 			temp_ifp->if_bytes);
941 
942 	if (whichfork == XFS_DATA_FORK) {
943 		i_size_write(VFS_I(sc->ip), i_size_read(VFS_I(sc->tempip)));
944 		sc->ip->i_disk_size = sc->tempip->i_disk_size;
945 	}
946 
947 	ilog_flags |= xfs_ilog_fdata(whichfork);
948 	xfs_trans_log_inode(sc->tp, sc->ip, ilog_flags);
949 }
950 
951 /* Decide if a given XFS inode is a temporary file for a repair. */
952 bool
953 xrep_is_tempfile(
954 	const struct xfs_inode	*ip)
955 {
956 	const struct inode	*inode = &ip->i_vnode;
957 	struct xfs_mount	*mp = ip->i_mount;
958 
959 	/*
960 	 * Files in the metadata directory tree also have S_PRIVATE set and
961 	 * IOP_XATTR unset, so we must distinguish them separately.  We (ab)use
962 	 * the IRECOVERY flag to mark temporary metadir inodes knowing that the
963 	 * end of log recovery clears IRECOVERY, so the only ones that can
964 	 * exist during online repair are the ones we create.
965 	 */
966 	if (xfs_has_metadir(mp) && (ip->i_diflags2 & XFS_DIFLAG2_METADATA))
967 		return __xfs_iflags_test(ip, XFS_IRECOVERY);
968 
969 	if (IS_PRIVATE(inode) && !(inode->i_opflags & IOP_XATTR))
970 		return true;
971 
972 	return false;
973 }
974