xref: /linux/fs/xfs/scrub/tempfile.c (revision b477ff98d903618a1ab8247861f2ea6e70c0f0f8)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (c) 2021-2024 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_log_format.h"
13 #include "xfs_trans.h"
14 #include "xfs_inode.h"
15 #include "xfs_ialloc.h"
16 #include "xfs_quota.h"
17 #include "xfs_bmap.h"
18 #include "xfs_bmap_btree.h"
19 #include "xfs_trans_space.h"
20 #include "xfs_dir2.h"
21 #include "xfs_exchrange.h"
22 #include "xfs_exchmaps.h"
23 #include "xfs_defer.h"
24 #include "xfs_symlink_remote.h"
25 #include "xfs_metafile.h"
26 #include "scrub/scrub.h"
27 #include "scrub/common.h"
28 #include "scrub/repair.h"
29 #include "scrub/trace.h"
30 #include "scrub/tempfile.h"
31 #include "scrub/tempexch.h"
32 #include "scrub/xfile.h"
33 
34 /*
35  * Create a temporary file for reconstructing metadata, with the intention of
36  * atomically exchanging the temporary file's contents with the file that's
37  * being repaired.
38  */
39 int
xrep_tempfile_create(struct xfs_scrub * sc,uint16_t mode)40 xrep_tempfile_create(
41 	struct xfs_scrub	*sc,
42 	uint16_t		mode)
43 {
44 	struct xfs_icreate_args	args = {
45 		.pip		= sc->mp->m_rootip,
46 		.mode		= mode,
47 		.flags		= XFS_ICREATE_TMPFILE | XFS_ICREATE_UNLINKABLE,
48 	};
49 	struct xfs_mount	*mp = sc->mp;
50 	struct xfs_trans	*tp = NULL;
51 	struct xfs_dquot	*udqp;
52 	struct xfs_dquot	*gdqp;
53 	struct xfs_dquot	*pdqp;
54 	struct xfs_trans_res	*tres;
55 	struct xfs_inode	*dp = mp->m_rootip;
56 	xfs_ino_t		ino;
57 	unsigned int		resblks;
58 	bool			is_dir = S_ISDIR(mode);
59 	int			error;
60 
61 	if (xfs_is_shutdown(mp))
62 		return -EIO;
63 	if (xfs_is_readonly(mp))
64 		return -EROFS;
65 
66 	ASSERT(sc->tp == NULL);
67 	ASSERT(sc->tempip == NULL);
68 
69 	/*
70 	 * Make sure that we have allocated dquot(s) on disk.  The temporary
71 	 * inode should be completely root owned so that we don't fail due to
72 	 * quota limits.
73 	 */
74 	error = xfs_icreate_dqalloc(&args, &udqp, &gdqp, &pdqp);
75 	if (error)
76 		return error;
77 
78 	if (is_dir) {
79 		resblks = xfs_mkdir_space_res(mp, 0);
80 		tres = &M_RES(mp)->tr_mkdir;
81 	} else {
82 		resblks = XFS_IALLOC_SPACE_RES(mp);
83 		tres = &M_RES(mp)->tr_create_tmpfile;
84 	}
85 
86 	error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
87 			&tp);
88 	if (error)
89 		goto out_release_dquots;
90 
91 	/* Allocate inode, set up directory. */
92 	error = xfs_dialloc(&tp, &args, &ino);
93 	if (error)
94 		goto out_trans_cancel;
95 	error = xfs_icreate(tp, ino, &args, &sc->tempip);
96 	if (error)
97 		goto out_trans_cancel;
98 
99 	/* We don't touch file data, so drop the realtime flags. */
100 	sc->tempip->i_diflags &= ~(XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT);
101 	xfs_trans_log_inode(tp, sc->tempip, XFS_ILOG_CORE);
102 
103 	/*
104 	 * Mark our temporary file as private so that LSMs and the ACL code
105 	 * don't try to add their own metadata or reason about these files.
106 	 * The file should never be exposed to userspace.
107 	 */
108 	VFS_I(sc->tempip)->i_flags |= S_PRIVATE;
109 	VFS_I(sc->tempip)->i_opflags &= ~IOP_XATTR;
110 
111 	if (is_dir) {
112 		error = xfs_dir_init(tp, sc->tempip, dp);
113 		if (error)
114 			goto out_trans_cancel;
115 	} else if (S_ISLNK(VFS_I(sc->tempip)->i_mode)) {
116 		/*
117 		 * Initialize the temporary symlink with a meaningless target
118 		 * that won't trip the verifiers.  Repair must rewrite the
119 		 * target with meaningful content before swapping with the file
120 		 * being repaired.  A single-byte target will not write a
121 		 * remote target block, so the owner is irrelevant.
122 		 */
123 		error = xfs_symlink_write_target(tp, sc->tempip,
124 				sc->tempip->i_ino, ".", 1, 0, 0);
125 		if (error)
126 			goto out_trans_cancel;
127 	}
128 
129 	/*
130 	 * Attach the dquot(s) to the inodes and modify them incore.
131 	 * These ids of the inode couldn't have changed since the new
132 	 * inode has been locked ever since it was created.
133 	 */
134 	xfs_qm_vop_create_dqattach(tp, sc->tempip, udqp, gdqp, pdqp);
135 
136 	/*
137 	 * Put our temp file on the unlinked list so it's purged automatically.
138 	 * All file-based metadata being reconstructed using this file must be
139 	 * atomically exchanged with the original file because the contents
140 	 * here will be purged when the inode is dropped or log recovery cleans
141 	 * out the unlinked list.
142 	 */
143 	error = xfs_iunlink(tp, sc->tempip);
144 	if (error)
145 		goto out_trans_cancel;
146 
147 	error = xfs_trans_commit(tp);
148 	if (error)
149 		goto out_release_inode;
150 
151 	trace_xrep_tempfile_create(sc);
152 
153 	xfs_qm_dqrele(udqp);
154 	xfs_qm_dqrele(gdqp);
155 	xfs_qm_dqrele(pdqp);
156 
157 	/* Finish setting up the incore / vfs context. */
158 	xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL);
159 	xfs_setup_iops(sc->tempip);
160 	xfs_finish_inode_setup(sc->tempip);
161 
162 	sc->temp_ilock_flags = 0;
163 	return error;
164 
165 out_trans_cancel:
166 	xfs_trans_cancel(tp);
167 out_release_inode:
168 	/*
169 	 * Wait until after the current transaction is aborted to finish the
170 	 * setup of the inode and release the inode.  This prevents recursive
171 	 * transactions and deadlocks from xfs_inactive.
172 	 */
173 	if (sc->tempip) {
174 		xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL);
175 		xfs_finish_inode_setup(sc->tempip);
176 		xchk_irele(sc, sc->tempip);
177 	}
178 out_release_dquots:
179 	xfs_qm_dqrele(udqp);
180 	xfs_qm_dqrele(gdqp);
181 	xfs_qm_dqrele(pdqp);
182 
183 	return error;
184 }
185 
186 /*
187  * Move sc->tempip from the regular directory tree to the metadata directory
188  * tree if sc->ip is part of the metadata directory tree and tempip has an
189  * eligible file mode.
190  *
191  * Temporary files have to be created before we even know which inode we're
192  * going to scrub, so we assume that they will be part of the regular directory
193  * tree.  If it turns out that we're actually scrubbing a file from the
194  * metadata directory tree, we have to subtract the temp file from the root
195  * dquots and detach the dquots prior to setting the METADATA iflag.  However,
196  * the scrub setup functions grab sc->ip and create sc->tempip before we
197  * actually get around to checking if the file mode is the right type for the
198  * scrubber.
199  */
200 int
xrep_tempfile_adjust_directory_tree(struct xfs_scrub * sc)201 xrep_tempfile_adjust_directory_tree(
202 	struct xfs_scrub	*sc)
203 {
204 	int			error;
205 
206 	if (!sc->tempip)
207 		return 0;
208 
209 	ASSERT(sc->tp == NULL);
210 	ASSERT(!xfs_is_metadir_inode(sc->tempip));
211 
212 	if (!sc->ip || !xfs_is_metadir_inode(sc->ip))
213 		return 0;
214 	if (!S_ISDIR(VFS_I(sc->tempip)->i_mode) &&
215 	    !S_ISREG(VFS_I(sc->tempip)->i_mode))
216 		return 0;
217 
218 	xfs_ilock(sc->tempip, XFS_IOLOCK_EXCL);
219 	sc->temp_ilock_flags |= XFS_IOLOCK_EXCL;
220 
221 	error = xchk_trans_alloc(sc, 0);
222 	if (error)
223 		goto out_iolock;
224 
225 	xrep_tempfile_ilock(sc);
226 	xfs_trans_ijoin(sc->tp, sc->tempip, 0);
227 
228 	/* Metadir files are not accounted in quota, so drop icount */
229 	xfs_trans_mod_dquot_byino(sc->tp, sc->tempip, XFS_TRANS_DQ_ICOUNT, -1L);
230 	xfs_metafile_set_iflag(sc->tp, sc->tempip, XFS_METAFILE_UNKNOWN);
231 
232 	error = xrep_trans_commit(sc);
233 	if (error)
234 		goto out_ilock;
235 
236 	xfs_iflags_set(sc->tempip, XFS_IRECOVERY);
237 	xfs_qm_dqdetach(sc->tempip);
238 out_ilock:
239 	xrep_tempfile_iunlock(sc);
240 out_iolock:
241 	xrep_tempfile_iounlock(sc);
242 	return error;
243 }
244 
245 /*
246  * Remove this temporary file from the metadata directory tree so that it can
247  * be inactivated the normal way.
248  */
249 STATIC int
xrep_tempfile_remove_metadir(struct xfs_scrub * sc)250 xrep_tempfile_remove_metadir(
251 	struct xfs_scrub	*sc)
252 {
253 	int			error;
254 
255 	if (!sc->tempip || !xfs_is_metadir_inode(sc->tempip))
256 		return 0;
257 
258 	ASSERT(sc->tp == NULL);
259 
260 	xfs_iflags_clear(sc->tempip, XFS_IRECOVERY);
261 
262 	xfs_ilock(sc->tempip, XFS_IOLOCK_EXCL);
263 	sc->temp_ilock_flags |= XFS_IOLOCK_EXCL;
264 
265 	error = xchk_trans_alloc(sc, 0);
266 	if (error)
267 		goto out_iolock;
268 
269 	xrep_tempfile_ilock(sc);
270 	xfs_trans_ijoin(sc->tp, sc->tempip, 0);
271 
272 	xfs_metafile_clear_iflag(sc->tp, sc->tempip);
273 
274 	/* Non-metadir files are accounted in quota, so bump bcount/icount */
275 	error = xfs_qm_dqattach_locked(sc->tempip, false);
276 	if (error)
277 		goto out_cancel;
278 
279 	xfs_trans_mod_dquot_byino(sc->tp, sc->tempip, XFS_TRANS_DQ_ICOUNT, 1L);
280 	xfs_trans_mod_dquot_byino(sc->tp, sc->tempip, XFS_TRANS_DQ_BCOUNT,
281 			sc->tempip->i_nblocks);
282 	error = xrep_trans_commit(sc);
283 	goto out_ilock;
284 
285 out_cancel:
286 	xchk_trans_cancel(sc);
287 out_ilock:
288 	xrep_tempfile_iunlock(sc);
289 out_iolock:
290 	xrep_tempfile_iounlock(sc);
291 	return error;
292 }
293 
294 /* Take IOLOCK_EXCL on the temporary file, maybe. */
295 bool
xrep_tempfile_iolock_nowait(struct xfs_scrub * sc)296 xrep_tempfile_iolock_nowait(
297 	struct xfs_scrub	*sc)
298 {
299 	if (xfs_ilock_nowait(sc->tempip, XFS_IOLOCK_EXCL)) {
300 		sc->temp_ilock_flags |= XFS_IOLOCK_EXCL;
301 		return true;
302 	}
303 
304 	return false;
305 }
306 
/*
 * Take the temporary file's IOLOCK while holding a different inode's IOLOCK.
 * Nobody else should be holding the tempfile's IOLOCK, but we poll with a
 * trylock anyway to avoid deadlocks and lockdep complaints.
 *
 * Returns 0 once the lock is held, or the error reported by
 * xchk_should_terminate if the scrub should stop.
 */
int
xrep_tempfile_iolock_polled(
	struct xfs_scrub	*sc)
{
	int			error = 0;

	for (;;) {
		if (xrep_tempfile_iolock_nowait(sc))
			return 0;

		/* Didn't get it; give up if we've been told to stop. */
		if (xchk_should_terminate(sc, &error))
			return error;

		delay(1);
	}
}
326 
327 /* Release IOLOCK_EXCL on the temporary file. */
328 void
xrep_tempfile_iounlock(struct xfs_scrub * sc)329 xrep_tempfile_iounlock(
330 	struct xfs_scrub	*sc)
331 {
332 	xfs_iunlock(sc->tempip, XFS_IOLOCK_EXCL);
333 	sc->temp_ilock_flags &= ~XFS_IOLOCK_EXCL;
334 }
335 
336 /* Prepare the temporary file for metadata updates by grabbing ILOCK_EXCL. */
337 void
xrep_tempfile_ilock(struct xfs_scrub * sc)338 xrep_tempfile_ilock(
339 	struct xfs_scrub	*sc)
340 {
341 	sc->temp_ilock_flags |= XFS_ILOCK_EXCL;
342 	xfs_ilock(sc->tempip, XFS_ILOCK_EXCL);
343 }
344 
345 /* Try to grab ILOCK_EXCL on the temporary file. */
346 bool
xrep_tempfile_ilock_nowait(struct xfs_scrub * sc)347 xrep_tempfile_ilock_nowait(
348 	struct xfs_scrub	*sc)
349 {
350 	if (xfs_ilock_nowait(sc->tempip, XFS_ILOCK_EXCL)) {
351 		sc->temp_ilock_flags |= XFS_ILOCK_EXCL;
352 		return true;
353 	}
354 
355 	return false;
356 }
357 
358 /* Unlock ILOCK_EXCL on the temporary file after an update. */
359 void
xrep_tempfile_iunlock(struct xfs_scrub * sc)360 xrep_tempfile_iunlock(
361 	struct xfs_scrub	*sc)
362 {
363 	xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL);
364 	sc->temp_ilock_flags &= ~XFS_ILOCK_EXCL;
365 }
366 
367 /*
368  * Begin the process of making changes to both the file being scrubbed and
369  * the temporary file by taking ILOCK_EXCL on both.
370  */
371 void
xrep_tempfile_ilock_both(struct xfs_scrub * sc)372 xrep_tempfile_ilock_both(
373 	struct xfs_scrub	*sc)
374 {
375 	xfs_lock_two_inodes(sc->ip, XFS_ILOCK_EXCL, sc->tempip, XFS_ILOCK_EXCL);
376 	sc->ilock_flags |= XFS_ILOCK_EXCL;
377 	sc->temp_ilock_flags |= XFS_ILOCK_EXCL;
378 }
379 
380 /* Unlock ILOCK_EXCL on both files. */
381 void
xrep_tempfile_iunlock_both(struct xfs_scrub * sc)382 xrep_tempfile_iunlock_both(
383 	struct xfs_scrub	*sc)
384 {
385 	xrep_tempfile_iunlock(sc);
386 	xchk_iunlock(sc, XFS_ILOCK_EXCL);
387 }
388 
389 /* Release the temporary file. */
390 void
xrep_tempfile_rele(struct xfs_scrub * sc)391 xrep_tempfile_rele(
392 	struct xfs_scrub	*sc)
393 {
394 	if (!sc->tempip)
395 		return;
396 
397 	if (sc->temp_ilock_flags) {
398 		xfs_iunlock(sc->tempip, sc->temp_ilock_flags);
399 		sc->temp_ilock_flags = 0;
400 	}
401 
402 	xrep_tempfile_remove_metadir(sc);
403 	xchk_irele(sc, sc->tempip);
404 	sc->tempip = NULL;
405 }
406 
407 /*
408  * Make sure that the given range of the data fork of the temporary file is
409  * mapped to written blocks.  The caller must ensure that both inodes are
410  * joined to the transaction.
411  */
412 int
xrep_tempfile_prealloc(struct xfs_scrub * sc,xfs_fileoff_t off,xfs_filblks_t len)413 xrep_tempfile_prealloc(
414 	struct xfs_scrub	*sc,
415 	xfs_fileoff_t		off,
416 	xfs_filblks_t		len)
417 {
418 	struct xfs_bmbt_irec	map;
419 	xfs_fileoff_t		end = off + len;
420 	int			error;
421 
422 	ASSERT(sc->tempip != NULL);
423 	ASSERT(!XFS_NOT_DQATTACHED(sc->mp, sc->tempip));
424 
425 	for (; off < end; off = map.br_startoff + map.br_blockcount) {
426 		int		nmaps = 1;
427 
428 		/*
429 		 * If we have a real extent mapping this block then we're
430 		 * in ok shape.
431 		 */
432 		error = xfs_bmapi_read(sc->tempip, off, end - off, &map, &nmaps,
433 				XFS_DATA_FORK);
434 		if (error)
435 			return error;
436 		if (nmaps == 0) {
437 			ASSERT(nmaps != 0);
438 			return -EFSCORRUPTED;
439 		}
440 
441 		if (xfs_bmap_is_written_extent(&map))
442 			continue;
443 
444 		/*
445 		 * If we find a delalloc reservation then something is very
446 		 * very wrong.  Bail out.
447 		 */
448 		if (map.br_startblock == DELAYSTARTBLOCK)
449 			return -EFSCORRUPTED;
450 
451 		/*
452 		 * Make sure this block has a real zeroed extent allocated to
453 		 * it.
454 		 */
455 		nmaps = 1;
456 		error = xfs_bmapi_write(sc->tp, sc->tempip, off, end - off,
457 				XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO, 0, &map,
458 				&nmaps);
459 		if (error)
460 			return error;
461 		if (nmaps != 1)
462 			return -EFSCORRUPTED;
463 
464 		trace_xrep_tempfile_prealloc(sc, XFS_DATA_FORK, &map);
465 
466 		/* Commit new extent and all deferred work. */
467 		error = xfs_defer_finish(&sc->tp);
468 		if (error)
469 			return error;
470 	}
471 
472 	return 0;
473 }
474 
475 /*
476  * Write data to each block of a file.  The given range of the tempfile's data
477  * fork must already be populated with written extents.
478  */
479 int
xrep_tempfile_copyin(struct xfs_scrub * sc,xfs_fileoff_t off,xfs_filblks_t len,xrep_tempfile_copyin_fn prep_fn,void * data)480 xrep_tempfile_copyin(
481 	struct xfs_scrub	*sc,
482 	xfs_fileoff_t		off,
483 	xfs_filblks_t		len,
484 	xrep_tempfile_copyin_fn	prep_fn,
485 	void			*data)
486 {
487 	LIST_HEAD(buffers_list);
488 	struct xfs_mount	*mp = sc->mp;
489 	struct xfs_buf		*bp;
490 	xfs_fileoff_t		flush_mask;
491 	xfs_fileoff_t		end = off + len;
492 	loff_t			pos = XFS_FSB_TO_B(mp, off);
493 	int			error = 0;
494 
495 	ASSERT(S_ISREG(VFS_I(sc->tempip)->i_mode));
496 
497 	/* Flush buffers to disk every 512K */
498 	flush_mask = XFS_B_TO_FSBT(mp, (1U << 19)) - 1;
499 
500 	for (; off < end; off++, pos += mp->m_sb.sb_blocksize) {
501 		struct xfs_bmbt_irec	map;
502 		int			nmaps = 1;
503 
504 		/* Read block mapping for this file block. */
505 		error = xfs_bmapi_read(sc->tempip, off, 1, &map, &nmaps, 0);
506 		if (error)
507 			goto out_err;
508 		if (nmaps == 0 || !xfs_bmap_is_written_extent(&map)) {
509 			error = -EFSCORRUPTED;
510 			goto out_err;
511 		}
512 
513 		/* Get the metadata buffer for this offset in the file. */
514 		error = xfs_trans_get_buf(sc->tp, mp->m_ddev_targp,
515 				XFS_FSB_TO_DADDR(mp, map.br_startblock),
516 				mp->m_bsize, 0, &bp);
517 		if (error)
518 			goto out_err;
519 
520 		trace_xrep_tempfile_copyin(sc, XFS_DATA_FORK, &map);
521 
522 		/* Read in a block's worth of data from the xfile. */
523 		error = prep_fn(sc, bp, data);
524 		if (error) {
525 			xfs_trans_brelse(sc->tp, bp);
526 			goto out_err;
527 		}
528 
529 		/* Queue buffer, and flush if we have too much dirty data. */
530 		xfs_buf_delwri_queue_here(bp, &buffers_list);
531 		xfs_trans_brelse(sc->tp, bp);
532 
533 		if (!(off & flush_mask)) {
534 			error = xfs_buf_delwri_submit(&buffers_list);
535 			if (error)
536 				goto out_err;
537 		}
538 	}
539 
540 	/*
541 	 * Write the new blocks to disk.  If the ordered list isn't empty after
542 	 * that, then something went wrong and we have to fail.  This should
543 	 * never happen, but we'll check anyway.
544 	 */
545 	error = xfs_buf_delwri_submit(&buffers_list);
546 	if (error)
547 		goto out_err;
548 
549 	if (!list_empty(&buffers_list)) {
550 		ASSERT(list_empty(&buffers_list));
551 		error = -EIO;
552 		goto out_err;
553 	}
554 
555 	return 0;
556 
557 out_err:
558 	xfs_buf_delwri_cancel(&buffers_list);
559 	return error;
560 }
561 
562 /*
563  * Set the temporary file's size.  Caller must join the tempfile to the scrub
564  * transaction and is responsible for adjusting block mappings as needed.
565  */
566 int
xrep_tempfile_set_isize(struct xfs_scrub * sc,unsigned long long isize)567 xrep_tempfile_set_isize(
568 	struct xfs_scrub	*sc,
569 	unsigned long long	isize)
570 {
571 	if (sc->tempip->i_disk_size == isize)
572 		return 0;
573 
574 	sc->tempip->i_disk_size = isize;
575 	i_size_write(VFS_I(sc->tempip), isize);
576 	return xrep_tempfile_roll_trans(sc);
577 }
578 
579 /*
580  * Roll a repair transaction involving the temporary file.  Caller must join
581  * both the temporary file and the file being scrubbed to the transaction.
582  * This function return with both inodes joined to a new scrub transaction,
583  * or the usual negative errno.
584  */
585 int
xrep_tempfile_roll_trans(struct xfs_scrub * sc)586 xrep_tempfile_roll_trans(
587 	struct xfs_scrub	*sc)
588 {
589 	int			error;
590 
591 	xfs_trans_log_inode(sc->tp, sc->tempip, XFS_ILOG_CORE);
592 	error = xrep_roll_trans(sc);
593 	if (error)
594 		return error;
595 
596 	xfs_trans_ijoin(sc->tp, sc->tempip, 0);
597 	return 0;
598 }
599 
600 /*
601  * Fill out the mapping exchange request in preparation for atomically
602  * committing the contents of a metadata file that we've rebuilt in the temp
603  * file.
604  */
605 STATIC int
xrep_tempexch_prep_request(struct xfs_scrub * sc,int whichfork,xfs_fileoff_t off,xfs_filblks_t len,struct xrep_tempexch * tx)606 xrep_tempexch_prep_request(
607 	struct xfs_scrub	*sc,
608 	int			whichfork,
609 	xfs_fileoff_t		off,
610 	xfs_filblks_t		len,
611 	struct xrep_tempexch	*tx)
612 {
613 	struct xfs_exchmaps_req	*req = &tx->req;
614 
615 	memset(tx, 0, sizeof(struct xrep_tempexch));
616 
617 	/* COW forks don't exist on disk. */
618 	if (whichfork == XFS_COW_FORK) {
619 		ASSERT(0);
620 		return -EINVAL;
621 	}
622 
623 	/* Both files should have the relevant forks. */
624 	if (!xfs_ifork_ptr(sc->ip, whichfork) ||
625 	    !xfs_ifork_ptr(sc->tempip, whichfork)) {
626 		ASSERT(xfs_ifork_ptr(sc->ip, whichfork) != NULL);
627 		ASSERT(xfs_ifork_ptr(sc->tempip, whichfork) != NULL);
628 		return -EINVAL;
629 	}
630 
631 	/* Exchange all mappings in both forks. */
632 	req->ip1 = sc->tempip;
633 	req->ip2 = sc->ip;
634 	req->startoff1 = off;
635 	req->startoff2 = off;
636 	switch (whichfork) {
637 	case XFS_ATTR_FORK:
638 		req->flags |= XFS_EXCHMAPS_ATTR_FORK;
639 		break;
640 	case XFS_DATA_FORK:
641 		/* Exchange sizes when exchanging all data fork mappings. */
642 		if (off == 0 && len == XFS_MAX_FILEOFF)
643 			req->flags |= XFS_EXCHMAPS_SET_SIZES;
644 		break;
645 	}
646 	req->blockcount = len;
647 
648 	return 0;
649 }
650 
651 /*
652  * Fill out the mapping exchange resource estimation structures in preparation
653  * for exchanging the contents of a metadata file that we've rebuilt in the
654  * temp file.  Caller must hold IOLOCK_EXCL but not ILOCK_EXCL on both files.
655  */
656 STATIC int
xrep_tempexch_estimate(struct xfs_scrub * sc,struct xrep_tempexch * tx)657 xrep_tempexch_estimate(
658 	struct xfs_scrub	*sc,
659 	struct xrep_tempexch	*tx)
660 {
661 	struct xfs_exchmaps_req	*req = &tx->req;
662 	struct xfs_ifork	*ifp;
663 	struct xfs_ifork	*tifp;
664 	int			whichfork = xfs_exchmaps_reqfork(req);
665 	int			state = 0;
666 
667 	/*
668 	 * The exchmaps code only knows how to exchange file fork space
669 	 * mappings.  Any fork data in local format must be promoted to a
670 	 * single block before the exchange can take place.
671 	 */
672 	ifp = xfs_ifork_ptr(sc->ip, whichfork);
673 	if (ifp->if_format == XFS_DINODE_FMT_LOCAL)
674 		state |= 1;
675 
676 	tifp = xfs_ifork_ptr(sc->tempip, whichfork);
677 	if (tifp->if_format == XFS_DINODE_FMT_LOCAL)
678 		state |= 2;
679 
680 	switch (state) {
681 	case 0:
682 		/* Both files have mapped extents; use the regular estimate. */
683 		return xfs_exchrange_estimate(req);
684 	case 1:
685 		/*
686 		 * The file being repaired is in local format, but the temp
687 		 * file has mapped extents.  To perform the exchange, the file
688 		 * being repaired must have its shorform data converted to an
689 		 * ondisk block so that the forks will be in extents format.
690 		 * We need one resblk for the conversion; the number of
691 		 * exchanges is (worst case) the temporary file's extent count
692 		 * plus the block we converted.
693 		 */
694 		req->ip1_bcount = sc->tempip->i_nblocks;
695 		req->ip2_bcount = 1;
696 		req->nr_exchanges = 1 + tifp->if_nextents;
697 		req->resblks = 1;
698 		break;
699 	case 2:
700 		/*
701 		 * The temporary file is in local format, but the file being
702 		 * repaired has mapped extents.  To perform the exchange, the
703 		 * temp file must have its shortform data converted to an
704 		 * ondisk block, and the fork changed to extents format.  We
705 		 * need one resblk for the conversion; the number of exchanges
706 		 * is (worst case) the extent count of the file being repaired
707 		 * plus the block we converted.
708 		 */
709 		req->ip1_bcount = 1;
710 		req->ip2_bcount = sc->ip->i_nblocks;
711 		req->nr_exchanges = 1 + ifp->if_nextents;
712 		req->resblks = 1;
713 		break;
714 	case 3:
715 		/*
716 		 * Both forks are in local format.  To perform the exchange,
717 		 * both files must have their shortform data converted to
718 		 * fsblocks, and both forks must be converted to extents
719 		 * format.  We need two resblks for the two conversions, and
720 		 * the number of exchanges is 1 since there's only one block at
721 		 * fileoff 0.  Presumably, the caller could not exchange the
722 		 * two inode fork areas directly.
723 		 */
724 		req->ip1_bcount = 1;
725 		req->ip2_bcount = 1;
726 		req->nr_exchanges = 1;
727 		req->resblks = 2;
728 		break;
729 	}
730 
731 	return xfs_exchmaps_estimate_overhead(req);
732 }
733 
734 /*
735  * Obtain a quota reservation to make sure we don't hit EDQUOT.  We can skip
736  * this if quota enforcement is disabled or if both inodes' dquots are the
737  * same.  The qretry structure must be initialized to zeroes before the first
738  * call to this function.
739  */
740 STATIC int
xrep_tempexch_reserve_quota(struct xfs_scrub * sc,const struct xrep_tempexch * tx)741 xrep_tempexch_reserve_quota(
742 	struct xfs_scrub		*sc,
743 	const struct xrep_tempexch	*tx)
744 {
745 	struct xfs_trans		*tp = sc->tp;
746 	const struct xfs_exchmaps_req	*req = &tx->req;
747 	int64_t				ddelta, rdelta;
748 	int				error;
749 
750 	/*
751 	 * Don't bother with a quota reservation if we're not enforcing them
752 	 * or the two inodes have the same dquots.
753 	 */
754 	if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 ||
755 	    xfs_is_metadir_inode(req->ip1) ||
756 	    (req->ip1->i_udquot == req->ip2->i_udquot &&
757 	     req->ip1->i_gdquot == req->ip2->i_gdquot &&
758 	     req->ip1->i_pdquot == req->ip2->i_pdquot))
759 		return 0;
760 
761 	/*
762 	 * Quota reservation for each file comes from two sources.  First, we
763 	 * need to account for any net gain in mapped blocks during the
764 	 * exchange.  Second, we need reservation for the gross gain in mapped
765 	 * blocks so that we don't trip over any quota block reservation
766 	 * assertions.  We must reserve the gross gain because the quota code
767 	 * subtracts from bcount the number of blocks that we unmap; it does
768 	 * not add that quantity back to the quota block reservation.
769 	 */
770 	ddelta = max_t(int64_t, 0, req->ip2_bcount - req->ip1_bcount);
771 	rdelta = max_t(int64_t, 0, req->ip2_rtbcount - req->ip1_rtbcount);
772 	error = xfs_trans_reserve_quota_nblks(tp, req->ip1,
773 			ddelta + req->ip1_bcount, rdelta + req->ip1_rtbcount,
774 			true);
775 	if (error)
776 		return error;
777 
778 	ddelta = max_t(int64_t, 0, req->ip1_bcount - req->ip2_bcount);
779 	rdelta = max_t(int64_t, 0, req->ip1_rtbcount - req->ip2_rtbcount);
780 	return xfs_trans_reserve_quota_nblks(tp, req->ip2,
781 			ddelta + req->ip2_bcount, rdelta + req->ip2_rtbcount,
782 			true);
783 }
784 
785 /*
786  * Prepare an existing transaction for an atomic file contents exchange.
787  *
788  * This function fills out the mapping exchange request and resource estimation
789  * structures in preparation for exchanging the contents of a metadata file
790  * that has been rebuilt in the temp file.  Next, it reserves space and quota
791  * for the transaction.
792  *
793  * The caller must hold ILOCK_EXCL of the scrub target file and the temporary
794  * file.  The caller must join both inodes to the transaction with no unlock
795  * flags, and is responsible for dropping both ILOCKs when appropriate.  Only
796  * use this when those ILOCKs cannot be dropped.
797  */
798 int
xrep_tempexch_trans_reserve(struct xfs_scrub * sc,int whichfork,xfs_fileoff_t off,xfs_filblks_t len,struct xrep_tempexch * tx)799 xrep_tempexch_trans_reserve(
800 	struct xfs_scrub	*sc,
801 	int			whichfork,
802 	xfs_fileoff_t		off,
803 	xfs_filblks_t		len,
804 	struct xrep_tempexch	*tx)
805 {
806 	int			error;
807 
808 	ASSERT(sc->tp != NULL);
809 	xfs_assert_ilocked(sc->ip, XFS_ILOCK_EXCL);
810 	xfs_assert_ilocked(sc->tempip, XFS_ILOCK_EXCL);
811 
812 	error = xrep_tempexch_prep_request(sc, whichfork, off, len, tx);
813 	if (error)
814 		return error;
815 
816 	error = xfs_exchmaps_estimate(&tx->req);
817 	if (error)
818 		return error;
819 
820 	error = xfs_trans_reserve_more(sc->tp, tx->req.resblks, 0);
821 	if (error)
822 		return error;
823 
824 	return xrep_tempexch_reserve_quota(sc, tx);
825 }
826 
827 /*
828  * Create a new transaction for a file contents exchange.
829  *
830  * This function fills out the mapping excahange request and resource
831  * estimation structures in preparation for exchanging the contents of a
832  * metadata file that has been rebuilt in the temp file.  Next, it reserves
833  * space, takes ILOCK_EXCL of both inodes, joins them to the transaction and
834  * reserves quota for the transaction.
835  *
836  * The caller is responsible for dropping both ILOCKs when appropriate.
837  */
838 int
xrep_tempexch_trans_alloc(struct xfs_scrub * sc,int whichfork,struct xrep_tempexch * tx)839 xrep_tempexch_trans_alloc(
840 	struct xfs_scrub	*sc,
841 	int			whichfork,
842 	struct xrep_tempexch	*tx)
843 {
844 	unsigned int		flags = 0;
845 	int			error;
846 
847 	ASSERT(sc->tp == NULL);
848 	ASSERT(xfs_has_exchange_range(sc->mp));
849 
850 	error = xrep_tempexch_prep_request(sc, whichfork, 0, XFS_MAX_FILEOFF,
851 			tx);
852 	if (error)
853 		return error;
854 
855 	error = xrep_tempexch_estimate(sc, tx);
856 	if (error)
857 		return error;
858 
859 	if (xfs_has_lazysbcount(sc->mp))
860 		flags |= XFS_TRANS_RES_FDBLKS;
861 
862 	error = xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
863 			tx->req.resblks, 0, flags, &sc->tp);
864 	if (error)
865 		return error;
866 
867 	sc->temp_ilock_flags |= XFS_ILOCK_EXCL;
868 	sc->ilock_flags |= XFS_ILOCK_EXCL;
869 	xfs_exchrange_ilock(sc->tp, sc->ip, sc->tempip);
870 
871 	return xrep_tempexch_reserve_quota(sc, tx);
872 }
873 
874 /*
875  * Exchange file mappings (and hence file contents) between the file being
876  * repaired and the temporary file.  Returns with both inodes locked and joined
877  * to a clean scrub transaction.
878  */
879 int
xrep_tempexch_contents(struct xfs_scrub * sc,struct xrep_tempexch * tx)880 xrep_tempexch_contents(
881 	struct xfs_scrub	*sc,
882 	struct xrep_tempexch	*tx)
883 {
884 	int			error;
885 
886 	ASSERT(xfs_has_exchange_range(sc->mp));
887 
888 	xfs_exchange_mappings(sc->tp, &tx->req);
889 	error = xfs_defer_finish(&sc->tp);
890 	if (error)
891 		return error;
892 
893 	/*
894 	 * If we exchanged the ondisk sizes of two metadata files, we must
895 	 * exchanged the incore sizes as well.
896 	 */
897 	if (tx->req.flags & XFS_EXCHMAPS_SET_SIZES) {
898 		loff_t	temp;
899 
900 		temp = i_size_read(VFS_I(sc->ip));
901 		i_size_write(VFS_I(sc->ip), i_size_read(VFS_I(sc->tempip)));
902 		i_size_write(VFS_I(sc->tempip), temp);
903 	}
904 
905 	return 0;
906 }
907 
908 /*
909  * Write local format data from one of the temporary file's forks into the same
910  * fork of file being repaired, and exchange the file sizes, if appropriate.
911  * Caller must ensure that the file being repaired has enough fork space to
912  * hold all the bytes.
913  */
914 void
xrep_tempfile_copyout_local(struct xfs_scrub * sc,int whichfork)915 xrep_tempfile_copyout_local(
916 	struct xfs_scrub	*sc,
917 	int			whichfork)
918 {
919 	struct xfs_ifork	*temp_ifp;
920 	struct xfs_ifork	*ifp;
921 	unsigned int		ilog_flags = XFS_ILOG_CORE;
922 
923 	temp_ifp = xfs_ifork_ptr(sc->tempip, whichfork);
924 	ifp = xfs_ifork_ptr(sc->ip, whichfork);
925 
926 	ASSERT(temp_ifp != NULL);
927 	ASSERT(ifp != NULL);
928 	ASSERT(temp_ifp->if_format == XFS_DINODE_FMT_LOCAL);
929 	ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
930 
931 	switch (whichfork) {
932 	case XFS_DATA_FORK:
933 		ASSERT(sc->tempip->i_disk_size <=
934 					xfs_inode_data_fork_size(sc->ip));
935 		break;
936 	case XFS_ATTR_FORK:
937 		ASSERT(sc->tempip->i_forkoff >= sc->ip->i_forkoff);
938 		break;
939 	default:
940 		ASSERT(0);
941 		return;
942 	}
943 
944 	/* Recreate @sc->ip's incore fork (ifp) with data from temp_ifp. */
945 	xfs_idestroy_fork(ifp);
946 	xfs_init_local_fork(sc->ip, whichfork, temp_ifp->if_data,
947 			temp_ifp->if_bytes);
948 
949 	if (whichfork == XFS_DATA_FORK) {
950 		i_size_write(VFS_I(sc->ip), i_size_read(VFS_I(sc->tempip)));
951 		sc->ip->i_disk_size = sc->tempip->i_disk_size;
952 	}
953 
954 	ilog_flags |= xfs_ilog_fdata(whichfork);
955 	xfs_trans_log_inode(sc->tp, sc->ip, ilog_flags);
956 }
957 
958 /* Decide if a given XFS inode is a temporary file for a repair. */
959 bool
xrep_is_tempfile(const struct xfs_inode * ip)960 xrep_is_tempfile(
961 	const struct xfs_inode	*ip)
962 {
963 	const struct inode	*inode = &ip->i_vnode;
964 	struct xfs_mount	*mp = ip->i_mount;
965 
966 	/*
967 	 * Files in the metadata directory tree also have S_PRIVATE set and
968 	 * IOP_XATTR unset, so we must distinguish them separately.  We (ab)use
969 	 * the IRECOVERY flag to mark temporary metadir inodes knowing that the
970 	 * end of log recovery clears IRECOVERY, so the only ones that can
971 	 * exist during online repair are the ones we create.
972 	 */
973 	if (xfs_has_metadir(mp) && (ip->i_diflags2 & XFS_DIFLAG2_METADATA))
974 		return __xfs_iflags_test(ip, XFS_IRECOVERY);
975 
976 	if (IS_PRIVATE(inode) && !(inode->i_opflags & IOP_XATTR))
977 		return true;
978 
979 	return false;
980 }
981