xref: /linux/fs/xfs/xfs_exchrange.c (revision c94cd9508b1335b949fd13ebd269313c65492df0)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs.h"
7 #include "xfs_shared.h"
8 #include "xfs_format.h"
9 #include "xfs_log_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_defer.h"
13 #include "xfs_inode.h"
14 #include "xfs_trans.h"
15 #include "xfs_quota.h"
16 #include "xfs_bmap_util.h"
17 #include "xfs_reflink.h"
18 #include "xfs_trace.h"
19 #include "xfs_exchrange.h"
20 #include "xfs_exchmaps.h"
21 #include "xfs_sb.h"
22 #include "xfs_icache.h"
23 #include "xfs_log.h"
24 #include "xfs_rtbitmap.h"
25 #include <linux/fsnotify.h>
26 
27 /* Lock (and optionally join) two inodes for a file range exchange. */
28 void
29 xfs_exchrange_ilock(
30 	struct xfs_trans	*tp,
31 	struct xfs_inode	*ip1,
32 	struct xfs_inode	*ip2)
33 {
34 	if (ip1 != ip2)
35 		xfs_lock_two_inodes(ip1, XFS_ILOCK_EXCL,
36 				    ip2, XFS_ILOCK_EXCL);
37 	else
38 		xfs_ilock(ip1, XFS_ILOCK_EXCL);
39 	if (tp) {
40 		xfs_trans_ijoin(tp, ip1, 0);
41 		if (ip2 != ip1)
42 			xfs_trans_ijoin(tp, ip2, 0);
43 	}
44 
45 }
46 
47 /* Unlock two inodes after a file range exchange operation. */
48 void
49 xfs_exchrange_iunlock(
50 	struct xfs_inode	*ip1,
51 	struct xfs_inode	*ip2)
52 {
53 	if (ip2 != ip1)
54 		xfs_iunlock(ip2, XFS_ILOCK_EXCL);
55 	xfs_iunlock(ip1, XFS_ILOCK_EXCL);
56 }
57 
58 /*
59  * Estimate the resource requirements to exchange file contents between the two
60  * files.  The caller is required to hold the IOLOCK and the MMAPLOCK and to
61  * have flushed both inodes' pagecache and active direct-ios.
62  */
63 int
64 xfs_exchrange_estimate(
65 	struct xfs_exchmaps_req	*req)
66 {
67 	int			error;
68 
69 	xfs_exchrange_ilock(NULL, req->ip1, req->ip2);
70 	error = xfs_exchmaps_estimate(req);
71 	xfs_exchrange_iunlock(req->ip1, req->ip2);
72 	return error;
73 }
74 
75 /*
76  * Check that file2's metadata agree with the snapshot that we took for the
77  * range commit request.
78  *
79  * This should be called after the filesystem has locked /all/ inode metadata
80  * against modification.
81  */
82 STATIC int
83 xfs_exchrange_check_freshness(
84 	const struct xfs_exchrange	*fxr,
85 	struct xfs_inode		*ip2)
86 {
87 	struct inode			*inode2 = VFS_I(ip2);
88 	struct timespec64		ctime = inode_get_ctime(inode2);
89 	struct timespec64		mtime = inode_get_mtime(inode2);
90 
91 	trace_xfs_exchrange_freshness(fxr, ip2);
92 
93 	/* Check that file2 hasn't otherwise been modified. */
94 	if (fxr->file2_ino != ip2->i_ino ||
95 	    fxr->file2_gen != inode2->i_generation ||
96 	    !timespec64_equal(&fxr->file2_ctime, &ctime) ||
97 	    !timespec64_equal(&fxr->file2_mtime, &mtime))
98 		return -EBUSY;
99 
100 	return 0;
101 }
102 
103 #define QRETRY_IP1	(0x1)
104 #define QRETRY_IP2	(0x2)
105 
106 /*
107  * Obtain a quota reservation to make sure we don't hit EDQUOT.  We can skip
108  * this if quota enforcement is disabled or if both inodes' dquots are the
109  * same.  The qretry structure must be initialized to zeroes before the first
110  * call to this function.
111  */
112 STATIC int
113 xfs_exchrange_reserve_quota(
114 	struct xfs_trans		*tp,
115 	const struct xfs_exchmaps_req	*req,
116 	unsigned int			*qretry)
117 {
118 	int64_t				ddelta, rdelta;
119 	int				ip1_error = 0;
120 	int				error;
121 
122 	/*
123 	 * Don't bother with a quota reservation if we're not enforcing them
124 	 * or the two inodes have the same dquots.
125 	 */
126 	if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 ||
127 	    (req->ip1->i_udquot == req->ip2->i_udquot &&
128 	     req->ip1->i_gdquot == req->ip2->i_gdquot &&
129 	     req->ip1->i_pdquot == req->ip2->i_pdquot))
130 		return 0;
131 
132 	*qretry = 0;
133 
134 	/*
135 	 * For each file, compute the net gain in the number of regular blocks
136 	 * that will be mapped into that file and reserve that much quota.  The
137 	 * quota counts must be able to absorb at least that much space.
138 	 */
139 	ddelta = req->ip2_bcount - req->ip1_bcount;
140 	rdelta = req->ip2_rtbcount - req->ip1_rtbcount;
141 	if (ddelta > 0 || rdelta > 0) {
142 		error = xfs_trans_reserve_quota_nblks(tp, req->ip1,
143 				ddelta > 0 ? ddelta : 0,
144 				rdelta > 0 ? rdelta : 0,
145 				false);
146 		if (error == -EDQUOT || error == -ENOSPC) {
147 			/*
148 			 * Save this error and see what happens if we try to
149 			 * reserve quota for ip2.  Then report both.
150 			 */
151 			*qretry |= QRETRY_IP1;
152 			ip1_error = error;
153 			error = 0;
154 		}
155 		if (error)
156 			return error;
157 	}
158 	if (ddelta < 0 || rdelta < 0) {
159 		error = xfs_trans_reserve_quota_nblks(tp, req->ip2,
160 				ddelta < 0 ? -ddelta : 0,
161 				rdelta < 0 ? -rdelta : 0,
162 				false);
163 		if (error == -EDQUOT || error == -ENOSPC)
164 			*qretry |= QRETRY_IP2;
165 		if (error)
166 			return error;
167 	}
168 	if (ip1_error)
169 		return ip1_error;
170 
171 	/*
172 	 * For each file, forcibly reserve the gross gain in mapped blocks so
173 	 * that we don't trip over any quota block reservation assertions.
174 	 * We must reserve the gross gain because the quota code subtracts from
175 	 * bcount the number of blocks that we unmap; it does not add that
176 	 * quantity back to the quota block reservation.
177 	 */
178 	error = xfs_trans_reserve_quota_nblks(tp, req->ip1, req->ip1_bcount,
179 			req->ip1_rtbcount, true);
180 	if (error)
181 		return error;
182 
183 	return xfs_trans_reserve_quota_nblks(tp, req->ip2, req->ip2_bcount,
184 			req->ip2_rtbcount, true);
185 }
186 
/*
 * Exchange the mappings (and hence the contents) of two files' forks.
 *
 * @fxr:	exchange request; its byte offsets and length are converted to
 *		filesystem blocks to build the xfs_exchmaps_req used below
 * @ip1, @ip2:	the two inodes taking part in the exchange
 *
 * The function estimates resource needs, allocates a transaction, takes the
 * ILOCK on both inodes, reserves quota (retrying once after a blockgc pass if
 * the reservation fails with -EDQUOT or -ENOSPC), optionally bumps the
 * timestamps, performs the exchange, and commits.  If the request has
 * XFS_EXCHANGE_RANGE_DRY_RUN set, the transaction is cancelled once all the
 * checks have passed and 0 is returned.
 *
 * Returns 0 on success or a negative errno.  The ILOCKs are dropped before
 * returning from any path that took them.
 */
STATIC int
xfs_exchrange_mappings(
	const struct xfs_exchrange	*fxr,
	struct xfs_inode		*ip1,
	struct xfs_inode		*ip2)
{
	struct xfs_mount		*mp = ip1->i_mount;
	struct xfs_exchmaps_req		req = {
		.ip1			= ip1,
		.ip2			= ip2,
		.startoff1		= XFS_B_TO_FSBT(mp, fxr->file1_offset),
		.startoff2		= XFS_B_TO_FSBT(mp, fxr->file2_offset),
		.blockcount		= XFS_B_TO_FSB(mp, fxr->length),
	};
	struct xfs_trans		*tp;
	unsigned int			qretry;
	bool				retried = false;
	int				error;

	trace_xfs_exchrange_mappings(fxr, ip1, ip2);

	/* Translate the caller-visible flags into exchmaps request flags. */
	if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
		req.flags |= XFS_EXCHMAPS_SET_SIZES;
	if (fxr->flags & XFS_EXCHANGE_RANGE_FILE1_WRITTEN)
		req.flags |= XFS_EXCHMAPS_INO1_WRITTEN;

	/*
	 * Round the request length up to the nearest file allocation unit.
	 * The prep function already checked that the request offsets and
	 * length in @fxr are safe to round up.
	 */
	if (xfs_inode_has_bigrtalloc(ip2))
		req.blockcount = xfs_rtb_roundup_rtx(mp, req.blockcount);

	/* Takes and drops the ILOCKs itself; req.resblks is consumed below. */
	error = xfs_exchrange_estimate(&req);
	if (error)
		return error;

retry:
	/* Allocate the transaction, lock the inodes, and join them. */
	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, req.resblks, 0,
			XFS_TRANS_RES_FDBLKS, &tp);
	if (error)
		return error;

	xfs_exchrange_ilock(tp, ip1, ip2);

	trace_xfs_exchrange_before(ip2, 2);
	trace_xfs_exchrange_before(ip1, 1);

	error = xfs_exchmaps_check_forks(mp, &req);
	if (error)
		goto out_trans_cancel;

	/*
	 * Reserve ourselves some quota if any of them are in enforcing mode.
	 * In theory we only need enough to satisfy the change in the number
	 * of blocks between the two ranges being remapped.
	 */
	error = xfs_exchrange_reserve_quota(tp, &req, &qretry);
	if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
		/*
		 * Back out completely, ask blockgc to free quota for whichever
		 * file(s) came up short, and try the whole thing once more.
		 */
		xfs_trans_cancel(tp);
		xfs_exchrange_iunlock(ip1, ip2);
		if (qretry & QRETRY_IP1)
			xfs_blockgc_free_quota(ip1, 0);
		if (qretry & QRETRY_IP2)
			xfs_blockgc_free_quota(ip2, 0);
		retried = true;
		goto retry;
	}
	if (error)
		goto out_trans_cancel;

	/* If we got this far on a dry run, all parameters are ok. */
	if (fxr->flags & XFS_EXCHANGE_RANGE_DRY_RUN)
		goto out_trans_cancel;

	/* Update the mtime and ctime of both files. */
	if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME1)
		xfs_trans_ichgtime(tp, ip1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME2)
		xfs_trans_ichgtime(tp, ip2, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);

	xfs_exchange_mappings(tp, &req);

	/*
	 * Force the log to persist metadata updates if the caller or the
	 * administrator requires this.  The generic prep function already
	 * flushed the relevant parts of the page cache.
	 */
	if (xfs_has_wsync(mp) || (fxr->flags & XFS_EXCHANGE_RANGE_DSYNC))
		xfs_trans_set_sync(tp);

	error = xfs_trans_commit(tp);

	/* These tracepoints fire whether or not the commit succeeded. */
	trace_xfs_exchrange_after(ip2, 2);
	trace_xfs_exchrange_after(ip1, 1);

	if (error)
		goto out_unlock;

	/*
	 * If the caller wanted us to exchange the contents of two complete
	 * files of unequal length, exchange the incore sizes now.  This should
	 * be safe because we flushed both files' page caches, exchanged all
	 * the mappings, and updated the ondisk sizes.
	 */
	if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
		loff_t	temp;

		temp = i_size_read(VFS_I(ip2));
		i_size_write(VFS_I(ip2), i_size_read(VFS_I(ip1)));
		i_size_write(VFS_I(ip1), temp);
	}

out_unlock:
	/* The inodes were joined with no lock flags, so unlock them here. */
	xfs_exchrange_iunlock(ip1, ip2);
	return error;

out_trans_cancel:
	xfs_trans_cancel(tp);
	goto out_unlock;
}
311 
312 /*
313  * Generic code for exchanging ranges of two files via XFS_IOC_EXCHANGE_RANGE.
314  * This part deals with struct file objects and byte ranges and does not deal
315  * with XFS-specific data structures such as xfs_inodes and block ranges.  This
316  * separation may some day facilitate porting to another filesystem.
317  *
318  * The goal is to exchange fxr.length bytes starting at fxr.file1_offset in
319  * file1 with the same number of bytes starting at fxr.file2_offset in file2.
320  * Implementations must call xfs_exchange_range_prep to prepare the two
321  * files prior to taking locks; and they must update the inode change and mod
322  * times of both files as part of the metadata update.  The timestamp update
323  * and freshness checks must be done atomically as part of the data exchange
324  * operation to ensure correctness of the freshness check.
325  * xfs_exchange_range_finish must be called after the operation completes
326  * successfully but before locks are dropped.
327  */
328 
329 /* Verify that we have security clearance to perform this operation. */
330 static int
331 xfs_exchange_range_verify_area(
332 	struct xfs_exchrange	*fxr)
333 {
334 	int			ret;
335 
336 	ret = remap_verify_area(fxr->file1, fxr->file1_offset, fxr->length,
337 			true);
338 	if (ret)
339 		return ret;
340 
341 	return remap_verify_area(fxr->file2, fxr->file2_offset, fxr->length,
342 			true);
343 }
344 
/*
 * Performs necessary checks before doing a range exchange, having stabilized
 * mutable inode attributes via i_rwsem.
 *
 * @fxr:	the exchange request.  If XFS_EXCHANGE_RANGE_TO_EOF is set,
 *		fxr->length is overwritten here with the distance to the
 *		farther of the two EOFs.
 * @alloc_unit:	file allocation unit size in bytes.  It is used as a bitmask
 *		(alloc_unit - 1) and with IS_ALIGNED/ALIGN, so this function
 *		presumes a power of two; xfs_exchrange_prep substitutes the
 *		filesystem block size when the real unit is not one.
 *
 * Returns 0 if the request is acceptable; -EPERM if either inode is
 * immutable; -ETXTBSY if either inode is a swapfile; -EINVAL for ranges that
 * are misaligned, wrap, overlap within one file, extend past EOF, or would
 * hit a file size limit; or any error returned by the helper calls.
 */
static inline int
xfs_exchange_range_checks(
	struct xfs_exchrange	*fxr,
	unsigned int		alloc_unit)
{
	struct inode		*inode1 = file_inode(fxr->file1);
	struct inode		*inode2 = file_inode(fxr->file2);
	uint64_t		allocmask = alloc_unit - 1;
	int64_t			test_len;
	uint64_t		blen;
	loff_t			size1, size2, tmp;
	int			error;

	/* Don't touch certain kinds of inodes */
	if (IS_IMMUTABLE(inode1) || IS_IMMUTABLE(inode2))
		return -EPERM;
	if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2))
		return -ETXTBSY;

	size1 = i_size_read(inode1);
	size2 = i_size_read(inode2);

	/* Ranges cannot start after EOF. */
	if (fxr->file1_offset > size1 || fxr->file2_offset > size2)
		return -EINVAL;

	/*
	 * If the caller said to exchange to EOF, we set the length of the
	 * request large enough to cover everything to the end of both files.
	 * The area verification could not be done before the length was known,
	 * so do it now.
	 */
	if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
		fxr->length = max_t(int64_t, size1 - fxr->file1_offset,
					     size2 - fxr->file2_offset);

		error = xfs_exchange_range_verify_area(fxr);
		if (error)
			return error;
	}

	/*
	 * The start of both ranges must be aligned to the file allocation
	 * unit.
	 */
	if (!IS_ALIGNED(fxr->file1_offset, alloc_unit) ||
	    !IS_ALIGNED(fxr->file2_offset, alloc_unit))
		return -EINVAL;

	/* Ensure offsets don't wrap.  Only the overflow flag matters here. */
	if (check_add_overflow(fxr->file1_offset, fxr->length, &tmp) ||
	    check_add_overflow(fxr->file2_offset, fxr->length, &tmp))
		return -EINVAL;

	/*
	 * We require both ranges to end within EOF, unless we're exchanging
	 * to EOF.
	 */
	if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) &&
	    (fxr->file1_offset + fxr->length > size1 ||
	     fxr->file2_offset + fxr->length > size2))
		return -EINVAL;

	/*
	 * Make sure we don't hit any file size limits.  If we hit any size
	 * limits such that test_len was adjusted, we abort the whole
	 * operation.
	 */
	test_len = fxr->length;
	error = generic_write_check_limits(fxr->file2, fxr->file2_offset,
			&test_len);
	if (error)
		return error;
	error = generic_write_check_limits(fxr->file1, fxr->file1_offset,
			&test_len);
	if (error)
		return error;
	if (test_len != fxr->length)
		return -EINVAL;

	/*
	 * If the user wanted us to exchange up to the infile's EOF, round up
	 * to the next allocation unit boundary for this check.  Do the same
	 * for the outfile.
	 *
	 * Otherwise, reject the range length if it's not aligned to an
	 * allocation unit.
	 *
	 * blen is the allocation-unit-rounded length used only for the
	 * same-file overlap test that follows.
	 */
	if (fxr->file1_offset + fxr->length == size1)
		blen = ALIGN(size1, alloc_unit) - fxr->file1_offset;
	else if (fxr->file2_offset + fxr->length == size2)
		blen = ALIGN(size2, alloc_unit) - fxr->file2_offset;
	else if (!IS_ALIGNED(fxr->length, alloc_unit))
		return -EINVAL;
	else
		blen = fxr->length;

	/* Don't allow overlapped exchanges within the same file. */
	if (inode1 == inode2 &&
	    fxr->file2_offset + blen > fxr->file1_offset &&
	    fxr->file1_offset + blen > fxr->file2_offset)
		return -EINVAL;

	/*
	 * Ensure that we don't exchange a partial EOF block into the middle of
	 * another file.  An aligned length cannot end in a partial block, so
	 * there is nothing more to check in that case.
	 */
	if ((fxr->length & allocmask) == 0)
		return 0;

	/*
	 * The length is unaligned.  If the range stops short of either file's
	 * EOF, truncate blen down to an allocation unit; a resulting mismatch
	 * with the requested length means the request must be rejected.
	 */
	blen = fxr->length;
	if (fxr->file2_offset + blen < size2)
		blen &= ~allocmask;

	if (fxr->file1_offset + blen < size1)
		blen &= ~allocmask;

	return blen == fxr->length ? 0 : -EINVAL;
}
466 
467 /*
468  * Check that the two inodes are eligible for range exchanges, the ranges make
469  * sense, and then flush all dirty data.  Caller must ensure that the inodes
470  * have been locked against any other modifications.
471  */
472 static inline int
473 xfs_exchange_range_prep(
474 	struct xfs_exchrange	*fxr,
475 	unsigned int		alloc_unit)
476 {
477 	struct inode		*inode1 = file_inode(fxr->file1);
478 	struct inode		*inode2 = file_inode(fxr->file2);
479 	bool			same_inode = (inode1 == inode2);
480 	int			error;
481 
482 	/* Check that we don't violate system file offset limits. */
483 	error = xfs_exchange_range_checks(fxr, alloc_unit);
484 	if (error || fxr->length == 0)
485 		return error;
486 
487 	/* Wait for the completion of any pending IOs on both files */
488 	inode_dio_wait(inode1);
489 	if (!same_inode)
490 		inode_dio_wait(inode2);
491 
492 	error = filemap_write_and_wait_range(inode1->i_mapping,
493 			fxr->file1_offset,
494 			fxr->file1_offset + fxr->length - 1);
495 	if (error)
496 		return error;
497 
498 	error = filemap_write_and_wait_range(inode2->i_mapping,
499 			fxr->file2_offset,
500 			fxr->file2_offset + fxr->length - 1);
501 	if (error)
502 		return error;
503 
504 	/*
505 	 * If the files or inodes involved require synchronous writes, amend
506 	 * the request to force the filesystem to flush all data and metadata
507 	 * to disk after the operation completes.
508 	 */
509 	if (((fxr->file1->f_flags | fxr->file2->f_flags) & O_SYNC) ||
510 	    IS_SYNC(inode1) || IS_SYNC(inode2))
511 		fxr->flags |= XFS_EXCHANGE_RANGE_DSYNC;
512 
513 	return 0;
514 }
515 
516 /*
517  * Finish a range exchange operation, if it was successful.  Caller must ensure
518  * that the inodes are still locked against any other modifications.
519  */
520 static inline int
521 xfs_exchange_range_finish(
522 	struct xfs_exchrange	*fxr)
523 {
524 	int			error;
525 
526 	error = file_remove_privs(fxr->file1);
527 	if (error)
528 		return error;
529 	if (file_inode(fxr->file1) == file_inode(fxr->file2))
530 		return 0;
531 
532 	return file_remove_privs(fxr->file2);
533 }
534 
/*
 * Check the alignment of an exchange request when the allocation unit size
 * isn't a power of two.  The generic file-level helpers use (fast)
 * bitmask-based alignment checks, but here we have to use slow long division.
 *
 * @fxr:	the exchange request (not modified here; for TO_EOF requests
 *		the effective length is computed into a local variable)
 * @ip1, @ip2:	the two inodes whose sizes bound the ranges
 * @alloc_unit:	allocation unit size in bytes
 *
 * Returns 0 if the offsets and length are acceptably aligned and do not
 * overlap within a single file, or -EINVAL otherwise.
 */
static int
xfs_exchrange_check_rtalign(
	const struct xfs_exchrange	*fxr,
	struct xfs_inode		*ip1,
	struct xfs_inode		*ip2,
	unsigned int			alloc_unit)
{
	uint64_t			length = fxr->length;
	uint64_t			blen;
	loff_t				size1, size2;

	size1 = i_size_read(VFS_I(ip1));
	size2 = i_size_read(VFS_I(ip2));

	/* The start of both ranges must be aligned to a rt extent. */
	if (!isaligned_64(fxr->file1_offset, alloc_unit) ||
	    !isaligned_64(fxr->file2_offset, alloc_unit))
		return -EINVAL;

	/* For an exchange to EOF, cover everything to the end of both files. */
	if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
		length = max_t(int64_t, size1 - fxr->file1_offset,
					size2 - fxr->file2_offset);

	/*
	 * If the user wanted us to exchange up to the infile's EOF, round up
	 * to the next rt extent boundary for this check.  Do the same for the
	 * outfile.
	 *
	 * Otherwise, reject the range length if it's not rt extent aligned.
	 * We already confirmed the starting offsets' rt extent block
	 * alignment.
	 *
	 * blen is the rounded length used only for the same-file overlap test
	 * that follows.
	 */
	if (fxr->file1_offset + length == size1)
		blen = roundup_64(size1, alloc_unit) - fxr->file1_offset;
	else if (fxr->file2_offset + length == size2)
		blen = roundup_64(size2, alloc_unit) - fxr->file2_offset;
	else if (!isaligned_64(length, alloc_unit))
		return -EINVAL;
	else
		blen = length;

	/* Don't allow overlapped exchanges within the same file. */
	if (ip1 == ip2 &&
	    fxr->file2_offset + blen > fxr->file1_offset &&
	    fxr->file1_offset + blen > fxr->file2_offset)
		return -EINVAL;

	/*
	 * Ensure that we don't exchange a partial EOF rt extent into the
	 * middle of another file.  An aligned length cannot end in a partial
	 * rt extent, so there is nothing more to check in that case.
	 */
	if (isaligned_64(length, alloc_unit))
		return 0;

	/*
	 * The length is unaligned.  If the range stops short of either file's
	 * EOF, round blen down to an rt extent; a resulting mismatch with the
	 * requested length means the request must be rejected.
	 */
	blen = length;
	if (fxr->file2_offset + length < size2)
		blen = rounddown_64(blen, alloc_unit);

	if (fxr->file1_offset + blen < size1)
		blen = rounddown_64(blen, alloc_unit);

	return blen == length ? 0 : -EINVAL;
}
603 
604 /* Prepare two files to have their data exchanged. */
605 STATIC int
606 xfs_exchrange_prep(
607 	struct xfs_exchrange	*fxr,
608 	struct xfs_inode	*ip1,
609 	struct xfs_inode	*ip2)
610 {
611 	struct xfs_mount	*mp = ip2->i_mount;
612 	unsigned int		alloc_unit = xfs_inode_alloc_unitsize(ip2);
613 	int			error;
614 
615 	trace_xfs_exchrange_prep(fxr, ip1, ip2);
616 
617 	/* Verify both files are either real-time or non-realtime */
618 	if (XFS_IS_REALTIME_INODE(ip1) != XFS_IS_REALTIME_INODE(ip2))
619 		return -EINVAL;
620 
621 	/* Check non-power of two alignment issues, if necessary. */
622 	if (!is_power_of_2(alloc_unit)) {
623 		error = xfs_exchrange_check_rtalign(fxr, ip1, ip2, alloc_unit);
624 		if (error)
625 			return error;
626 
627 		/*
628 		 * Do the generic file-level checks with the regular block
629 		 * alignment.
630 		 */
631 		alloc_unit = mp->m_sb.sb_blocksize;
632 	}
633 
634 	error = xfs_exchange_range_prep(fxr, alloc_unit);
635 	if (error || fxr->length == 0)
636 		return error;
637 
638 	if (fxr->flags & __XFS_EXCHANGE_RANGE_CHECK_FRESH2) {
639 		error = xfs_exchrange_check_freshness(fxr, ip2);
640 		if (error)
641 			return error;
642 	}
643 
644 	/* Attach dquots to both inodes before changing block maps. */
645 	error = xfs_qm_dqattach(ip2);
646 	if (error)
647 		return error;
648 	error = xfs_qm_dqattach(ip1);
649 	if (error)
650 		return error;
651 
652 	trace_xfs_exchrange_flush(fxr, ip1, ip2);
653 
654 	/* Flush the relevant ranges of both files. */
655 	error = xfs_flush_unmap_range(ip2, fxr->file2_offset, fxr->length);
656 	if (error)
657 		return error;
658 	error = xfs_flush_unmap_range(ip1, fxr->file1_offset, fxr->length);
659 	if (error)
660 		return error;
661 
662 	/*
663 	 * Cancel CoW fork preallocations for the ranges of both files.  The
664 	 * prep function should have flushed all the dirty data, so the only
665 	 * CoW mappings remaining should be speculative.
666 	 */
667 	if (xfs_inode_has_cow_data(ip1)) {
668 		error = xfs_reflink_cancel_cow_range(ip1, fxr->file1_offset,
669 				fxr->length, true);
670 		if (error)
671 			return error;
672 	}
673 
674 	if (xfs_inode_has_cow_data(ip2)) {
675 		error = xfs_reflink_cancel_cow_range(ip2, fxr->file2_offset,
676 				fxr->length, true);
677 		if (error)
678 			return error;
679 	}
680 
681 	return 0;
682 }
683 
684 /*
685  * Exchange contents of files.  This is the binding between the generic
686  * file-level concepts and the XFS inode-specific implementation.
687  */
688 STATIC int
689 xfs_exchrange_contents(
690 	struct xfs_exchrange	*fxr)
691 {
692 	struct inode		*inode1 = file_inode(fxr->file1);
693 	struct inode		*inode2 = file_inode(fxr->file2);
694 	struct xfs_inode	*ip1 = XFS_I(inode1);
695 	struct xfs_inode	*ip2 = XFS_I(inode2);
696 	struct xfs_mount	*mp = ip1->i_mount;
697 	int			error;
698 
699 	if (!xfs_has_exchange_range(mp))
700 		return -EOPNOTSUPP;
701 
702 	if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS |
703 			   XFS_EXCHANGE_RANGE_PRIV_FLAGS))
704 		return -EINVAL;
705 
706 	if (xfs_is_shutdown(mp))
707 		return -EIO;
708 
709 	/* Lock both files against IO */
710 	error = xfs_ilock2_io_mmap(ip1, ip2);
711 	if (error)
712 		goto out_err;
713 
714 	/* Prepare and then exchange file contents. */
715 	error = xfs_exchrange_prep(fxr, ip1, ip2);
716 	if (error)
717 		goto out_unlock;
718 
719 	error = xfs_exchrange_mappings(fxr, ip1, ip2);
720 	if (error)
721 		goto out_unlock;
722 
723 	/*
724 	 * Finish the exchange by removing special file privileges like any
725 	 * other file write would do.  This may involve turning on support for
726 	 * logged xattrs if either file has security capabilities.
727 	 */
728 	error = xfs_exchange_range_finish(fxr);
729 	if (error)
730 		goto out_unlock;
731 
732 out_unlock:
733 	xfs_iunlock2_io_mmap(ip1, ip2);
734 out_err:
735 	if (error)
736 		trace_xfs_exchrange_error(ip2, error, _RET_IP_);
737 	return error;
738 }
739 
740 /* Exchange parts of two files. */
741 static int
742 xfs_exchange_range(
743 	struct xfs_exchrange	*fxr)
744 {
745 	struct inode		*inode1 = file_inode(fxr->file1);
746 	struct inode		*inode2 = file_inode(fxr->file2);
747 	int			ret;
748 
749 	BUILD_BUG_ON(XFS_EXCHANGE_RANGE_ALL_FLAGS &
750 		     XFS_EXCHANGE_RANGE_PRIV_FLAGS);
751 
752 	/* Both files must be on the same mount/filesystem. */
753 	if (fxr->file1->f_path.mnt != fxr->file2->f_path.mnt)
754 		return -EXDEV;
755 
756 	if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS |
757 			 __XFS_EXCHANGE_RANGE_CHECK_FRESH2))
758 		return -EINVAL;
759 
760 	/* Userspace requests only honored for regular files. */
761 	if (S_ISDIR(inode1->i_mode) || S_ISDIR(inode2->i_mode))
762 		return -EISDIR;
763 	if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
764 		return -EINVAL;
765 
766 	/* Both files must be opened for read and write. */
767 	if (!(fxr->file1->f_mode & FMODE_READ) ||
768 	    !(fxr->file1->f_mode & FMODE_WRITE) ||
769 	    !(fxr->file2->f_mode & FMODE_READ) ||
770 	    !(fxr->file2->f_mode & FMODE_WRITE))
771 		return -EBADF;
772 
773 	/* Neither file can be opened append-only. */
774 	if ((fxr->file1->f_flags & O_APPEND) ||
775 	    (fxr->file2->f_flags & O_APPEND))
776 		return -EBADF;
777 
778 	/*
779 	 * If we're not exchanging to EOF, we can check the areas before
780 	 * stabilizing both files' i_size.
781 	 */
782 	if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)) {
783 		ret = xfs_exchange_range_verify_area(fxr);
784 		if (ret)
785 			return ret;
786 	}
787 
788 	/* Update cmtime if the fd/inode don't forbid it. */
789 	if (!(fxr->file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1))
790 		fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME1;
791 	if (!(fxr->file2->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode2))
792 		fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME2;
793 
794 	file_start_write(fxr->file2);
795 	ret = xfs_exchrange_contents(fxr);
796 	file_end_write(fxr->file2);
797 	if (ret)
798 		return ret;
799 
800 	fsnotify_modify(fxr->file1);
801 	if (fxr->file2 != fxr->file1)
802 		fsnotify_modify(fxr->file2);
803 	return 0;
804 }
805 
806 /* Collect exchange-range arguments from userspace. */
807 long
808 xfs_ioc_exchange_range(
809 	struct file			*file,
810 	struct xfs_exchange_range __user *argp)
811 {
812 	struct xfs_exchrange		fxr = {
813 		.file2			= file,
814 	};
815 	struct xfs_exchange_range	args;
816 	struct fd			file1;
817 	int				error;
818 
819 	if (copy_from_user(&args, argp, sizeof(args)))
820 		return -EFAULT;
821 	if (memchr_inv(&args.pad, 0, sizeof(args.pad)))
822 		return -EINVAL;
823 	if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
824 		return -EINVAL;
825 
826 	fxr.file1_offset	= args.file1_offset;
827 	fxr.file2_offset	= args.file2_offset;
828 	fxr.length		= args.length;
829 	fxr.flags		= args.flags;
830 
831 	file1 = fdget(args.file1_fd);
832 	if (!fd_file(file1))
833 		return -EBADF;
834 	fxr.file1 = fd_file(file1);
835 
836 	error = xfs_exchange_range(&fxr);
837 	fdput(file1);
838 	return error;
839 }
840 
/*
 * Opaque freshness blob for XFS_IOC_COMMIT_RANGE.
 *
 * This is the kernel's view of the file2_freshness area of struct
 * xfs_commit_range.  XFS_IOC_START_COMMIT fills it with a snapshot of file2;
 * XFS_IOC_COMMIT_RANGE reads it back and compares it against the file.
 */
struct xfs_commit_range_fresh {
	xfs_fsid_t	fsid;		/* m_fixedfsid */
	__u64		file2_ino;	/* inode number */
	__s64		file2_mtime;	/* modification time */
	__s64		file2_ctime;	/* change time */
	__s32		file2_mtime_nsec; /* mod time, nsec */
	__s32		file2_ctime_nsec; /* change time, nsec */
	__u32		file2_gen;	/* inode generation */
	__u32		magic;		/* XCR_FRESH_MAGIC */
};
/* Marks a blob as having been filled in by XFS_IOC_START_COMMIT. */
#define XCR_FRESH_MAGIC	0x444F524B	/* DORK */
853 
854 /* Set up a commitrange operation by sampling file2's write-related attrs */
855 long
856 xfs_ioc_start_commit(
857 	struct file			*file,
858 	struct xfs_commit_range __user	*argp)
859 {
860 	struct xfs_commit_range		args = { };
861 	struct timespec64		ts;
862 	struct xfs_commit_range_fresh	*kern_f;
863 	struct xfs_commit_range_fresh	__user *user_f;
864 	struct inode			*inode2 = file_inode(file);
865 	struct xfs_inode		*ip2 = XFS_I(inode2);
866 	const unsigned int		lockflags = XFS_IOLOCK_SHARED |
867 						    XFS_MMAPLOCK_SHARED |
868 						    XFS_ILOCK_SHARED;
869 
870 	BUILD_BUG_ON(sizeof(struct xfs_commit_range_fresh) !=
871 		     sizeof(args.file2_freshness));
872 
873 	kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness;
874 
875 	memcpy(&kern_f->fsid, ip2->i_mount->m_fixedfsid, sizeof(xfs_fsid_t));
876 
877 	xfs_ilock(ip2, lockflags);
878 	ts = inode_get_ctime(inode2);
879 	kern_f->file2_ctime		= ts.tv_sec;
880 	kern_f->file2_ctime_nsec	= ts.tv_nsec;
881 	ts = inode_get_mtime(inode2);
882 	kern_f->file2_mtime		= ts.tv_sec;
883 	kern_f->file2_mtime_nsec	= ts.tv_nsec;
884 	kern_f->file2_ino		= ip2->i_ino;
885 	kern_f->file2_gen		= inode2->i_generation;
886 	kern_f->magic			= XCR_FRESH_MAGIC;
887 	xfs_iunlock(ip2, lockflags);
888 
889 	user_f = (struct xfs_commit_range_fresh __user *)&argp->file2_freshness;
890 	if (copy_to_user(user_f, kern_f, sizeof(*kern_f)))
891 		return -EFAULT;
892 
893 	return 0;
894 }
895 
896 /*
897  * Exchange file1 and file2 contents if file2 has not been written since the
898  * start commit operation.
899  */
900 long
901 xfs_ioc_commit_range(
902 	struct file			*file,
903 	struct xfs_commit_range __user	*argp)
904 {
905 	struct xfs_exchrange		fxr = {
906 		.file2			= file,
907 	};
908 	struct xfs_commit_range		args;
909 	struct xfs_commit_range_fresh	*kern_f;
910 	struct xfs_inode		*ip2 = XFS_I(file_inode(file));
911 	struct xfs_mount		*mp = ip2->i_mount;
912 	struct fd			file1;
913 	int				error;
914 
915 	kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness;
916 
917 	if (copy_from_user(&args, argp, sizeof(args)))
918 		return -EFAULT;
919 	if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
920 		return -EINVAL;
921 	if (kern_f->magic != XCR_FRESH_MAGIC)
922 		return -EBUSY;
923 	if (memcmp(&kern_f->fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)))
924 		return -EBUSY;
925 
926 	fxr.file1_offset	= args.file1_offset;
927 	fxr.file2_offset	= args.file2_offset;
928 	fxr.length		= args.length;
929 	fxr.flags		= args.flags | __XFS_EXCHANGE_RANGE_CHECK_FRESH2;
930 	fxr.file2_ino		= kern_f->file2_ino;
931 	fxr.file2_gen		= kern_f->file2_gen;
932 	fxr.file2_mtime.tv_sec	= kern_f->file2_mtime;
933 	fxr.file2_mtime.tv_nsec	= kern_f->file2_mtime_nsec;
934 	fxr.file2_ctime.tv_sec	= kern_f->file2_ctime;
935 	fxr.file2_ctime.tv_nsec	= kern_f->file2_ctime_nsec;
936 
937 	file1 = fdget(args.file1_fd);
938 	if (fd_empty(file1))
939 		return -EBADF;
940 	fxr.file1 = fd_file(file1);
941 
942 	error = xfs_exchange_range(&fxr);
943 	fdput(file1);
944 	return error;
945 }
946