xref: /linux/fs/xfs/xfs_exchrange.c (revision 332d2c1d713e232e163386c35a3ba0c1b90df83f)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs.h"
7 #include "xfs_shared.h"
8 #include "xfs_format.h"
9 #include "xfs_log_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_defer.h"
13 #include "xfs_inode.h"
14 #include "xfs_trans.h"
15 #include "xfs_quota.h"
16 #include "xfs_bmap_util.h"
17 #include "xfs_reflink.h"
18 #include "xfs_trace.h"
19 #include "xfs_exchrange.h"
20 #include "xfs_exchmaps.h"
21 #include "xfs_sb.h"
22 #include "xfs_icache.h"
23 #include "xfs_log.h"
24 #include "xfs_rtbitmap.h"
25 #include <linux/fsnotify.h>
26 
27 /* Lock (and optionally join) two inodes for a file range exchange. */
28 void
29 xfs_exchrange_ilock(
30 	struct xfs_trans	*tp,
31 	struct xfs_inode	*ip1,
32 	struct xfs_inode	*ip2)
33 {
34 	if (ip1 != ip2)
35 		xfs_lock_two_inodes(ip1, XFS_ILOCK_EXCL,
36 				    ip2, XFS_ILOCK_EXCL);
37 	else
38 		xfs_ilock(ip1, XFS_ILOCK_EXCL);
39 	if (tp) {
40 		xfs_trans_ijoin(tp, ip1, 0);
41 		if (ip2 != ip1)
42 			xfs_trans_ijoin(tp, ip2, 0);
43 	}
44 
45 }
46 
47 /* Unlock two inodes after a file range exchange operation. */
48 void
49 xfs_exchrange_iunlock(
50 	struct xfs_inode	*ip1,
51 	struct xfs_inode	*ip2)
52 {
53 	if (ip2 != ip1)
54 		xfs_iunlock(ip2, XFS_ILOCK_EXCL);
55 	xfs_iunlock(ip1, XFS_ILOCK_EXCL);
56 }
57 
58 /*
59  * Estimate the resource requirements to exchange file contents between the two
60  * files.  The caller is required to hold the IOLOCK and the MMAPLOCK and to
61  * have flushed both inodes' pagecache and active direct-ios.
62  */
63 int
64 xfs_exchrange_estimate(
65 	struct xfs_exchmaps_req	*req)
66 {
67 	int			error;
68 
69 	xfs_exchrange_ilock(NULL, req->ip1, req->ip2);
70 	error = xfs_exchmaps_estimate(req);
71 	xfs_exchrange_iunlock(req->ip1, req->ip2);
72 	return error;
73 }
74 
/*
 * Bits reported through the qretry argument of xfs_exchrange_reserve_quota to
 * tell the caller which inode's quota reservation should be retried.
 */
#define QRETRY_IP1	(1U << 0)
#define QRETRY_IP2	(1U << 1)
77 
78 /*
79  * Obtain a quota reservation to make sure we don't hit EDQUOT.  We can skip
80  * this if quota enforcement is disabled or if both inodes' dquots are the
81  * same.  The qretry structure must be initialized to zeroes before the first
82  * call to this function.
83  */
84 STATIC int
85 xfs_exchrange_reserve_quota(
86 	struct xfs_trans		*tp,
87 	const struct xfs_exchmaps_req	*req,
88 	unsigned int			*qretry)
89 {
90 	int64_t				ddelta, rdelta;
91 	int				ip1_error = 0;
92 	int				error;
93 
94 	/*
95 	 * Don't bother with a quota reservation if we're not enforcing them
96 	 * or the two inodes have the same dquots.
97 	 */
98 	if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 ||
99 	    (req->ip1->i_udquot == req->ip2->i_udquot &&
100 	     req->ip1->i_gdquot == req->ip2->i_gdquot &&
101 	     req->ip1->i_pdquot == req->ip2->i_pdquot))
102 		return 0;
103 
104 	*qretry = 0;
105 
106 	/*
107 	 * For each file, compute the net gain in the number of regular blocks
108 	 * that will be mapped into that file and reserve that much quota.  The
109 	 * quota counts must be able to absorb at least that much space.
110 	 */
111 	ddelta = req->ip2_bcount - req->ip1_bcount;
112 	rdelta = req->ip2_rtbcount - req->ip1_rtbcount;
113 	if (ddelta > 0 || rdelta > 0) {
114 		error = xfs_trans_reserve_quota_nblks(tp, req->ip1,
115 				ddelta > 0 ? ddelta : 0,
116 				rdelta > 0 ? rdelta : 0,
117 				false);
118 		if (error == -EDQUOT || error == -ENOSPC) {
119 			/*
120 			 * Save this error and see what happens if we try to
121 			 * reserve quota for ip2.  Then report both.
122 			 */
123 			*qretry |= QRETRY_IP1;
124 			ip1_error = error;
125 			error = 0;
126 		}
127 		if (error)
128 			return error;
129 	}
130 	if (ddelta < 0 || rdelta < 0) {
131 		error = xfs_trans_reserve_quota_nblks(tp, req->ip2,
132 				ddelta < 0 ? -ddelta : 0,
133 				rdelta < 0 ? -rdelta : 0,
134 				false);
135 		if (error == -EDQUOT || error == -ENOSPC)
136 			*qretry |= QRETRY_IP2;
137 		if (error)
138 			return error;
139 	}
140 	if (ip1_error)
141 		return ip1_error;
142 
143 	/*
144 	 * For each file, forcibly reserve the gross gain in mapped blocks so
145 	 * that we don't trip over any quota block reservation assertions.
146 	 * We must reserve the gross gain because the quota code subtracts from
147 	 * bcount the number of blocks that we unmap; it does not add that
148 	 * quantity back to the quota block reservation.
149 	 */
150 	error = xfs_trans_reserve_quota_nblks(tp, req->ip1, req->ip1_bcount,
151 			req->ip1_rtbcount, true);
152 	if (error)
153 		return error;
154 
155 	return xfs_trans_reserve_quota_nblks(tp, req->ip2, req->ip2_bcount,
156 			req->ip2_rtbcount, true);
157 }
158 
159 /* Exchange the mappings (and hence the contents) of two files' forks. */
160 STATIC int
161 xfs_exchrange_mappings(
162 	const struct xfs_exchrange	*fxr,
163 	struct xfs_inode		*ip1,
164 	struct xfs_inode		*ip2)
165 {
166 	struct xfs_mount		*mp = ip1->i_mount;
167 	struct xfs_exchmaps_req		req = {
168 		.ip1			= ip1,
169 		.ip2			= ip2,
170 		.startoff1		= XFS_B_TO_FSBT(mp, fxr->file1_offset),
171 		.startoff2		= XFS_B_TO_FSBT(mp, fxr->file2_offset),
172 		.blockcount		= XFS_B_TO_FSB(mp, fxr->length),
173 	};
174 	struct xfs_trans		*tp;
175 	unsigned int			qretry;
176 	bool				retried = false;
177 	int				error;
178 
179 	trace_xfs_exchrange_mappings(fxr, ip1, ip2);
180 
181 	if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
182 		req.flags |= XFS_EXCHMAPS_SET_SIZES;
183 	if (fxr->flags & XFS_EXCHANGE_RANGE_FILE1_WRITTEN)
184 		req.flags |= XFS_EXCHMAPS_INO1_WRITTEN;
185 
186 	/*
187 	 * Round the request length up to the nearest file allocation unit.
188 	 * The prep function already checked that the request offsets and
189 	 * length in @fxr are safe to round up.
190 	 */
191 	if (xfs_inode_has_bigrtalloc(ip2))
192 		req.blockcount = xfs_rtb_roundup_rtx(mp, req.blockcount);
193 
194 	error = xfs_exchrange_estimate(&req);
195 	if (error)
196 		return error;
197 
198 retry:
199 	/* Allocate the transaction, lock the inodes, and join them. */
200 	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, req.resblks, 0,
201 			XFS_TRANS_RES_FDBLKS, &tp);
202 	if (error)
203 		return error;
204 
205 	xfs_exchrange_ilock(tp, ip1, ip2);
206 
207 	trace_xfs_exchrange_before(ip2, 2);
208 	trace_xfs_exchrange_before(ip1, 1);
209 
210 	error = xfs_exchmaps_check_forks(mp, &req);
211 	if (error)
212 		goto out_trans_cancel;
213 
214 	/*
215 	 * Reserve ourselves some quota if any of them are in enforcing mode.
216 	 * In theory we only need enough to satisfy the change in the number
217 	 * of blocks between the two ranges being remapped.
218 	 */
219 	error = xfs_exchrange_reserve_quota(tp, &req, &qretry);
220 	if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
221 		xfs_trans_cancel(tp);
222 		xfs_exchrange_iunlock(ip1, ip2);
223 		if (qretry & QRETRY_IP1)
224 			xfs_blockgc_free_quota(ip1, 0);
225 		if (qretry & QRETRY_IP2)
226 			xfs_blockgc_free_quota(ip2, 0);
227 		retried = true;
228 		goto retry;
229 	}
230 	if (error)
231 		goto out_trans_cancel;
232 
233 	/* If we got this far on a dry run, all parameters are ok. */
234 	if (fxr->flags & XFS_EXCHANGE_RANGE_DRY_RUN)
235 		goto out_trans_cancel;
236 
237 	/* Update the mtime and ctime of both files. */
238 	if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME1)
239 		xfs_trans_ichgtime(tp, ip1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
240 	if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME2)
241 		xfs_trans_ichgtime(tp, ip2, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
242 
243 	xfs_exchange_mappings(tp, &req);
244 
245 	/*
246 	 * Force the log to persist metadata updates if the caller or the
247 	 * administrator requires this.  The generic prep function already
248 	 * flushed the relevant parts of the page cache.
249 	 */
250 	if (xfs_has_wsync(mp) || (fxr->flags & XFS_EXCHANGE_RANGE_DSYNC))
251 		xfs_trans_set_sync(tp);
252 
253 	error = xfs_trans_commit(tp);
254 
255 	trace_xfs_exchrange_after(ip2, 2);
256 	trace_xfs_exchrange_after(ip1, 1);
257 
258 	if (error)
259 		goto out_unlock;
260 
261 	/*
262 	 * If the caller wanted us to exchange the contents of two complete
263 	 * files of unequal length, exchange the incore sizes now.  This should
264 	 * be safe because we flushed both files' page caches, exchanged all
265 	 * the mappings, and updated the ondisk sizes.
266 	 */
267 	if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
268 		loff_t	temp;
269 
270 		temp = i_size_read(VFS_I(ip2));
271 		i_size_write(VFS_I(ip2), i_size_read(VFS_I(ip1)));
272 		i_size_write(VFS_I(ip1), temp);
273 	}
274 
275 out_unlock:
276 	xfs_exchrange_iunlock(ip1, ip2);
277 	return error;
278 
279 out_trans_cancel:
280 	xfs_trans_cancel(tp);
281 	goto out_unlock;
282 }
283 
284 /*
285  * Generic code for exchanging ranges of two files via XFS_IOC_EXCHANGE_RANGE.
286  * This part deals with struct file objects and byte ranges and does not deal
287  * with XFS-specific data structures such as xfs_inodes and block ranges.  This
288  * separation may some day facilitate porting to another filesystem.
289  *
290  * The goal is to exchange fxr.length bytes starting at fxr.file1_offset in
291  * file1 with the same number of bytes starting at fxr.file2_offset in file2.
 * Implementations must call xfs_exchange_range_prep to prepare the two
 * files while they are locked against IO but before taking the inode locks
 * used for the mapping update; and they must update the inode change and mod
294  * times of both files as part of the metadata update.  The timestamp update
295  * and freshness checks must be done atomically as part of the data exchange
296  * operation to ensure correctness of the freshness check.
297  * xfs_exchange_range_finish must be called after the operation completes
298  * successfully but before locks are dropped.
299  */
300 
301 /* Verify that we have security clearance to perform this operation. */
302 static int
303 xfs_exchange_range_verify_area(
304 	struct xfs_exchrange	*fxr)
305 {
306 	int			ret;
307 
308 	ret = remap_verify_area(fxr->file1, fxr->file1_offset, fxr->length,
309 			true);
310 	if (ret)
311 		return ret;
312 
313 	return remap_verify_area(fxr->file2, fxr->file2_offset, fxr->length,
314 			true);
315 }
316 
317 /*
318  * Performs necessary checks before doing a range exchange, having stabilized
319  * mutable inode attributes via i_rwsem.
320  */
321 static inline int
322 xfs_exchange_range_checks(
323 	struct xfs_exchrange	*fxr,
324 	unsigned int		alloc_unit)
325 {
326 	struct inode		*inode1 = file_inode(fxr->file1);
327 	struct inode		*inode2 = file_inode(fxr->file2);
328 	uint64_t		allocmask = alloc_unit - 1;
329 	int64_t			test_len;
330 	uint64_t		blen;
331 	loff_t			size1, size2, tmp;
332 	int			error;
333 
334 	/* Don't touch certain kinds of inodes */
335 	if (IS_IMMUTABLE(inode1) || IS_IMMUTABLE(inode2))
336 		return -EPERM;
337 	if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2))
338 		return -ETXTBSY;
339 
340 	size1 = i_size_read(inode1);
341 	size2 = i_size_read(inode2);
342 
343 	/* Ranges cannot start after EOF. */
344 	if (fxr->file1_offset > size1 || fxr->file2_offset > size2)
345 		return -EINVAL;
346 
347 	/*
348 	 * If the caller said to exchange to EOF, we set the length of the
349 	 * request large enough to cover everything to the end of both files.
350 	 */
351 	if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
352 		fxr->length = max_t(int64_t, size1 - fxr->file1_offset,
353 					     size2 - fxr->file2_offset);
354 
355 		error = xfs_exchange_range_verify_area(fxr);
356 		if (error)
357 			return error;
358 	}
359 
360 	/*
361 	 * The start of both ranges must be aligned to the file allocation
362 	 * unit.
363 	 */
364 	if (!IS_ALIGNED(fxr->file1_offset, alloc_unit) ||
365 	    !IS_ALIGNED(fxr->file2_offset, alloc_unit))
366 		return -EINVAL;
367 
368 	/* Ensure offsets don't wrap. */
369 	if (check_add_overflow(fxr->file1_offset, fxr->length, &tmp) ||
370 	    check_add_overflow(fxr->file2_offset, fxr->length, &tmp))
371 		return -EINVAL;
372 
373 	/*
374 	 * We require both ranges to end within EOF, unless we're exchanging
375 	 * to EOF.
376 	 */
377 	if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) &&
378 	    (fxr->file1_offset + fxr->length > size1 ||
379 	     fxr->file2_offset + fxr->length > size2))
380 		return -EINVAL;
381 
382 	/*
383 	 * Make sure we don't hit any file size limits.  If we hit any size
384 	 * limits such that test_length was adjusted, we abort the whole
385 	 * operation.
386 	 */
387 	test_len = fxr->length;
388 	error = generic_write_check_limits(fxr->file2, fxr->file2_offset,
389 			&test_len);
390 	if (error)
391 		return error;
392 	error = generic_write_check_limits(fxr->file1, fxr->file1_offset,
393 			&test_len);
394 	if (error)
395 		return error;
396 	if (test_len != fxr->length)
397 		return -EINVAL;
398 
399 	/*
400 	 * If the user wanted us to exchange up to the infile's EOF, round up
401 	 * to the next allocation unit boundary for this check.  Do the same
402 	 * for the outfile.
403 	 *
404 	 * Otherwise, reject the range length if it's not aligned to an
405 	 * allocation unit.
406 	 */
407 	if (fxr->file1_offset + fxr->length == size1)
408 		blen = ALIGN(size1, alloc_unit) - fxr->file1_offset;
409 	else if (fxr->file2_offset + fxr->length == size2)
410 		blen = ALIGN(size2, alloc_unit) - fxr->file2_offset;
411 	else if (!IS_ALIGNED(fxr->length, alloc_unit))
412 		return -EINVAL;
413 	else
414 		blen = fxr->length;
415 
416 	/* Don't allow overlapped exchanges within the same file. */
417 	if (inode1 == inode2 &&
418 	    fxr->file2_offset + blen > fxr->file1_offset &&
419 	    fxr->file1_offset + blen > fxr->file2_offset)
420 		return -EINVAL;
421 
422 	/*
423 	 * Ensure that we don't exchange a partial EOF block into the middle of
424 	 * another file.
425 	 */
426 	if ((fxr->length & allocmask) == 0)
427 		return 0;
428 
429 	blen = fxr->length;
430 	if (fxr->file2_offset + blen < size2)
431 		blen &= ~allocmask;
432 
433 	if (fxr->file1_offset + blen < size1)
434 		blen &= ~allocmask;
435 
436 	return blen == fxr->length ? 0 : -EINVAL;
437 }
438 
439 /*
440  * Check that the two inodes are eligible for range exchanges, the ranges make
441  * sense, and then flush all dirty data.  Caller must ensure that the inodes
442  * have been locked against any other modifications.
443  */
444 static inline int
445 xfs_exchange_range_prep(
446 	struct xfs_exchrange	*fxr,
447 	unsigned int		alloc_unit)
448 {
449 	struct inode		*inode1 = file_inode(fxr->file1);
450 	struct inode		*inode2 = file_inode(fxr->file2);
451 	bool			same_inode = (inode1 == inode2);
452 	int			error;
453 
454 	/* Check that we don't violate system file offset limits. */
455 	error = xfs_exchange_range_checks(fxr, alloc_unit);
456 	if (error || fxr->length == 0)
457 		return error;
458 
459 	/* Wait for the completion of any pending IOs on both files */
460 	inode_dio_wait(inode1);
461 	if (!same_inode)
462 		inode_dio_wait(inode2);
463 
464 	error = filemap_write_and_wait_range(inode1->i_mapping,
465 			fxr->file1_offset,
466 			fxr->file1_offset + fxr->length - 1);
467 	if (error)
468 		return error;
469 
470 	error = filemap_write_and_wait_range(inode2->i_mapping,
471 			fxr->file2_offset,
472 			fxr->file2_offset + fxr->length - 1);
473 	if (error)
474 		return error;
475 
476 	/*
477 	 * If the files or inodes involved require synchronous writes, amend
478 	 * the request to force the filesystem to flush all data and metadata
479 	 * to disk after the operation completes.
480 	 */
481 	if (((fxr->file1->f_flags | fxr->file2->f_flags) & O_SYNC) ||
482 	    IS_SYNC(inode1) || IS_SYNC(inode2))
483 		fxr->flags |= XFS_EXCHANGE_RANGE_DSYNC;
484 
485 	return 0;
486 }
487 
488 /*
489  * Finish a range exchange operation, if it was successful.  Caller must ensure
490  * that the inodes are still locked against any other modifications.
491  */
492 static inline int
493 xfs_exchange_range_finish(
494 	struct xfs_exchrange	*fxr)
495 {
496 	int			error;
497 
498 	error = file_remove_privs(fxr->file1);
499 	if (error)
500 		return error;
501 	if (file_inode(fxr->file1) == file_inode(fxr->file2))
502 		return 0;
503 
504 	return file_remove_privs(fxr->file2);
505 }
506 
507 /*
508  * Check the alignment of an exchange request when the allocation unit size
509  * isn't a power of two.  The generic file-level helpers use (fast)
510  * bitmask-based alignment checks, but here we have to use slow long division.
511  */
512 static int
513 xfs_exchrange_check_rtalign(
514 	const struct xfs_exchrange	*fxr,
515 	struct xfs_inode		*ip1,
516 	struct xfs_inode		*ip2,
517 	unsigned int			alloc_unit)
518 {
519 	uint64_t			length = fxr->length;
520 	uint64_t			blen;
521 	loff_t				size1, size2;
522 
523 	size1 = i_size_read(VFS_I(ip1));
524 	size2 = i_size_read(VFS_I(ip2));
525 
526 	/* The start of both ranges must be aligned to a rt extent. */
527 	if (!isaligned_64(fxr->file1_offset, alloc_unit) ||
528 	    !isaligned_64(fxr->file2_offset, alloc_unit))
529 		return -EINVAL;
530 
531 	if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
532 		length = max_t(int64_t, size1 - fxr->file1_offset,
533 					size2 - fxr->file2_offset);
534 
535 	/*
536 	 * If the user wanted us to exchange up to the infile's EOF, round up
537 	 * to the next rt extent boundary for this check.  Do the same for the
538 	 * outfile.
539 	 *
540 	 * Otherwise, reject the range length if it's not rt extent aligned.
541 	 * We already confirmed the starting offsets' rt extent block
542 	 * alignment.
543 	 */
544 	if (fxr->file1_offset + length == size1)
545 		blen = roundup_64(size1, alloc_unit) - fxr->file1_offset;
546 	else if (fxr->file2_offset + length == size2)
547 		blen = roundup_64(size2, alloc_unit) - fxr->file2_offset;
548 	else if (!isaligned_64(length, alloc_unit))
549 		return -EINVAL;
550 	else
551 		blen = length;
552 
553 	/* Don't allow overlapped exchanges within the same file. */
554 	if (ip1 == ip2 &&
555 	    fxr->file2_offset + blen > fxr->file1_offset &&
556 	    fxr->file1_offset + blen > fxr->file2_offset)
557 		return -EINVAL;
558 
559 	/*
560 	 * Ensure that we don't exchange a partial EOF rt extent into the
561 	 * middle of another file.
562 	 */
563 	if (isaligned_64(length, alloc_unit))
564 		return 0;
565 
566 	blen = length;
567 	if (fxr->file2_offset + length < size2)
568 		blen = rounddown_64(blen, alloc_unit);
569 
570 	if (fxr->file1_offset + blen < size1)
571 		blen = rounddown_64(blen, alloc_unit);
572 
573 	return blen == length ? 0 : -EINVAL;
574 }
575 
576 /* Prepare two files to have their data exchanged. */
577 STATIC int
578 xfs_exchrange_prep(
579 	struct xfs_exchrange	*fxr,
580 	struct xfs_inode	*ip1,
581 	struct xfs_inode	*ip2)
582 {
583 	struct xfs_mount	*mp = ip2->i_mount;
584 	unsigned int		alloc_unit = xfs_inode_alloc_unitsize(ip2);
585 	int			error;
586 
587 	trace_xfs_exchrange_prep(fxr, ip1, ip2);
588 
589 	/* Verify both files are either real-time or non-realtime */
590 	if (XFS_IS_REALTIME_INODE(ip1) != XFS_IS_REALTIME_INODE(ip2))
591 		return -EINVAL;
592 
593 	/* Check non-power of two alignment issues, if necessary. */
594 	if (!is_power_of_2(alloc_unit)) {
595 		error = xfs_exchrange_check_rtalign(fxr, ip1, ip2, alloc_unit);
596 		if (error)
597 			return error;
598 
599 		/*
600 		 * Do the generic file-level checks with the regular block
601 		 * alignment.
602 		 */
603 		alloc_unit = mp->m_sb.sb_blocksize;
604 	}
605 
606 	error = xfs_exchange_range_prep(fxr, alloc_unit);
607 	if (error || fxr->length == 0)
608 		return error;
609 
610 	/* Attach dquots to both inodes before changing block maps. */
611 	error = xfs_qm_dqattach(ip2);
612 	if (error)
613 		return error;
614 	error = xfs_qm_dqattach(ip1);
615 	if (error)
616 		return error;
617 
618 	trace_xfs_exchrange_flush(fxr, ip1, ip2);
619 
620 	/* Flush the relevant ranges of both files. */
621 	error = xfs_flush_unmap_range(ip2, fxr->file2_offset, fxr->length);
622 	if (error)
623 		return error;
624 	error = xfs_flush_unmap_range(ip1, fxr->file1_offset, fxr->length);
625 	if (error)
626 		return error;
627 
628 	/*
629 	 * Cancel CoW fork preallocations for the ranges of both files.  The
630 	 * prep function should have flushed all the dirty data, so the only
631 	 * CoW mappings remaining should be speculative.
632 	 */
633 	if (xfs_inode_has_cow_data(ip1)) {
634 		error = xfs_reflink_cancel_cow_range(ip1, fxr->file1_offset,
635 				fxr->length, true);
636 		if (error)
637 			return error;
638 	}
639 
640 	if (xfs_inode_has_cow_data(ip2)) {
641 		error = xfs_reflink_cancel_cow_range(ip2, fxr->file2_offset,
642 				fxr->length, true);
643 		if (error)
644 			return error;
645 	}
646 
647 	return 0;
648 }
649 
650 /*
651  * Exchange contents of files.  This is the binding between the generic
652  * file-level concepts and the XFS inode-specific implementation.
653  */
654 STATIC int
655 xfs_exchrange_contents(
656 	struct xfs_exchrange	*fxr)
657 {
658 	struct inode		*inode1 = file_inode(fxr->file1);
659 	struct inode		*inode2 = file_inode(fxr->file2);
660 	struct xfs_inode	*ip1 = XFS_I(inode1);
661 	struct xfs_inode	*ip2 = XFS_I(inode2);
662 	struct xfs_mount	*mp = ip1->i_mount;
663 	int			error;
664 
665 	if (!xfs_has_exchange_range(mp))
666 		return -EOPNOTSUPP;
667 
668 	if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS |
669 			   XFS_EXCHANGE_RANGE_PRIV_FLAGS))
670 		return -EINVAL;
671 
672 	if (xfs_is_shutdown(mp))
673 		return -EIO;
674 
675 	/* Lock both files against IO */
676 	error = xfs_ilock2_io_mmap(ip1, ip2);
677 	if (error)
678 		goto out_err;
679 
680 	/* Prepare and then exchange file contents. */
681 	error = xfs_exchrange_prep(fxr, ip1, ip2);
682 	if (error)
683 		goto out_unlock;
684 
685 	error = xfs_exchrange_mappings(fxr, ip1, ip2);
686 	if (error)
687 		goto out_unlock;
688 
689 	/*
690 	 * Finish the exchange by removing special file privileges like any
691 	 * other file write would do.  This may involve turning on support for
692 	 * logged xattrs if either file has security capabilities.
693 	 */
694 	error = xfs_exchange_range_finish(fxr);
695 	if (error)
696 		goto out_unlock;
697 
698 out_unlock:
699 	xfs_iunlock2_io_mmap(ip1, ip2);
700 out_err:
701 	if (error)
702 		trace_xfs_exchrange_error(ip2, error, _RET_IP_);
703 	return error;
704 }
705 
706 /* Exchange parts of two files. */
707 static int
708 xfs_exchange_range(
709 	struct xfs_exchrange	*fxr)
710 {
711 	struct inode		*inode1 = file_inode(fxr->file1);
712 	struct inode		*inode2 = file_inode(fxr->file2);
713 	int			ret;
714 
715 	BUILD_BUG_ON(XFS_EXCHANGE_RANGE_ALL_FLAGS &
716 		     XFS_EXCHANGE_RANGE_PRIV_FLAGS);
717 
718 	/* Both files must be on the same mount/filesystem. */
719 	if (fxr->file1->f_path.mnt != fxr->file2->f_path.mnt)
720 		return -EXDEV;
721 
722 	if (fxr->flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
723 		return -EINVAL;
724 
725 	/* Userspace requests only honored for regular files. */
726 	if (S_ISDIR(inode1->i_mode) || S_ISDIR(inode2->i_mode))
727 		return -EISDIR;
728 	if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
729 		return -EINVAL;
730 
731 	/* Both files must be opened for read and write. */
732 	if (!(fxr->file1->f_mode & FMODE_READ) ||
733 	    !(fxr->file1->f_mode & FMODE_WRITE) ||
734 	    !(fxr->file2->f_mode & FMODE_READ) ||
735 	    !(fxr->file2->f_mode & FMODE_WRITE))
736 		return -EBADF;
737 
738 	/* Neither file can be opened append-only. */
739 	if ((fxr->file1->f_flags & O_APPEND) ||
740 	    (fxr->file2->f_flags & O_APPEND))
741 		return -EBADF;
742 
743 	/*
744 	 * If we're not exchanging to EOF, we can check the areas before
745 	 * stabilizing both files' i_size.
746 	 */
747 	if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)) {
748 		ret = xfs_exchange_range_verify_area(fxr);
749 		if (ret)
750 			return ret;
751 	}
752 
753 	/* Update cmtime if the fd/inode don't forbid it. */
754 	if (!(fxr->file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1))
755 		fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME1;
756 	if (!(fxr->file2->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode2))
757 		fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME2;
758 
759 	file_start_write(fxr->file2);
760 	ret = xfs_exchrange_contents(fxr);
761 	file_end_write(fxr->file2);
762 	if (ret)
763 		return ret;
764 
765 	fsnotify_modify(fxr->file1);
766 	if (fxr->file2 != fxr->file1)
767 		fsnotify_modify(fxr->file2);
768 	return 0;
769 }
770 
771 /* Collect exchange-range arguments from userspace. */
772 long
773 xfs_ioc_exchange_range(
774 	struct file			*file,
775 	struct xfs_exchange_range __user *argp)
776 {
777 	struct xfs_exchrange		fxr = {
778 		.file2			= file,
779 	};
780 	struct xfs_exchange_range	args;
781 	struct fd			file1;
782 	int				error;
783 
784 	if (copy_from_user(&args, argp, sizeof(args)))
785 		return -EFAULT;
786 	if (memchr_inv(&args.pad, 0, sizeof(args.pad)))
787 		return -EINVAL;
788 	if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
789 		return -EINVAL;
790 
791 	fxr.file1_offset	= args.file1_offset;
792 	fxr.file2_offset	= args.file2_offset;
793 	fxr.length		= args.length;
794 	fxr.flags		= args.flags;
795 
796 	file1 = fdget(args.file1_fd);
797 	if (!file1.file)
798 		return -EBADF;
799 	fxr.file1 = file1.file;
800 
801 	error = xfs_exchange_range(&fxr);
802 	fdput(file1);
803 	return error;
804 }
805