xref: /linux/fs/xfs/xfs_exchrange.c (revision 0c0a19430bfdfedab437e77b9262e8e62ced384e)
1  // SPDX-License-Identifier: GPL-2.0-or-later
2  /*
3   * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
4   * Author: Darrick J. Wong <djwong@kernel.org>
5   */
6  #include "xfs.h"
7  #include "xfs_shared.h"
8  #include "xfs_format.h"
9  #include "xfs_log_format.h"
10  #include "xfs_trans_resv.h"
11  #include "xfs_mount.h"
12  #include "xfs_defer.h"
13  #include "xfs_inode.h"
14  #include "xfs_trans.h"
15  #include "xfs_quota.h"
16  #include "xfs_bmap_util.h"
17  #include "xfs_reflink.h"
18  #include "xfs_trace.h"
19  #include "xfs_exchrange.h"
20  #include "xfs_exchmaps.h"
21  #include "xfs_sb.h"
22  #include "xfs_icache.h"
23  #include "xfs_log.h"
24  #include "xfs_rtbitmap.h"
25  #include <linux/fsnotify.h>
26  
27  /* Lock (and optionally join) two inodes for a file range exchange. */
28  void
29  xfs_exchrange_ilock(
30  	struct xfs_trans	*tp,
31  	struct xfs_inode	*ip1,
32  	struct xfs_inode	*ip2)
33  {
34  	if (ip1 != ip2)
35  		xfs_lock_two_inodes(ip1, XFS_ILOCK_EXCL,
36  				    ip2, XFS_ILOCK_EXCL);
37  	else
38  		xfs_ilock(ip1, XFS_ILOCK_EXCL);
39  	if (tp) {
40  		xfs_trans_ijoin(tp, ip1, 0);
41  		if (ip2 != ip1)
42  			xfs_trans_ijoin(tp, ip2, 0);
43  	}
44  
45  }
46  
47  /* Unlock two inodes after a file range exchange operation. */
48  void
49  xfs_exchrange_iunlock(
50  	struct xfs_inode	*ip1,
51  	struct xfs_inode	*ip2)
52  {
53  	if (ip2 != ip1)
54  		xfs_iunlock(ip2, XFS_ILOCK_EXCL);
55  	xfs_iunlock(ip1, XFS_ILOCK_EXCL);
56  }
57  
58  /*
59   * Estimate the resource requirements to exchange file contents between the two
60   * files.  The caller is required to hold the IOLOCK and the MMAPLOCK and to
61   * have flushed both inodes' pagecache and active direct-ios.
62   */
63  int
64  xfs_exchrange_estimate(
65  	struct xfs_exchmaps_req	*req)
66  {
67  	int			error;
68  
69  	xfs_exchrange_ilock(NULL, req->ip1, req->ip2);
70  	error = xfs_exchmaps_estimate(req);
71  	xfs_exchrange_iunlock(req->ip1, req->ip2);
72  	return error;
73  }
74  
75  /*
76   * Check that file2's metadata agree with the snapshot that we took for the
77   * range commit request.
78   *
79   * This should be called after the filesystem has locked /all/ inode metadata
80   * against modification.
81   */
82  STATIC int
83  xfs_exchrange_check_freshness(
84  	const struct xfs_exchrange	*fxr,
85  	struct xfs_inode		*ip2)
86  {
87  	struct inode			*inode2 = VFS_I(ip2);
88  	struct timespec64		ctime = inode_get_ctime(inode2);
89  	struct timespec64		mtime = inode_get_mtime(inode2);
90  
91  	trace_xfs_exchrange_freshness(fxr, ip2);
92  
93  	/* Check that file2 hasn't otherwise been modified. */
94  	if (fxr->file2_ino != ip2->i_ino ||
95  	    fxr->file2_gen != inode2->i_generation ||
96  	    !timespec64_equal(&fxr->file2_ctime, &ctime) ||
97  	    !timespec64_equal(&fxr->file2_mtime, &mtime))
98  		return -EBUSY;
99  
100  	return 0;
101  }
102  
/*
 * Bits reported through the qretry argument of xfs_exchrange_reserve_quota().
 * Each bit records that the net-gain quota reservation for the named inode
 * failed with -EDQUOT or -ENOSPC; xfs_exchrange_mappings() uses them to pick
 * which inode(s) to pass to xfs_blockgc_free_quota() before retrying.
 */
#define QRETRY_IP1	(0x1)
#define QRETRY_IP2	(0x2)
105  
106  /*
107   * Obtain a quota reservation to make sure we don't hit EDQUOT.  We can skip
108   * this if quota enforcement is disabled or if both inodes' dquots are the
109   * same.  The qretry structure must be initialized to zeroes before the first
110   * call to this function.
111   */
112  STATIC int
113  xfs_exchrange_reserve_quota(
114  	struct xfs_trans		*tp,
115  	const struct xfs_exchmaps_req	*req,
116  	unsigned int			*qretry)
117  {
118  	int64_t				ddelta, rdelta;
119  	int				ip1_error = 0;
120  	int				error;
121  
122  	/*
123  	 * Don't bother with a quota reservation if we're not enforcing them
124  	 * or the two inodes have the same dquots.
125  	 */
126  	if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 ||
127  	    (req->ip1->i_udquot == req->ip2->i_udquot &&
128  	     req->ip1->i_gdquot == req->ip2->i_gdquot &&
129  	     req->ip1->i_pdquot == req->ip2->i_pdquot))
130  		return 0;
131  
132  	*qretry = 0;
133  
134  	/*
135  	 * For each file, compute the net gain in the number of regular blocks
136  	 * that will be mapped into that file and reserve that much quota.  The
137  	 * quota counts must be able to absorb at least that much space.
138  	 */
139  	ddelta = req->ip2_bcount - req->ip1_bcount;
140  	rdelta = req->ip2_rtbcount - req->ip1_rtbcount;
141  	if (ddelta > 0 || rdelta > 0) {
142  		error = xfs_trans_reserve_quota_nblks(tp, req->ip1,
143  				ddelta > 0 ? ddelta : 0,
144  				rdelta > 0 ? rdelta : 0,
145  				false);
146  		if (error == -EDQUOT || error == -ENOSPC) {
147  			/*
148  			 * Save this error and see what happens if we try to
149  			 * reserve quota for ip2.  Then report both.
150  			 */
151  			*qretry |= QRETRY_IP1;
152  			ip1_error = error;
153  			error = 0;
154  		}
155  		if (error)
156  			return error;
157  	}
158  	if (ddelta < 0 || rdelta < 0) {
159  		error = xfs_trans_reserve_quota_nblks(tp, req->ip2,
160  				ddelta < 0 ? -ddelta : 0,
161  				rdelta < 0 ? -rdelta : 0,
162  				false);
163  		if (error == -EDQUOT || error == -ENOSPC)
164  			*qretry |= QRETRY_IP2;
165  		if (error)
166  			return error;
167  	}
168  	if (ip1_error)
169  		return ip1_error;
170  
171  	/*
172  	 * For each file, forcibly reserve the gross gain in mapped blocks so
173  	 * that we don't trip over any quota block reservation assertions.
174  	 * We must reserve the gross gain because the quota code subtracts from
175  	 * bcount the number of blocks that we unmap; it does not add that
176  	 * quantity back to the quota block reservation.
177  	 */
178  	error = xfs_trans_reserve_quota_nblks(tp, req->ip1, req->ip1_bcount,
179  			req->ip1_rtbcount, true);
180  	if (error)
181  		return error;
182  
183  	return xfs_trans_reserve_quota_nblks(tp, req->ip2, req->ip2_bcount,
184  			req->ip2_rtbcount, true);
185  }
186  
187  /* Exchange the mappings (and hence the contents) of two files' forks. */
188  STATIC int
189  xfs_exchrange_mappings(
190  	const struct xfs_exchrange	*fxr,
191  	struct xfs_inode		*ip1,
192  	struct xfs_inode		*ip2)
193  {
194  	struct xfs_mount		*mp = ip1->i_mount;
195  	struct xfs_exchmaps_req		req = {
196  		.ip1			= ip1,
197  		.ip2			= ip2,
198  		.startoff1		= XFS_B_TO_FSBT(mp, fxr->file1_offset),
199  		.startoff2		= XFS_B_TO_FSBT(mp, fxr->file2_offset),
200  		.blockcount		= XFS_B_TO_FSB(mp, fxr->length),
201  	};
202  	struct xfs_trans		*tp;
203  	unsigned int			qretry;
204  	bool				retried = false;
205  	int				error;
206  
207  	trace_xfs_exchrange_mappings(fxr, ip1, ip2);
208  
209  	if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
210  		req.flags |= XFS_EXCHMAPS_SET_SIZES;
211  	if (fxr->flags & XFS_EXCHANGE_RANGE_FILE1_WRITTEN)
212  		req.flags |= XFS_EXCHMAPS_INO1_WRITTEN;
213  
214  	/*
215  	 * Round the request length up to the nearest file allocation unit.
216  	 * The prep function already checked that the request offsets and
217  	 * length in @fxr are safe to round up.
218  	 */
219  	if (xfs_inode_has_bigrtalloc(ip2))
220  		req.blockcount = xfs_blen_roundup_rtx(mp, req.blockcount);
221  
222  	error = xfs_exchrange_estimate(&req);
223  	if (error)
224  		return error;
225  
226  retry:
227  	/* Allocate the transaction, lock the inodes, and join them. */
228  	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, req.resblks, 0,
229  			XFS_TRANS_RES_FDBLKS, &tp);
230  	if (error)
231  		return error;
232  
233  	xfs_exchrange_ilock(tp, ip1, ip2);
234  
235  	trace_xfs_exchrange_before(ip2, 2);
236  	trace_xfs_exchrange_before(ip1, 1);
237  
238  	error = xfs_exchmaps_check_forks(mp, &req);
239  	if (error)
240  		goto out_trans_cancel;
241  
242  	/*
243  	 * Reserve ourselves some quota if any of them are in enforcing mode.
244  	 * In theory we only need enough to satisfy the change in the number
245  	 * of blocks between the two ranges being remapped.
246  	 */
247  	error = xfs_exchrange_reserve_quota(tp, &req, &qretry);
248  	if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
249  		xfs_trans_cancel(tp);
250  		xfs_exchrange_iunlock(ip1, ip2);
251  		if (qretry & QRETRY_IP1)
252  			xfs_blockgc_free_quota(ip1, 0);
253  		if (qretry & QRETRY_IP2)
254  			xfs_blockgc_free_quota(ip2, 0);
255  		retried = true;
256  		goto retry;
257  	}
258  	if (error)
259  		goto out_trans_cancel;
260  
261  	/* If we got this far on a dry run, all parameters are ok. */
262  	if (fxr->flags & XFS_EXCHANGE_RANGE_DRY_RUN)
263  		goto out_trans_cancel;
264  
265  	/* Update the mtime and ctime of both files. */
266  	if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME1)
267  		xfs_trans_ichgtime(tp, ip1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
268  	if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME2)
269  		xfs_trans_ichgtime(tp, ip2, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
270  
271  	xfs_exchange_mappings(tp, &req);
272  
273  	/*
274  	 * Force the log to persist metadata updates if the caller or the
275  	 * administrator requires this.  The generic prep function already
276  	 * flushed the relevant parts of the page cache.
277  	 */
278  	if (xfs_has_wsync(mp) || (fxr->flags & XFS_EXCHANGE_RANGE_DSYNC))
279  		xfs_trans_set_sync(tp);
280  
281  	error = xfs_trans_commit(tp);
282  
283  	trace_xfs_exchrange_after(ip2, 2);
284  	trace_xfs_exchrange_after(ip1, 1);
285  
286  	if (error)
287  		goto out_unlock;
288  
289  	/*
290  	 * If the caller wanted us to exchange the contents of two complete
291  	 * files of unequal length, exchange the incore sizes now.  This should
292  	 * be safe because we flushed both files' page caches, exchanged all
293  	 * the mappings, and updated the ondisk sizes.
294  	 */
295  	if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
296  		loff_t	temp;
297  
298  		temp = i_size_read(VFS_I(ip2));
299  		i_size_write(VFS_I(ip2), i_size_read(VFS_I(ip1)));
300  		i_size_write(VFS_I(ip1), temp);
301  	}
302  
303  out_unlock:
304  	xfs_exchrange_iunlock(ip1, ip2);
305  	return error;
306  
307  out_trans_cancel:
308  	xfs_trans_cancel(tp);
309  	goto out_unlock;
310  }
311  
312  /*
313   * Generic code for exchanging ranges of two files via XFS_IOC_EXCHANGE_RANGE.
314   * This part deals with struct file objects and byte ranges and does not deal
315   * with XFS-specific data structures such as xfs_inodes and block ranges.  This
316   * separation may some day facilitate porting to another filesystem.
317   *
318   * The goal is to exchange fxr.length bytes starting at fxr.file1_offset in
319   * file1 with the same number of bytes starting at fxr.file2_offset in file2.
320   * Implementations must call xfs_exchange_range_prep to prepare the two
321   * files prior to taking locks; and they must update the inode change and mod
322   * times of both files as part of the metadata update.  The timestamp update
323   * and freshness checks must be done atomically as part of the data exchange
324   * operation to ensure correctness of the freshness check.
325   * xfs_exchange_range_finish must be called after the operation completes
326   * successfully but before locks are dropped.
327   */
328  
329  /* Verify that we have security clearance to perform this operation. */
330  static int
331  xfs_exchange_range_verify_area(
332  	struct xfs_exchrange	*fxr)
333  {
334  	int			ret;
335  
336  	ret = remap_verify_area(fxr->file1, fxr->file1_offset, fxr->length,
337  			true);
338  	if (ret)
339  		return ret;
340  
341  	return remap_verify_area(fxr->file2, fxr->file2_offset, fxr->length,
342  			true);
343  }
344  
/*
 * Performs necessary checks before doing a range exchange, having stabilized
 * mutable inode attributes via i_rwsem.
 *
 * @fxr: the exchange request.  For XFS_EXCHANGE_RANGE_TO_EOF requests,
 *       fxr->length is overwritten here with the distance to the farther of
 *       the two EOFs.
 * @alloc_unit: file allocation unit size in bytes.  It is used to build a bit
 *       mask, so it is assumed to be a power of two; xfs_exchrange_prep()
 *       substitutes the fs block size when the real unit is not one.
 *
 * Returns 0 if the request may proceed, -EPERM for immutable inodes,
 * -ETXTBSY for swapfiles, -EINVAL for any range/alignment violation, or an
 * error from the area verification / write limit helpers.
 */
static inline int
xfs_exchange_range_checks(
	struct xfs_exchrange	*fxr,
	unsigned int		alloc_unit)
{
	struct inode		*inode1 = file_inode(fxr->file1);
	struct inode		*inode2 = file_inode(fxr->file2);
	uint64_t		allocmask = alloc_unit - 1;
	int64_t			test_len;
	uint64_t		blen;
	loff_t			size1, size2, tmp;
	int			error;

	/* Don't touch certain kinds of inodes */
	if (IS_IMMUTABLE(inode1) || IS_IMMUTABLE(inode2))
		return -EPERM;
	if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2))
		return -ETXTBSY;

	size1 = i_size_read(inode1);
	size2 = i_size_read(inode2);

	/* Ranges cannot start after EOF. */
	if (fxr->file1_offset > size1 || fxr->file2_offset > size2)
		return -EINVAL;

	/*
	 * If the caller said to exchange to EOF, we set the length of the
	 * request large enough to cover everything to the end of both files.
	 * The area could not be verified before the sizes were known, so do
	 * it now.
	 */
	if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
		fxr->length = max_t(int64_t, size1 - fxr->file1_offset,
					     size2 - fxr->file2_offset);

		error = xfs_exchange_range_verify_area(fxr);
		if (error)
			return error;
	}

	/*
	 * The start of both ranges must be aligned to the file allocation
	 * unit.
	 */
	if (!IS_ALIGNED(fxr->file1_offset, alloc_unit) ||
	    !IS_ALIGNED(fxr->file2_offset, alloc_unit))
		return -EINVAL;

	/* Ensure offsets don't wrap. */
	if (check_add_overflow(fxr->file1_offset, fxr->length, &tmp) ||
	    check_add_overflow(fxr->file2_offset, fxr->length, &tmp))
		return -EINVAL;

	/*
	 * We require both ranges to end within EOF, unless we're exchanging
	 * to EOF.
	 */
	if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) &&
	    (fxr->file1_offset + fxr->length > size1 ||
	     fxr->file2_offset + fxr->length > size2))
		return -EINVAL;

	/*
	 * Make sure we don't hit any file size limits.  If we hit any size
	 * limits such that test_len was adjusted, we abort the whole
	 * operation.
	 */
	test_len = fxr->length;
	error = generic_write_check_limits(fxr->file2, fxr->file2_offset,
			&test_len);
	if (error)
		return error;
	error = generic_write_check_limits(fxr->file1, fxr->file1_offset,
			&test_len);
	if (error)
		return error;
	if (test_len != fxr->length)
		return -EINVAL;

	/*
	 * If the request ends exactly at file1's EOF, round up to the next
	 * allocation unit boundary for the overlap check below.  Failing
	 * that, do the same if it ends exactly at file2's EOF.
	 *
	 * Otherwise, reject the range length if it's not aligned to an
	 * allocation unit.
	 */
	if (fxr->file1_offset + fxr->length == size1)
		blen = ALIGN(size1, alloc_unit) - fxr->file1_offset;
	else if (fxr->file2_offset + fxr->length == size2)
		blen = ALIGN(size2, alloc_unit) - fxr->file2_offset;
	else if (!IS_ALIGNED(fxr->length, alloc_unit))
		return -EINVAL;
	else
		blen = fxr->length;

	/* Don't allow overlapped exchanges within the same file. */
	if (inode1 == inode2 &&
	    fxr->file2_offset + blen > fxr->file1_offset &&
	    fxr->file1_offset + blen > fxr->file2_offset)
		return -EINVAL;

	/*
	 * Ensure that we don't exchange a partial EOF block into the middle of
	 * another file.  An aligned length has no partial block to worry
	 * about.
	 */
	if ((fxr->length & allocmask) == 0)
		return 0;

	/*
	 * The length is unaligned.  If the range stops short of EOF in either
	 * file, truncate blen down to an allocation unit so that it no longer
	 * equals the requested length and the request is rejected below.
	 */
	blen = fxr->length;
	if (fxr->file2_offset + blen < size2)
		blen &= ~allocmask;

	if (fxr->file1_offset + blen < size1)
		blen &= ~allocmask;

	return blen == fxr->length ? 0 : -EINVAL;
}
466  
467  /*
468   * Check that the two inodes are eligible for range exchanges, the ranges make
469   * sense, and then flush all dirty data.  Caller must ensure that the inodes
470   * have been locked against any other modifications.
471   */
472  static inline int
473  xfs_exchange_range_prep(
474  	struct xfs_exchrange	*fxr,
475  	unsigned int		alloc_unit)
476  {
477  	struct inode		*inode1 = file_inode(fxr->file1);
478  	struct inode		*inode2 = file_inode(fxr->file2);
479  	bool			same_inode = (inode1 == inode2);
480  	int			error;
481  
482  	/* Check that we don't violate system file offset limits. */
483  	error = xfs_exchange_range_checks(fxr, alloc_unit);
484  	if (error || fxr->length == 0)
485  		return error;
486  
487  	/* Wait for the completion of any pending IOs on both files */
488  	inode_dio_wait(inode1);
489  	if (!same_inode)
490  		inode_dio_wait(inode2);
491  
492  	error = filemap_write_and_wait_range(inode1->i_mapping,
493  			fxr->file1_offset,
494  			fxr->file1_offset + fxr->length - 1);
495  	if (error)
496  		return error;
497  
498  	error = filemap_write_and_wait_range(inode2->i_mapping,
499  			fxr->file2_offset,
500  			fxr->file2_offset + fxr->length - 1);
501  	if (error)
502  		return error;
503  
504  	/*
505  	 * If the files or inodes involved require synchronous writes, amend
506  	 * the request to force the filesystem to flush all data and metadata
507  	 * to disk after the operation completes.
508  	 */
509  	if (((fxr->file1->f_flags | fxr->file2->f_flags) & O_SYNC) ||
510  	    IS_SYNC(inode1) || IS_SYNC(inode2))
511  		fxr->flags |= XFS_EXCHANGE_RANGE_DSYNC;
512  
513  	return 0;
514  }
515  
516  /*
517   * Finish a range exchange operation, if it was successful.  Caller must ensure
518   * that the inodes are still locked against any other modifications.
519   */
520  static inline int
521  xfs_exchange_range_finish(
522  	struct xfs_exchrange	*fxr)
523  {
524  	int			error;
525  
526  	error = file_remove_privs(fxr->file1);
527  	if (error)
528  		return error;
529  	if (file_inode(fxr->file1) == file_inode(fxr->file2))
530  		return 0;
531  
532  	return file_remove_privs(fxr->file2);
533  }
534  
/*
 * Check the alignment of an exchange request when the allocation unit size
 * isn't a power of two.  The generic file-level helpers use (fast)
 * bitmask-based alignment checks, but here we have to use slow long division.
 *
 * @fxr: the exchange request; it is not modified.  For
 *       XFS_EXCHANGE_RANGE_TO_EOF requests the effective length is computed
 *       locally from the two file sizes.
 * @ip1, @ip2: the inodes being exchanged.
 * @alloc_unit: allocation unit size in bytes.
 *
 * Returns 0 if the request is acceptably aligned, or -EINVAL.
 */
static int
xfs_exchrange_check_rtalign(
	const struct xfs_exchrange	*fxr,
	struct xfs_inode		*ip1,
	struct xfs_inode		*ip2,
	unsigned int			alloc_unit)
{
	uint64_t			length = fxr->length;
	uint64_t			blen;
	loff_t				size1, size2;

	size1 = i_size_read(VFS_I(ip1));
	size2 = i_size_read(VFS_I(ip2));

	/* The start of both ranges must be aligned to a rt extent. */
	if (!isaligned_64(fxr->file1_offset, alloc_unit) ||
	    !isaligned_64(fxr->file2_offset, alloc_unit))
		return -EINVAL;

	/* Exchanging to EOF: cover everything to the end of both files. */
	if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
		length = max_t(int64_t, size1 - fxr->file1_offset,
					size2 - fxr->file2_offset);

	/*
	 * If the request ends exactly at file1's EOF, round up to the next rt
	 * extent boundary for the overlap check below.  Failing that, do the
	 * same if it ends exactly at file2's EOF.
	 *
	 * Otherwise, reject the range length if it's not rt extent aligned.
	 * We already confirmed the starting offsets' rt extent block
	 * alignment.
	 */
	if (fxr->file1_offset + length == size1)
		blen = roundup_64(size1, alloc_unit) - fxr->file1_offset;
	else if (fxr->file2_offset + length == size2)
		blen = roundup_64(size2, alloc_unit) - fxr->file2_offset;
	else if (!isaligned_64(length, alloc_unit))
		return -EINVAL;
	else
		blen = length;

	/* Don't allow overlapped exchanges within the same file. */
	if (ip1 == ip2 &&
	    fxr->file2_offset + blen > fxr->file1_offset &&
	    fxr->file1_offset + blen > fxr->file2_offset)
		return -EINVAL;

	/*
	 * Ensure that we don't exchange a partial EOF rt extent into the
	 * middle of another file.  An aligned length has no partial extent to
	 * worry about.
	 */
	if (isaligned_64(length, alloc_unit))
		return 0;

	/*
	 * The length is unaligned.  If the range stops short of EOF in either
	 * file, round blen down so that it no longer equals the length and
	 * the request is rejected below.
	 */
	blen = length;
	if (fxr->file2_offset + length < size2)
		blen = rounddown_64(blen, alloc_unit);

	if (fxr->file1_offset + blen < size1)
		blen = rounddown_64(blen, alloc_unit);

	return blen == length ? 0 : -EINVAL;
}
603  
604  /* Prepare two files to have their data exchanged. */
605  STATIC int
606  xfs_exchrange_prep(
607  	struct xfs_exchrange	*fxr,
608  	struct xfs_inode	*ip1,
609  	struct xfs_inode	*ip2)
610  {
611  	struct xfs_mount	*mp = ip2->i_mount;
612  	unsigned int		alloc_unit = xfs_inode_alloc_unitsize(ip2);
613  	int			error;
614  
615  	trace_xfs_exchrange_prep(fxr, ip1, ip2);
616  
617  	/* Verify both files are either real-time or non-realtime */
618  	if (XFS_IS_REALTIME_INODE(ip1) != XFS_IS_REALTIME_INODE(ip2))
619  		return -EINVAL;
620  
621  	/* Check non-power of two alignment issues, if necessary. */
622  	if (!is_power_of_2(alloc_unit)) {
623  		error = xfs_exchrange_check_rtalign(fxr, ip1, ip2, alloc_unit);
624  		if (error)
625  			return error;
626  
627  		/*
628  		 * Do the generic file-level checks with the regular block
629  		 * alignment.
630  		 */
631  		alloc_unit = mp->m_sb.sb_blocksize;
632  	}
633  
634  	error = xfs_exchange_range_prep(fxr, alloc_unit);
635  	if (error || fxr->length == 0)
636  		return error;
637  
638  	if (fxr->flags & __XFS_EXCHANGE_RANGE_CHECK_FRESH2) {
639  		error = xfs_exchrange_check_freshness(fxr, ip2);
640  		if (error)
641  			return error;
642  	}
643  
644  	/* Attach dquots to both inodes before changing block maps. */
645  	error = xfs_qm_dqattach(ip2);
646  	if (error)
647  		return error;
648  	error = xfs_qm_dqattach(ip1);
649  	if (error)
650  		return error;
651  
652  	trace_xfs_exchrange_flush(fxr, ip1, ip2);
653  
654  	/* Flush the relevant ranges of both files. */
655  	error = xfs_flush_unmap_range(ip2, fxr->file2_offset, fxr->length);
656  	if (error)
657  		return error;
658  	error = xfs_flush_unmap_range(ip1, fxr->file1_offset, fxr->length);
659  	if (error)
660  		return error;
661  
662  	/*
663  	 * Cancel CoW fork preallocations for the ranges of both files.  The
664  	 * prep function should have flushed all the dirty data, so the only
665  	 * CoW mappings remaining should be speculative.
666  	 */
667  	if (xfs_inode_has_cow_data(ip1)) {
668  		error = xfs_reflink_cancel_cow_range(ip1, fxr->file1_offset,
669  				fxr->length, true);
670  		if (error)
671  			return error;
672  	}
673  
674  	if (xfs_inode_has_cow_data(ip2)) {
675  		error = xfs_reflink_cancel_cow_range(ip2, fxr->file2_offset,
676  				fxr->length, true);
677  		if (error)
678  			return error;
679  	}
680  
681  	return 0;
682  }
683  
684  /*
685   * Exchange contents of files.  This is the binding between the generic
686   * file-level concepts and the XFS inode-specific implementation.
687   */
688  STATIC int
689  xfs_exchrange_contents(
690  	struct xfs_exchrange	*fxr)
691  {
692  	struct inode		*inode1 = file_inode(fxr->file1);
693  	struct inode		*inode2 = file_inode(fxr->file2);
694  	struct xfs_inode	*ip1 = XFS_I(inode1);
695  	struct xfs_inode	*ip2 = XFS_I(inode2);
696  	struct xfs_mount	*mp = ip1->i_mount;
697  	int			error;
698  
699  	if (!xfs_has_exchange_range(mp))
700  		return -EOPNOTSUPP;
701  
702  	if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS |
703  			   XFS_EXCHANGE_RANGE_PRIV_FLAGS))
704  		return -EINVAL;
705  
706  	if (xfs_is_shutdown(mp))
707  		return -EIO;
708  
709  	/* Lock both files against IO */
710  	error = xfs_ilock2_io_mmap(ip1, ip2);
711  	if (error)
712  		goto out_err;
713  
714  	/* Prepare and then exchange file contents. */
715  	error = xfs_exchrange_prep(fxr, ip1, ip2);
716  	if (error)
717  		goto out_unlock;
718  
719  	error = xfs_exchrange_mappings(fxr, ip1, ip2);
720  	if (error)
721  		goto out_unlock;
722  
723  	/*
724  	 * Finish the exchange by removing special file privileges like any
725  	 * other file write would do.  This may involve turning on support for
726  	 * logged xattrs if either file has security capabilities.
727  	 */
728  	error = xfs_exchange_range_finish(fxr);
729  	if (error)
730  		goto out_unlock;
731  
732  out_unlock:
733  	xfs_iunlock2_io_mmap(ip1, ip2);
734  out_err:
735  	if (error)
736  		trace_xfs_exchrange_error(ip2, error, _RET_IP_);
737  	return error;
738  }
739  
740  /* Exchange parts of two files. */
741  static int
742  xfs_exchange_range(
743  	struct xfs_exchrange	*fxr)
744  {
745  	struct inode		*inode1 = file_inode(fxr->file1);
746  	struct inode		*inode2 = file_inode(fxr->file2);
747  	int			ret;
748  
749  	BUILD_BUG_ON(XFS_EXCHANGE_RANGE_ALL_FLAGS &
750  		     XFS_EXCHANGE_RANGE_PRIV_FLAGS);
751  
752  	/* Both files must be on the same mount/filesystem. */
753  	if (fxr->file1->f_path.mnt != fxr->file2->f_path.mnt)
754  		return -EXDEV;
755  
756  	if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS |
757  			 __XFS_EXCHANGE_RANGE_CHECK_FRESH2))
758  		return -EINVAL;
759  
760  	/* Userspace requests only honored for regular files. */
761  	if (S_ISDIR(inode1->i_mode) || S_ISDIR(inode2->i_mode))
762  		return -EISDIR;
763  	if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
764  		return -EINVAL;
765  
766  	/* Both files must be opened for read and write. */
767  	if (!(fxr->file1->f_mode & FMODE_READ) ||
768  	    !(fxr->file1->f_mode & FMODE_WRITE) ||
769  	    !(fxr->file2->f_mode & FMODE_READ) ||
770  	    !(fxr->file2->f_mode & FMODE_WRITE))
771  		return -EBADF;
772  
773  	/* Neither file can be opened append-only. */
774  	if ((fxr->file1->f_flags & O_APPEND) ||
775  	    (fxr->file2->f_flags & O_APPEND))
776  		return -EBADF;
777  
778  	/*
779  	 * If we're not exchanging to EOF, we can check the areas before
780  	 * stabilizing both files' i_size.
781  	 */
782  	if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)) {
783  		ret = xfs_exchange_range_verify_area(fxr);
784  		if (ret)
785  			return ret;
786  	}
787  
788  	/* Update cmtime if the fd/inode don't forbid it. */
789  	if (!(fxr->file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1))
790  		fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME1;
791  	if (!(fxr->file2->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode2))
792  		fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME2;
793  
794  	file_start_write(fxr->file2);
795  	ret = xfs_exchrange_contents(fxr);
796  	file_end_write(fxr->file2);
797  	if (ret)
798  		return ret;
799  
800  	fsnotify_modify(fxr->file1);
801  	if (fxr->file2 != fxr->file1)
802  		fsnotify_modify(fxr->file2);
803  	return 0;
804  }
805  
806  /* Collect exchange-range arguments from userspace. */
807  long
808  xfs_ioc_exchange_range(
809  	struct file			*file,
810  	struct xfs_exchange_range __user *argp)
811  {
812  	struct xfs_exchrange		fxr = {
813  		.file2			= file,
814  	};
815  	struct xfs_exchange_range	args;
816  
817  	if (copy_from_user(&args, argp, sizeof(args)))
818  		return -EFAULT;
819  	if (memchr_inv(&args.pad, 0, sizeof(args.pad)))
820  		return -EINVAL;
821  	if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
822  		return -EINVAL;
823  
824  	fxr.file1_offset	= args.file1_offset;
825  	fxr.file2_offset	= args.file2_offset;
826  	fxr.length		= args.length;
827  	fxr.flags		= args.flags;
828  
829  	CLASS(fd, file1)(args.file1_fd);
830  	if (fd_empty(file1))
831  		return -EBADF;
832  	fxr.file1 = fd_file(file1);
833  
834  	return xfs_exchange_range(&fxr);
835  }
836  
/*
 * Opaque freshness blob for XFS_IOC_COMMIT_RANGE.  xfs_ioc_start_commit()
 * fills it in from file2 and hands it to userspace;
 * xfs_ioc_commit_range() reads it back to decide whether file2 has changed.
 * Its size must equal that of the file2_freshness field of struct
 * xfs_commit_range (enforced by a BUILD_BUG_ON in xfs_ioc_start_commit()).
 */
struct xfs_commit_range_fresh {
	xfs_fsid_t	fsid;		/* m_fixedfsid */
	__u64		file2_ino;	/* inode number */
	__s64		file2_mtime;	/* modification time */
	__s64		file2_ctime;	/* change time */
	__s32		file2_mtime_nsec; /* mod time, nsec */
	__s32		file2_ctime_nsec; /* change time, nsec */
	__u32		file2_gen;	/* inode generation */
	__u32		magic;		/* XCR_FRESH_MAGIC */
};
/* Value stored in, and required of, xfs_commit_range_fresh.magic. */
#define XCR_FRESH_MAGIC	0x444F524B	/* DORK */
849  
850  /* Set up a commitrange operation by sampling file2's write-related attrs */
851  long
852  xfs_ioc_start_commit(
853  	struct file			*file,
854  	struct xfs_commit_range __user	*argp)
855  {
856  	struct xfs_commit_range		args = { };
857  	struct timespec64		ts;
858  	struct xfs_commit_range_fresh	*kern_f;
859  	struct xfs_commit_range_fresh	__user *user_f;
860  	struct inode			*inode2 = file_inode(file);
861  	struct xfs_inode		*ip2 = XFS_I(inode2);
862  	const unsigned int		lockflags = XFS_IOLOCK_SHARED |
863  						    XFS_MMAPLOCK_SHARED |
864  						    XFS_ILOCK_SHARED;
865  
866  	BUILD_BUG_ON(sizeof(struct xfs_commit_range_fresh) !=
867  		     sizeof(args.file2_freshness));
868  
869  	kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness;
870  
871  	memcpy(&kern_f->fsid, ip2->i_mount->m_fixedfsid, sizeof(xfs_fsid_t));
872  
873  	xfs_ilock(ip2, lockflags);
874  	ts = inode_get_ctime(inode2);
875  	kern_f->file2_ctime		= ts.tv_sec;
876  	kern_f->file2_ctime_nsec	= ts.tv_nsec;
877  	ts = inode_get_mtime(inode2);
878  	kern_f->file2_mtime		= ts.tv_sec;
879  	kern_f->file2_mtime_nsec	= ts.tv_nsec;
880  	kern_f->file2_ino		= ip2->i_ino;
881  	kern_f->file2_gen		= inode2->i_generation;
882  	kern_f->magic			= XCR_FRESH_MAGIC;
883  	xfs_iunlock(ip2, lockflags);
884  
885  	user_f = (struct xfs_commit_range_fresh __user *)&argp->file2_freshness;
886  	if (copy_to_user(user_f, kern_f, sizeof(*kern_f)))
887  		return -EFAULT;
888  
889  	return 0;
890  }
891  
892  /*
893   * Exchange file1 and file2 contents if file2 has not been written since the
894   * start commit operation.
895   */
896  long
897  xfs_ioc_commit_range(
898  	struct file			*file,
899  	struct xfs_commit_range __user	*argp)
900  {
901  	struct xfs_exchrange		fxr = {
902  		.file2			= file,
903  	};
904  	struct xfs_commit_range		args;
905  	struct xfs_commit_range_fresh	*kern_f;
906  	struct xfs_inode		*ip2 = XFS_I(file_inode(file));
907  	struct xfs_mount		*mp = ip2->i_mount;
908  
909  	kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness;
910  
911  	if (copy_from_user(&args, argp, sizeof(args)))
912  		return -EFAULT;
913  	if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
914  		return -EINVAL;
915  	if (kern_f->magic != XCR_FRESH_MAGIC)
916  		return -EBUSY;
917  	if (memcmp(&kern_f->fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)))
918  		return -EBUSY;
919  
920  	fxr.file1_offset	= args.file1_offset;
921  	fxr.file2_offset	= args.file2_offset;
922  	fxr.length		= args.length;
923  	fxr.flags		= args.flags | __XFS_EXCHANGE_RANGE_CHECK_FRESH2;
924  	fxr.file2_ino		= kern_f->file2_ino;
925  	fxr.file2_gen		= kern_f->file2_gen;
926  	fxr.file2_mtime.tv_sec	= kern_f->file2_mtime;
927  	fxr.file2_mtime.tv_nsec	= kern_f->file2_mtime_nsec;
928  	fxr.file2_ctime.tv_sec	= kern_f->file2_ctime;
929  	fxr.file2_ctime.tv_nsec	= kern_f->file2_ctime_nsec;
930  
931  	CLASS(fd, file1)(args.file1_fd);
932  	if (fd_empty(file1))
933  		return -EBADF;
934  	fxr.file1 = fd_file(file1);
935  
936  	return xfs_exchange_range(&fxr);
937  }
938