1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
5 */
6 #include "xfs.h"
7 #include "xfs_shared.h"
8 #include "xfs_format.h"
9 #include "xfs_log_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_defer.h"
13 #include "xfs_inode.h"
14 #include "xfs_trans.h"
15 #include "xfs_quota.h"
16 #include "xfs_bmap_util.h"
17 #include "xfs_reflink.h"
18 #include "xfs_trace.h"
19 #include "xfs_exchrange.h"
20 #include "xfs_exchmaps.h"
21 #include "xfs_sb.h"
22 #include "xfs_icache.h"
23 #include "xfs_log.h"
24 #include "xfs_rtbitmap.h"
25 #include <linux/fsnotify.h>
26
27 /* Lock (and optionally join) two inodes for a file range exchange. */
28 void
xfs_exchrange_ilock(struct xfs_trans * tp,struct xfs_inode * ip1,struct xfs_inode * ip2)29 xfs_exchrange_ilock(
30 struct xfs_trans *tp,
31 struct xfs_inode *ip1,
32 struct xfs_inode *ip2)
33 {
34 if (ip1 != ip2)
35 xfs_lock_two_inodes(ip1, XFS_ILOCK_EXCL,
36 ip2, XFS_ILOCK_EXCL);
37 else
38 xfs_ilock(ip1, XFS_ILOCK_EXCL);
39 if (tp) {
40 xfs_trans_ijoin(tp, ip1, 0);
41 if (ip2 != ip1)
42 xfs_trans_ijoin(tp, ip2, 0);
43 }
44
45 }
46
47 /* Unlock two inodes after a file range exchange operation. */
48 void
xfs_exchrange_iunlock(struct xfs_inode * ip1,struct xfs_inode * ip2)49 xfs_exchrange_iunlock(
50 struct xfs_inode *ip1,
51 struct xfs_inode *ip2)
52 {
53 if (ip2 != ip1)
54 xfs_iunlock(ip2, XFS_ILOCK_EXCL);
55 xfs_iunlock(ip1, XFS_ILOCK_EXCL);
56 }
57
58 /*
59 * Estimate the resource requirements to exchange file contents between the two
60 * files. The caller is required to hold the IOLOCK and the MMAPLOCK and to
61 * have flushed both inodes' pagecache and active direct-ios.
62 */
63 int
xfs_exchrange_estimate(struct xfs_exchmaps_req * req)64 xfs_exchrange_estimate(
65 struct xfs_exchmaps_req *req)
66 {
67 int error;
68
69 xfs_exchrange_ilock(NULL, req->ip1, req->ip2);
70 error = xfs_exchmaps_estimate(req);
71 xfs_exchrange_iunlock(req->ip1, req->ip2);
72 return error;
73 }
74
75 /*
76 * Check that file2's metadata agree with the snapshot that we took for the
77 * range commit request.
78 *
79 * This should be called after the filesystem has locked /all/ inode metadata
80 * against modification.
81 */
82 STATIC int
xfs_exchrange_check_freshness(const struct xfs_exchrange * fxr,struct xfs_inode * ip2)83 xfs_exchrange_check_freshness(
84 const struct xfs_exchrange *fxr,
85 struct xfs_inode *ip2)
86 {
87 struct inode *inode2 = VFS_I(ip2);
88 struct timespec64 ctime = inode_get_ctime(inode2);
89 struct timespec64 mtime = inode_get_mtime(inode2);
90
91 trace_xfs_exchrange_freshness(fxr, ip2);
92
93 /* Check that file2 hasn't otherwise been modified. */
94 if (fxr->file2_ino != ip2->i_ino ||
95 fxr->file2_gen != inode2->i_generation ||
96 !timespec64_equal(&fxr->file2_ctime, &ctime) ||
97 !timespec64_equal(&fxr->file2_mtime, &mtime))
98 return -EBUSY;
99
100 return 0;
101 }
102
103 #define QRETRY_IP1 (0x1)
104 #define QRETRY_IP2 (0x2)
105
106 /*
107 * Obtain a quota reservation to make sure we don't hit EDQUOT. We can skip
108 * this if quota enforcement is disabled or if both inodes' dquots are the
109 * same. The qretry structure must be initialized to zeroes before the first
110 * call to this function.
111 */
112 STATIC int
xfs_exchrange_reserve_quota(struct xfs_trans * tp,const struct xfs_exchmaps_req * req,unsigned int * qretry)113 xfs_exchrange_reserve_quota(
114 struct xfs_trans *tp,
115 const struct xfs_exchmaps_req *req,
116 unsigned int *qretry)
117 {
118 int64_t ddelta, rdelta;
119 int ip1_error = 0;
120 int error;
121
122 ASSERT(!xfs_is_metadir_inode(req->ip1));
123 ASSERT(!xfs_is_metadir_inode(req->ip2));
124
125 /*
126 * Don't bother with a quota reservation if we're not enforcing them
127 * or the two inodes have the same dquots.
128 */
129 if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 ||
130 (req->ip1->i_udquot == req->ip2->i_udquot &&
131 req->ip1->i_gdquot == req->ip2->i_gdquot &&
132 req->ip1->i_pdquot == req->ip2->i_pdquot))
133 return 0;
134
135 *qretry = 0;
136
137 /*
138 * For each file, compute the net gain in the number of regular blocks
139 * that will be mapped into that file and reserve that much quota. The
140 * quota counts must be able to absorb at least that much space.
141 */
142 ddelta = req->ip2_bcount - req->ip1_bcount;
143 rdelta = req->ip2_rtbcount - req->ip1_rtbcount;
144 if (ddelta > 0 || rdelta > 0) {
145 error = xfs_trans_reserve_quota_nblks(tp, req->ip1,
146 ddelta > 0 ? ddelta : 0,
147 rdelta > 0 ? rdelta : 0,
148 false);
149 if (error == -EDQUOT || error == -ENOSPC) {
150 /*
151 * Save this error and see what happens if we try to
152 * reserve quota for ip2. Then report both.
153 */
154 *qretry |= QRETRY_IP1;
155 ip1_error = error;
156 error = 0;
157 }
158 if (error)
159 return error;
160 }
161 if (ddelta < 0 || rdelta < 0) {
162 error = xfs_trans_reserve_quota_nblks(tp, req->ip2,
163 ddelta < 0 ? -ddelta : 0,
164 rdelta < 0 ? -rdelta : 0,
165 false);
166 if (error == -EDQUOT || error == -ENOSPC)
167 *qretry |= QRETRY_IP2;
168 if (error)
169 return error;
170 }
171 if (ip1_error)
172 return ip1_error;
173
174 /*
175 * For each file, forcibly reserve the gross gain in mapped blocks so
176 * that we don't trip over any quota block reservation assertions.
177 * We must reserve the gross gain because the quota code subtracts from
178 * bcount the number of blocks that we unmap; it does not add that
179 * quantity back to the quota block reservation.
180 */
181 error = xfs_trans_reserve_quota_nblks(tp, req->ip1, req->ip1_bcount,
182 req->ip1_rtbcount, true);
183 if (error)
184 return error;
185
186 return xfs_trans_reserve_quota_nblks(tp, req->ip2, req->ip2_bcount,
187 req->ip2_rtbcount, true);
188 }
189
190 /* Exchange the mappings (and hence the contents) of two files' forks. */
191 STATIC int
xfs_exchrange_mappings(const struct xfs_exchrange * fxr,struct xfs_inode * ip1,struct xfs_inode * ip2)192 xfs_exchrange_mappings(
193 const struct xfs_exchrange *fxr,
194 struct xfs_inode *ip1,
195 struct xfs_inode *ip2)
196 {
197 struct xfs_mount *mp = ip1->i_mount;
198 struct xfs_exchmaps_req req = {
199 .ip1 = ip1,
200 .ip2 = ip2,
201 .startoff1 = XFS_B_TO_FSBT(mp, fxr->file1_offset),
202 .startoff2 = XFS_B_TO_FSBT(mp, fxr->file2_offset),
203 .blockcount = XFS_B_TO_FSB(mp, fxr->length),
204 };
205 struct xfs_trans *tp;
206 unsigned int qretry;
207 bool retried = false;
208 int error;
209
210 trace_xfs_exchrange_mappings(fxr, ip1, ip2);
211
212 if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
213 req.flags |= XFS_EXCHMAPS_SET_SIZES;
214 if (fxr->flags & XFS_EXCHANGE_RANGE_FILE1_WRITTEN)
215 req.flags |= XFS_EXCHMAPS_INO1_WRITTEN;
216
217 /*
218 * Round the request length up to the nearest file allocation unit.
219 * The prep function already checked that the request offsets and
220 * length in @fxr are safe to round up.
221 */
222 if (xfs_inode_has_bigrtalloc(ip2))
223 req.blockcount = xfs_blen_roundup_rtx(mp, req.blockcount);
224
225 error = xfs_exchrange_estimate(&req);
226 if (error)
227 return error;
228
229 retry:
230 /* Allocate the transaction, lock the inodes, and join them. */
231 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, req.resblks, 0,
232 XFS_TRANS_RES_FDBLKS, &tp);
233 if (error)
234 return error;
235
236 xfs_exchrange_ilock(tp, ip1, ip2);
237
238 trace_xfs_exchrange_before(ip2, 2);
239 trace_xfs_exchrange_before(ip1, 1);
240
241 error = xfs_exchmaps_check_forks(mp, &req);
242 if (error)
243 goto out_trans_cancel;
244
245 /*
246 * Reserve ourselves some quota if any of them are in enforcing mode.
247 * In theory we only need enough to satisfy the change in the number
248 * of blocks between the two ranges being remapped.
249 */
250 error = xfs_exchrange_reserve_quota(tp, &req, &qretry);
251 if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
252 xfs_trans_cancel(tp);
253 xfs_exchrange_iunlock(ip1, ip2);
254 if (qretry & QRETRY_IP1)
255 xfs_blockgc_free_quota(ip1, 0);
256 if (qretry & QRETRY_IP2)
257 xfs_blockgc_free_quota(ip2, 0);
258 retried = true;
259 goto retry;
260 }
261 if (error)
262 goto out_trans_cancel;
263
264 /* If we got this far on a dry run, all parameters are ok. */
265 if (fxr->flags & XFS_EXCHANGE_RANGE_DRY_RUN)
266 goto out_trans_cancel;
267
268 /* Update the mtime and ctime of both files. */
269 if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME1)
270 xfs_trans_ichgtime(tp, ip1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
271 if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME2)
272 xfs_trans_ichgtime(tp, ip2, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
273
274 xfs_exchange_mappings(tp, &req);
275
276 /*
277 * Force the log to persist metadata updates if the caller or the
278 * administrator requires this. The generic prep function already
279 * flushed the relevant parts of the page cache.
280 */
281 if (xfs_has_wsync(mp) || (fxr->flags & XFS_EXCHANGE_RANGE_DSYNC))
282 xfs_trans_set_sync(tp);
283
284 error = xfs_trans_commit(tp);
285
286 trace_xfs_exchrange_after(ip2, 2);
287 trace_xfs_exchrange_after(ip1, 1);
288
289 if (error)
290 goto out_unlock;
291
292 /*
293 * If the caller wanted us to exchange the contents of two complete
294 * files of unequal length, exchange the incore sizes now. This should
295 * be safe because we flushed both files' page caches, exchanged all
296 * the mappings, and updated the ondisk sizes.
297 */
298 if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
299 loff_t temp;
300
301 temp = i_size_read(VFS_I(ip2));
302 i_size_write(VFS_I(ip2), i_size_read(VFS_I(ip1)));
303 i_size_write(VFS_I(ip1), temp);
304 }
305
306 out_unlock:
307 xfs_exchrange_iunlock(ip1, ip2);
308 return error;
309
310 out_trans_cancel:
311 xfs_trans_cancel(tp);
312 goto out_unlock;
313 }
314
/*
 * Generic code for exchanging ranges of two files via XFS_IOC_EXCHANGE_RANGE.
 * This part deals with struct file objects and byte ranges and does not deal
 * with XFS-specific data structures such as xfs_inodes and block ranges.  This
 * separation may some day facilitate porting to another filesystem.
 *
 * The goal is to exchange fxr.length bytes starting at fxr.file1_offset in
 * file1 with the same number of bytes starting at fxr.file2_offset in file2.
 * Implementations must call xfs_exchange_range_prep to prepare the two
 * files prior to taking locks, and they must update the inode change and
 * modification times of both files as part of the metadata update.  The
 * timestamp update and freshness checks must be done atomically as part of
 * the data exchange operation to ensure correctness of the freshness check.
 * xfs_exchange_range_finish must be called after the operation completes
 * successfully but before locks are dropped.
 */
331
332 /*
333 * Performs necessary checks before doing a range exchange, having stabilized
334 * mutable inode attributes via i_rwsem.
335 */
336 static inline int
xfs_exchange_range_checks(struct xfs_exchrange * fxr,unsigned int alloc_unit)337 xfs_exchange_range_checks(
338 struct xfs_exchrange *fxr,
339 unsigned int alloc_unit)
340 {
341 struct inode *inode1 = file_inode(fxr->file1);
342 loff_t size1 = i_size_read(inode1);
343 struct inode *inode2 = file_inode(fxr->file2);
344 loff_t size2 = i_size_read(inode2);
345 uint64_t allocmask = alloc_unit - 1;
346 int64_t test_len;
347 uint64_t blen;
348 loff_t tmp;
349 int error;
350
351 /* Don't touch certain kinds of inodes */
352 if (IS_IMMUTABLE(inode1) || IS_IMMUTABLE(inode2))
353 return -EPERM;
354 if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2))
355 return -ETXTBSY;
356
357 /* Ranges cannot start after EOF. */
358 if (fxr->file1_offset > size1 || fxr->file2_offset > size2)
359 return -EINVAL;
360
361 if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
362 /*
363 * If the caller said to exchange to EOF, we set the length of
364 * the request large enough to cover everything to the end of
365 * both files.
366 */
367 fxr->length = max_t(int64_t, size1 - fxr->file1_offset,
368 size2 - fxr->file2_offset);
369 } else {
370 /*
371 * Otherwise we require both ranges to end within EOF.
372 */
373 if (fxr->file1_offset + fxr->length > size1 ||
374 fxr->file2_offset + fxr->length > size2)
375 return -EINVAL;
376 }
377
378 /*
379 * The start of both ranges must be aligned to the file allocation
380 * unit.
381 */
382 if (!IS_ALIGNED(fxr->file1_offset, alloc_unit) ||
383 !IS_ALIGNED(fxr->file2_offset, alloc_unit))
384 return -EINVAL;
385
386 /* Ensure offsets don't wrap. */
387 if (check_add_overflow(fxr->file1_offset, fxr->length, &tmp) ||
388 check_add_overflow(fxr->file2_offset, fxr->length, &tmp))
389 return -EINVAL;
390
391 /*
392 * Make sure we don't hit any file size limits. If we hit any size
393 * limits such that test_length was adjusted, we abort the whole
394 * operation.
395 */
396 test_len = fxr->length;
397 error = generic_write_check_limits(fxr->file2, fxr->file2_offset,
398 &test_len);
399 if (error)
400 return error;
401 error = generic_write_check_limits(fxr->file1, fxr->file1_offset,
402 &test_len);
403 if (error)
404 return error;
405 if (test_len != fxr->length)
406 return -EINVAL;
407
408 /*
409 * If the user wanted us to exchange up to the infile's EOF, round up
410 * to the next allocation unit boundary for this check. Do the same
411 * for the outfile.
412 *
413 * Otherwise, reject the range length if it's not aligned to an
414 * allocation unit.
415 */
416 if (fxr->file1_offset + fxr->length == size1)
417 blen = ALIGN(size1, alloc_unit) - fxr->file1_offset;
418 else if (fxr->file2_offset + fxr->length == size2)
419 blen = ALIGN(size2, alloc_unit) - fxr->file2_offset;
420 else if (!IS_ALIGNED(fxr->length, alloc_unit))
421 return -EINVAL;
422 else
423 blen = fxr->length;
424
425 /* Don't allow overlapped exchanges within the same file. */
426 if (inode1 == inode2 &&
427 fxr->file2_offset + blen > fxr->file1_offset &&
428 fxr->file1_offset + blen > fxr->file2_offset)
429 return -EINVAL;
430
431 /*
432 * Ensure that we don't exchange a partial EOF block into the middle of
433 * another file.
434 */
435 if ((fxr->length & allocmask) == 0)
436 return 0;
437
438 blen = fxr->length;
439 if (fxr->file2_offset + blen < size2)
440 blen &= ~allocmask;
441
442 if (fxr->file1_offset + blen < size1)
443 blen &= ~allocmask;
444
445 return blen == fxr->length ? 0 : -EINVAL;
446 }
447
448 /*
449 * Check that the two inodes are eligible for range exchanges, the ranges make
450 * sense, and then flush all dirty data. Caller must ensure that the inodes
451 * have been locked against any other modifications.
452 */
453 static inline int
xfs_exchange_range_prep(struct xfs_exchrange * fxr,unsigned int alloc_unit)454 xfs_exchange_range_prep(
455 struct xfs_exchrange *fxr,
456 unsigned int alloc_unit)
457 {
458 struct inode *inode1 = file_inode(fxr->file1);
459 struct inode *inode2 = file_inode(fxr->file2);
460 bool same_inode = (inode1 == inode2);
461 int error;
462
463 /* Check that we don't violate system file offset limits. */
464 error = xfs_exchange_range_checks(fxr, alloc_unit);
465 if (error || fxr->length == 0)
466 return error;
467
468 /* Wait for the completion of any pending IOs on both files */
469 inode_dio_wait(inode1);
470 if (!same_inode)
471 inode_dio_wait(inode2);
472
473 error = filemap_write_and_wait_range(inode1->i_mapping,
474 fxr->file1_offset,
475 fxr->file1_offset + fxr->length - 1);
476 if (error)
477 return error;
478
479 error = filemap_write_and_wait_range(inode2->i_mapping,
480 fxr->file2_offset,
481 fxr->file2_offset + fxr->length - 1);
482 if (error)
483 return error;
484
485 /*
486 * If the files or inodes involved require synchronous writes, amend
487 * the request to force the filesystem to flush all data and metadata
488 * to disk after the operation completes.
489 */
490 if (((fxr->file1->f_flags | fxr->file2->f_flags) & O_SYNC) ||
491 IS_SYNC(inode1) || IS_SYNC(inode2))
492 fxr->flags |= XFS_EXCHANGE_RANGE_DSYNC;
493
494 return 0;
495 }
496
497 /*
498 * Finish a range exchange operation, if it was successful. Caller must ensure
499 * that the inodes are still locked against any other modifications.
500 */
501 static inline int
xfs_exchange_range_finish(struct xfs_exchrange * fxr)502 xfs_exchange_range_finish(
503 struct xfs_exchrange *fxr)
504 {
505 int error;
506
507 error = file_remove_privs(fxr->file1);
508 if (error)
509 return error;
510 if (file_inode(fxr->file1) == file_inode(fxr->file2))
511 return 0;
512
513 return file_remove_privs(fxr->file2);
514 }
515
516 /*
517 * Check the alignment of an exchange request when the allocation unit size
518 * isn't a power of two. The generic file-level helpers use (fast)
519 * bitmask-based alignment checks, but here we have to use slow long division.
520 */
521 static int
xfs_exchrange_check_rtalign(const struct xfs_exchrange * fxr,struct xfs_inode * ip1,struct xfs_inode * ip2,unsigned int alloc_unit)522 xfs_exchrange_check_rtalign(
523 const struct xfs_exchrange *fxr,
524 struct xfs_inode *ip1,
525 struct xfs_inode *ip2,
526 unsigned int alloc_unit)
527 {
528 uint64_t length = fxr->length;
529 uint64_t blen;
530 loff_t size1, size2;
531
532 size1 = i_size_read(VFS_I(ip1));
533 size2 = i_size_read(VFS_I(ip2));
534
535 /* The start of both ranges must be aligned to a rt extent. */
536 if (!isaligned_64(fxr->file1_offset, alloc_unit) ||
537 !isaligned_64(fxr->file2_offset, alloc_unit))
538 return -EINVAL;
539
540 if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
541 length = max_t(int64_t, size1 - fxr->file1_offset,
542 size2 - fxr->file2_offset);
543
544 /*
545 * If the user wanted us to exchange up to the infile's EOF, round up
546 * to the next rt extent boundary for this check. Do the same for the
547 * outfile.
548 *
549 * Otherwise, reject the range length if it's not rt extent aligned.
550 * We already confirmed the starting offsets' rt extent block
551 * alignment.
552 */
553 if (fxr->file1_offset + length == size1)
554 blen = roundup_64(size1, alloc_unit) - fxr->file1_offset;
555 else if (fxr->file2_offset + length == size2)
556 blen = roundup_64(size2, alloc_unit) - fxr->file2_offset;
557 else if (!isaligned_64(length, alloc_unit))
558 return -EINVAL;
559 else
560 blen = length;
561
562 /* Don't allow overlapped exchanges within the same file. */
563 if (ip1 == ip2 &&
564 fxr->file2_offset + blen > fxr->file1_offset &&
565 fxr->file1_offset + blen > fxr->file2_offset)
566 return -EINVAL;
567
568 /*
569 * Ensure that we don't exchange a partial EOF rt extent into the
570 * middle of another file.
571 */
572 if (isaligned_64(length, alloc_unit))
573 return 0;
574
575 blen = length;
576 if (fxr->file2_offset + length < size2)
577 blen = rounddown_64(blen, alloc_unit);
578
579 if (fxr->file1_offset + blen < size1)
580 blen = rounddown_64(blen, alloc_unit);
581
582 return blen == length ? 0 : -EINVAL;
583 }
584
585 /* Prepare two files to have their data exchanged. */
586 STATIC int
xfs_exchrange_prep(struct xfs_exchrange * fxr,struct xfs_inode * ip1,struct xfs_inode * ip2)587 xfs_exchrange_prep(
588 struct xfs_exchrange *fxr,
589 struct xfs_inode *ip1,
590 struct xfs_inode *ip2)
591 {
592 struct xfs_mount *mp = ip2->i_mount;
593 unsigned int alloc_unit = xfs_inode_alloc_unitsize(ip2);
594 int error;
595
596 trace_xfs_exchrange_prep(fxr, ip1, ip2);
597
598 /* Verify both files are either real-time or non-realtime */
599 if (XFS_IS_REALTIME_INODE(ip1) != XFS_IS_REALTIME_INODE(ip2))
600 return -EINVAL;
601
602 /* Check non-power of two alignment issues, if necessary. */
603 if (!is_power_of_2(alloc_unit)) {
604 error = xfs_exchrange_check_rtalign(fxr, ip1, ip2, alloc_unit);
605 if (error)
606 return error;
607
608 /*
609 * Do the generic file-level checks with the regular block
610 * alignment.
611 */
612 alloc_unit = mp->m_sb.sb_blocksize;
613 }
614
615 error = xfs_exchange_range_prep(fxr, alloc_unit);
616 if (error || fxr->length == 0)
617 return error;
618
619 if (fxr->flags & __XFS_EXCHANGE_RANGE_CHECK_FRESH2) {
620 error = xfs_exchrange_check_freshness(fxr, ip2);
621 if (error)
622 return error;
623 }
624
625 /* Attach dquots to both inodes before changing block maps. */
626 error = xfs_qm_dqattach(ip2);
627 if (error)
628 return error;
629 error = xfs_qm_dqattach(ip1);
630 if (error)
631 return error;
632
633 trace_xfs_exchrange_flush(fxr, ip1, ip2);
634
635 /* Flush the relevant ranges of both files. */
636 error = xfs_flush_unmap_range(ip2, fxr->file2_offset, fxr->length);
637 if (error)
638 return error;
639 error = xfs_flush_unmap_range(ip1, fxr->file1_offset, fxr->length);
640 if (error)
641 return error;
642
643 /*
644 * Cancel CoW fork preallocations for the ranges of both files. The
645 * prep function should have flushed all the dirty data, so the only
646 * CoW mappings remaining should be speculative.
647 */
648 if (xfs_inode_has_cow_data(ip1)) {
649 error = xfs_reflink_cancel_cow_range(ip1, fxr->file1_offset,
650 fxr->length, true);
651 if (error)
652 return error;
653 }
654
655 if (xfs_inode_has_cow_data(ip2)) {
656 error = xfs_reflink_cancel_cow_range(ip2, fxr->file2_offset,
657 fxr->length, true);
658 if (error)
659 return error;
660 }
661
662 return 0;
663 }
664
665 /*
666 * Exchange contents of files. This is the binding between the generic
667 * file-level concepts and the XFS inode-specific implementation.
668 */
669 STATIC int
xfs_exchrange_contents(struct xfs_exchrange * fxr)670 xfs_exchrange_contents(
671 struct xfs_exchrange *fxr)
672 {
673 struct inode *inode1 = file_inode(fxr->file1);
674 struct inode *inode2 = file_inode(fxr->file2);
675 struct xfs_inode *ip1 = XFS_I(inode1);
676 struct xfs_inode *ip2 = XFS_I(inode2);
677 struct xfs_mount *mp = ip1->i_mount;
678 int error;
679
680 if (!xfs_has_exchange_range(mp))
681 return -EOPNOTSUPP;
682
683 if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS |
684 XFS_EXCHANGE_RANGE_PRIV_FLAGS))
685 return -EINVAL;
686
687 if (xfs_is_shutdown(mp))
688 return -EIO;
689
690 /* Lock both files against IO */
691 error = xfs_ilock2_io_mmap(ip1, ip2);
692 if (error)
693 goto out_err;
694
695 /* Prepare and then exchange file contents. */
696 error = xfs_exchrange_prep(fxr, ip1, ip2);
697 if (error)
698 goto out_unlock;
699
700 error = xfs_exchrange_mappings(fxr, ip1, ip2);
701 if (error)
702 goto out_unlock;
703
704 /*
705 * Finish the exchange by removing special file privileges like any
706 * other file write would do. This may involve turning on support for
707 * logged xattrs if either file has security capabilities.
708 */
709 error = xfs_exchange_range_finish(fxr);
710 if (error)
711 goto out_unlock;
712
713 out_unlock:
714 xfs_iunlock2_io_mmap(ip1, ip2);
715 out_err:
716 if (error)
717 trace_xfs_exchrange_error(ip2, error, _RET_IP_);
718 return error;
719 }
720
721 /* Exchange parts of two files. */
722 static int
xfs_exchange_range(struct xfs_exchrange * fxr)723 xfs_exchange_range(
724 struct xfs_exchrange *fxr)
725 {
726 struct inode *inode1 = file_inode(fxr->file1);
727 struct inode *inode2 = file_inode(fxr->file2);
728 loff_t check_len = fxr->length;
729 int ret;
730
731 BUILD_BUG_ON(XFS_EXCHANGE_RANGE_ALL_FLAGS &
732 XFS_EXCHANGE_RANGE_PRIV_FLAGS);
733
734 /* Both files must be on the same mount/filesystem. */
735 if (fxr->file1->f_path.mnt != fxr->file2->f_path.mnt)
736 return -EXDEV;
737
738 if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS |
739 __XFS_EXCHANGE_RANGE_CHECK_FRESH2))
740 return -EINVAL;
741
742 /* Userspace requests only honored for regular files. */
743 if (S_ISDIR(inode1->i_mode) || S_ISDIR(inode2->i_mode))
744 return -EISDIR;
745 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
746 return -EINVAL;
747
748 /* Both files must be opened for read and write. */
749 if (!(fxr->file1->f_mode & FMODE_READ) ||
750 !(fxr->file1->f_mode & FMODE_WRITE) ||
751 !(fxr->file2->f_mode & FMODE_READ) ||
752 !(fxr->file2->f_mode & FMODE_WRITE))
753 return -EBADF;
754
755 /* Neither file can be opened append-only. */
756 if ((fxr->file1->f_flags & O_APPEND) ||
757 (fxr->file2->f_flags & O_APPEND))
758 return -EBADF;
759
760 /*
761 * If we're exchanging to EOF we can't calculate the length until taking
762 * the iolock. Pass a 0 length to remap_verify_area similar to the
763 * FICLONE and FICLONERANGE ioctls that support cloning to EOF as well.
764 */
765 if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
766 check_len = 0;
767 ret = remap_verify_area(fxr->file1, fxr->file1_offset, check_len, true);
768 if (ret)
769 return ret;
770 ret = remap_verify_area(fxr->file2, fxr->file2_offset, check_len, true);
771 if (ret)
772 return ret;
773
774 /* Update cmtime if the fd/inode don't forbid it. */
775 if (!(fxr->file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1))
776 fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME1;
777 if (!(fxr->file2->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode2))
778 fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME2;
779
780 file_start_write(fxr->file2);
781 ret = xfs_exchrange_contents(fxr);
782 file_end_write(fxr->file2);
783 if (ret)
784 return ret;
785
786 fsnotify_modify(fxr->file1);
787 if (fxr->file2 != fxr->file1)
788 fsnotify_modify(fxr->file2);
789 return 0;
790 }
791
792 /* Collect exchange-range arguments from userspace. */
793 long
xfs_ioc_exchange_range(struct file * file,struct xfs_exchange_range __user * argp)794 xfs_ioc_exchange_range(
795 struct file *file,
796 struct xfs_exchange_range __user *argp)
797 {
798 struct xfs_exchrange fxr = {
799 .file2 = file,
800 };
801 struct xfs_exchange_range args;
802
803 if (copy_from_user(&args, argp, sizeof(args)))
804 return -EFAULT;
805 if (memchr_inv(&args.pad, 0, sizeof(args.pad)))
806 return -EINVAL;
807 if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
808 return -EINVAL;
809
810 fxr.file1_offset = args.file1_offset;
811 fxr.file2_offset = args.file2_offset;
812 fxr.length = args.length;
813 fxr.flags = args.flags;
814
815 CLASS(fd, file1)(args.file1_fd);
816 if (fd_empty(file1))
817 return -EBADF;
818 fxr.file1 = fd_file(file1);
819
820 return xfs_exchange_range(&fxr);
821 }
822
823 /* Opaque freshness blob for XFS_IOC_COMMIT_RANGE */
824 struct xfs_commit_range_fresh {
825 xfs_fsid_t fsid; /* m_fixedfsid */
826 __u64 file2_ino; /* inode number */
827 __s64 file2_mtime; /* modification time */
828 __s64 file2_ctime; /* change time */
829 __s32 file2_mtime_nsec; /* mod time, nsec */
830 __s32 file2_ctime_nsec; /* change time, nsec */
831 __u32 file2_gen; /* inode generation */
832 __u32 magic; /* zero */
833 };
834 #define XCR_FRESH_MAGIC 0x444F524B /* DORK */
835
836 /* Set up a commitrange operation by sampling file2's write-related attrs */
837 long
xfs_ioc_start_commit(struct file * file,struct xfs_commit_range __user * argp)838 xfs_ioc_start_commit(
839 struct file *file,
840 struct xfs_commit_range __user *argp)
841 {
842 struct xfs_commit_range args = { };
843 struct kstat kstat = { };
844 struct xfs_commit_range_fresh *kern_f;
845 struct xfs_commit_range_fresh __user *user_f;
846 struct inode *inode2 = file_inode(file);
847 struct xfs_inode *ip2 = XFS_I(inode2);
848 const unsigned int lockflags = XFS_IOLOCK_SHARED |
849 XFS_MMAPLOCK_SHARED |
850 XFS_ILOCK_SHARED;
851
852 BUILD_BUG_ON(sizeof(struct xfs_commit_range_fresh) !=
853 sizeof(args.file2_freshness));
854
855 kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness;
856
857 memcpy(&kern_f->fsid, ip2->i_mount->m_fixedfsid, sizeof(xfs_fsid_t));
858
859 xfs_ilock(ip2, lockflags);
860 /* Force writing of a distinct ctime if any writes happen. */
861 fill_mg_cmtime(&kstat, STATX_CTIME | STATX_MTIME, inode2);
862 kern_f->file2_ctime = kstat.ctime.tv_sec;
863 kern_f->file2_ctime_nsec = kstat.ctime.tv_nsec;
864 kern_f->file2_mtime = kstat.mtime.tv_sec;
865 kern_f->file2_mtime_nsec = kstat.mtime.tv_nsec;
866 kern_f->file2_ino = ip2->i_ino;
867 kern_f->file2_gen = inode2->i_generation;
868 kern_f->magic = XCR_FRESH_MAGIC;
869 xfs_iunlock(ip2, lockflags);
870
871 user_f = (struct xfs_commit_range_fresh __user *)&argp->file2_freshness;
872 if (copy_to_user(user_f, kern_f, sizeof(*kern_f)))
873 return -EFAULT;
874
875 return 0;
876 }
877
878 /*
879 * Exchange file1 and file2 contents if file2 has not been written since the
880 * start commit operation.
881 */
882 long
xfs_ioc_commit_range(struct file * file,struct xfs_commit_range __user * argp)883 xfs_ioc_commit_range(
884 struct file *file,
885 struct xfs_commit_range __user *argp)
886 {
887 struct xfs_exchrange fxr = {
888 .file2 = file,
889 };
890 struct xfs_commit_range args;
891 struct xfs_commit_range_fresh *kern_f;
892 struct xfs_inode *ip2 = XFS_I(file_inode(file));
893 struct xfs_mount *mp = ip2->i_mount;
894
895 kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness;
896
897 if (copy_from_user(&args, argp, sizeof(args)))
898 return -EFAULT;
899 if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
900 return -EINVAL;
901 if (kern_f->magic != XCR_FRESH_MAGIC)
902 return -EBUSY;
903 if (memcmp(&kern_f->fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)))
904 return -EBUSY;
905
906 fxr.file1_offset = args.file1_offset;
907 fxr.file2_offset = args.file2_offset;
908 fxr.length = args.length;
909 fxr.flags = args.flags | __XFS_EXCHANGE_RANGE_CHECK_FRESH2;
910 fxr.file2_ino = kern_f->file2_ino;
911 fxr.file2_gen = kern_f->file2_gen;
912 fxr.file2_mtime.tv_sec = kern_f->file2_mtime;
913 fxr.file2_mtime.tv_nsec = kern_f->file2_mtime_nsec;
914 fxr.file2_ctime.tv_sec = kern_f->file2_ctime;
915 fxr.file2_ctime.tv_nsec = kern_f->file2_ctime_nsec;
916
917 CLASS(fd, file1)(args.file1_fd);
918 if (fd_empty(file1))
919 return -EBADF;
920 fxr.file1 = fd_file(file1);
921
922 return xfs_exchange_range(&fxr);
923 }
924