1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
5 */
6 #include "xfs.h"
7 #include "xfs_shared.h"
8 #include "xfs_format.h"
9 #include "xfs_log_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_defer.h"
13 #include "xfs_inode.h"
14 #include "xfs_trans.h"
15 #include "xfs_quota.h"
16 #include "xfs_bmap_util.h"
17 #include "xfs_reflink.h"
18 #include "xfs_trace.h"
19 #include "xfs_exchrange.h"
20 #include "xfs_exchmaps.h"
21 #include "xfs_sb.h"
22 #include "xfs_icache.h"
23 #include "xfs_log.h"
24 #include "xfs_rtbitmap.h"
25 #include <linux/fsnotify.h>
26
27 /* Lock (and optionally join) two inodes for a file range exchange. */
28 void
xfs_exchrange_ilock(struct xfs_trans * tp,struct xfs_inode * ip1,struct xfs_inode * ip2)29 xfs_exchrange_ilock(
30 struct xfs_trans *tp,
31 struct xfs_inode *ip1,
32 struct xfs_inode *ip2)
33 {
34 if (ip1 != ip2)
35 xfs_lock_two_inodes(ip1, XFS_ILOCK_EXCL,
36 ip2, XFS_ILOCK_EXCL);
37 else
38 xfs_ilock(ip1, XFS_ILOCK_EXCL);
39 if (tp) {
40 xfs_trans_ijoin(tp, ip1, 0);
41 if (ip2 != ip1)
42 xfs_trans_ijoin(tp, ip2, 0);
43 }
44
45 }
46
47 /* Unlock two inodes after a file range exchange operation. */
48 void
xfs_exchrange_iunlock(struct xfs_inode * ip1,struct xfs_inode * ip2)49 xfs_exchrange_iunlock(
50 struct xfs_inode *ip1,
51 struct xfs_inode *ip2)
52 {
53 if (ip2 != ip1)
54 xfs_iunlock(ip2, XFS_ILOCK_EXCL);
55 xfs_iunlock(ip1, XFS_ILOCK_EXCL);
56 }
57
58 /*
59 * Estimate the resource requirements to exchange file contents between the two
60 * files. The caller is required to hold the IOLOCK and the MMAPLOCK and to
61 * have flushed both inodes' pagecache and active direct-ios.
62 */
63 int
xfs_exchrange_estimate(struct xfs_exchmaps_req * req)64 xfs_exchrange_estimate(
65 struct xfs_exchmaps_req *req)
66 {
67 int error;
68
69 xfs_exchrange_ilock(NULL, req->ip1, req->ip2);
70 error = xfs_exchmaps_estimate(req);
71 xfs_exchrange_iunlock(req->ip1, req->ip2);
72 return error;
73 }
74
75 /*
76 * Check that file2's metadata agree with the snapshot that we took for the
77 * range commit request.
78 *
79 * This should be called after the filesystem has locked /all/ inode metadata
80 * against modification.
81 */
82 STATIC int
xfs_exchrange_check_freshness(const struct xfs_exchrange * fxr,struct xfs_inode * ip2)83 xfs_exchrange_check_freshness(
84 const struct xfs_exchrange *fxr,
85 struct xfs_inode *ip2)
86 {
87 struct inode *inode2 = VFS_I(ip2);
88 struct timespec64 ctime = inode_get_ctime(inode2);
89 struct timespec64 mtime = inode_get_mtime(inode2);
90
91 trace_xfs_exchrange_freshness(fxr, ip2);
92
93 /* Check that file2 hasn't otherwise been modified. */
94 if (fxr->file2_ino != ip2->i_ino ||
95 fxr->file2_gen != inode2->i_generation ||
96 !timespec64_equal(&fxr->file2_ctime, &ctime) ||
97 !timespec64_equal(&fxr->file2_mtime, &mtime))
98 return -EBUSY;
99
100 return 0;
101 }
102
/*
 * Bits reported through the qretry mask of xfs_exchrange_reserve_quota() to
 * tell the caller which file's quota reservation failed with EDQUOT or
 * ENOSPC, so that it knows which inode to run block garbage collection on
 * before retrying.
 */
#define QRETRY_IP1	(0x1)	/* reservation for ip1 failed */
#define QRETRY_IP2	(0x2)	/* reservation for ip2 failed */
105
106 /*
107 * Obtain a quota reservation to make sure we don't hit EDQUOT. We can skip
108 * this if quota enforcement is disabled or if both inodes' dquots are the
109 * same. The qretry structure must be initialized to zeroes before the first
110 * call to this function.
111 */
112 STATIC int
xfs_exchrange_reserve_quota(struct xfs_trans * tp,const struct xfs_exchmaps_req * req,unsigned int * qretry)113 xfs_exchrange_reserve_quota(
114 struct xfs_trans *tp,
115 const struct xfs_exchmaps_req *req,
116 unsigned int *qretry)
117 {
118 int64_t ddelta, rdelta;
119 int ip1_error = 0;
120 int error;
121
122 /*
123 * Don't bother with a quota reservation if we're not enforcing them
124 * or the two inodes have the same dquots.
125 */
126 if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 ||
127 (req->ip1->i_udquot == req->ip2->i_udquot &&
128 req->ip1->i_gdquot == req->ip2->i_gdquot &&
129 req->ip1->i_pdquot == req->ip2->i_pdquot))
130 return 0;
131
132 *qretry = 0;
133
134 /*
135 * For each file, compute the net gain in the number of regular blocks
136 * that will be mapped into that file and reserve that much quota. The
137 * quota counts must be able to absorb at least that much space.
138 */
139 ddelta = req->ip2_bcount - req->ip1_bcount;
140 rdelta = req->ip2_rtbcount - req->ip1_rtbcount;
141 if (ddelta > 0 || rdelta > 0) {
142 error = xfs_trans_reserve_quota_nblks(tp, req->ip1,
143 ddelta > 0 ? ddelta : 0,
144 rdelta > 0 ? rdelta : 0,
145 false);
146 if (error == -EDQUOT || error == -ENOSPC) {
147 /*
148 * Save this error and see what happens if we try to
149 * reserve quota for ip2. Then report both.
150 */
151 *qretry |= QRETRY_IP1;
152 ip1_error = error;
153 error = 0;
154 }
155 if (error)
156 return error;
157 }
158 if (ddelta < 0 || rdelta < 0) {
159 error = xfs_trans_reserve_quota_nblks(tp, req->ip2,
160 ddelta < 0 ? -ddelta : 0,
161 rdelta < 0 ? -rdelta : 0,
162 false);
163 if (error == -EDQUOT || error == -ENOSPC)
164 *qretry |= QRETRY_IP2;
165 if (error)
166 return error;
167 }
168 if (ip1_error)
169 return ip1_error;
170
171 /*
172 * For each file, forcibly reserve the gross gain in mapped blocks so
173 * that we don't trip over any quota block reservation assertions.
174 * We must reserve the gross gain because the quota code subtracts from
175 * bcount the number of blocks that we unmap; it does not add that
176 * quantity back to the quota block reservation.
177 */
178 error = xfs_trans_reserve_quota_nblks(tp, req->ip1, req->ip1_bcount,
179 req->ip1_rtbcount, true);
180 if (error)
181 return error;
182
183 return xfs_trans_reserve_quota_nblks(tp, req->ip2, req->ip2_bcount,
184 req->ip2_rtbcount, true);
185 }
186
187 /* Exchange the mappings (and hence the contents) of two files' forks. */
188 STATIC int
xfs_exchrange_mappings(const struct xfs_exchrange * fxr,struct xfs_inode * ip1,struct xfs_inode * ip2)189 xfs_exchrange_mappings(
190 const struct xfs_exchrange *fxr,
191 struct xfs_inode *ip1,
192 struct xfs_inode *ip2)
193 {
194 struct xfs_mount *mp = ip1->i_mount;
195 struct xfs_exchmaps_req req = {
196 .ip1 = ip1,
197 .ip2 = ip2,
198 .startoff1 = XFS_B_TO_FSBT(mp, fxr->file1_offset),
199 .startoff2 = XFS_B_TO_FSBT(mp, fxr->file2_offset),
200 .blockcount = XFS_B_TO_FSB(mp, fxr->length),
201 };
202 struct xfs_trans *tp;
203 unsigned int qretry;
204 bool retried = false;
205 int error;
206
207 trace_xfs_exchrange_mappings(fxr, ip1, ip2);
208
209 if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
210 req.flags |= XFS_EXCHMAPS_SET_SIZES;
211 if (fxr->flags & XFS_EXCHANGE_RANGE_FILE1_WRITTEN)
212 req.flags |= XFS_EXCHMAPS_INO1_WRITTEN;
213
214 /*
215 * Round the request length up to the nearest file allocation unit.
216 * The prep function already checked that the request offsets and
217 * length in @fxr are safe to round up.
218 */
219 if (xfs_inode_has_bigrtalloc(ip2))
220 req.blockcount = xfs_blen_roundup_rtx(mp, req.blockcount);
221
222 error = xfs_exchrange_estimate(&req);
223 if (error)
224 return error;
225
226 retry:
227 /* Allocate the transaction, lock the inodes, and join them. */
228 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, req.resblks, 0,
229 XFS_TRANS_RES_FDBLKS, &tp);
230 if (error)
231 return error;
232
233 xfs_exchrange_ilock(tp, ip1, ip2);
234
235 trace_xfs_exchrange_before(ip2, 2);
236 trace_xfs_exchrange_before(ip1, 1);
237
238 error = xfs_exchmaps_check_forks(mp, &req);
239 if (error)
240 goto out_trans_cancel;
241
242 /*
243 * Reserve ourselves some quota if any of them are in enforcing mode.
244 * In theory we only need enough to satisfy the change in the number
245 * of blocks between the two ranges being remapped.
246 */
247 error = xfs_exchrange_reserve_quota(tp, &req, &qretry);
248 if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
249 xfs_trans_cancel(tp);
250 xfs_exchrange_iunlock(ip1, ip2);
251 if (qretry & QRETRY_IP1)
252 xfs_blockgc_free_quota(ip1, 0);
253 if (qretry & QRETRY_IP2)
254 xfs_blockgc_free_quota(ip2, 0);
255 retried = true;
256 goto retry;
257 }
258 if (error)
259 goto out_trans_cancel;
260
261 /* If we got this far on a dry run, all parameters are ok. */
262 if (fxr->flags & XFS_EXCHANGE_RANGE_DRY_RUN)
263 goto out_trans_cancel;
264
265 /* Update the mtime and ctime of both files. */
266 if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME1)
267 xfs_trans_ichgtime(tp, ip1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
268 if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME2)
269 xfs_trans_ichgtime(tp, ip2, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
270
271 xfs_exchange_mappings(tp, &req);
272
273 /*
274 * Force the log to persist metadata updates if the caller or the
275 * administrator requires this. The generic prep function already
276 * flushed the relevant parts of the page cache.
277 */
278 if (xfs_has_wsync(mp) || (fxr->flags & XFS_EXCHANGE_RANGE_DSYNC))
279 xfs_trans_set_sync(tp);
280
281 error = xfs_trans_commit(tp);
282
283 trace_xfs_exchrange_after(ip2, 2);
284 trace_xfs_exchrange_after(ip1, 1);
285
286 if (error)
287 goto out_unlock;
288
289 /*
290 * If the caller wanted us to exchange the contents of two complete
291 * files of unequal length, exchange the incore sizes now. This should
292 * be safe because we flushed both files' page caches, exchanged all
293 * the mappings, and updated the ondisk sizes.
294 */
295 if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
296 loff_t temp;
297
298 temp = i_size_read(VFS_I(ip2));
299 i_size_write(VFS_I(ip2), i_size_read(VFS_I(ip1)));
300 i_size_write(VFS_I(ip1), temp);
301 }
302
303 out_unlock:
304 xfs_exchrange_iunlock(ip1, ip2);
305 return error;
306
307 out_trans_cancel:
308 xfs_trans_cancel(tp);
309 goto out_unlock;
310 }
311
312 /*
313 * Generic code for exchanging ranges of two files via XFS_IOC_EXCHANGE_RANGE.
314 * This part deals with struct file objects and byte ranges and does not deal
315 * with XFS-specific data structures such as xfs_inodes and block ranges. This
316 * separation may some day facilitate porting to another filesystem.
317 *
318 * The goal is to exchange fxr.length bytes starting at fxr.file1_offset in
319 * file1 with the same number of bytes starting at fxr.file2_offset in file2.
320 * Implementations must call xfs_exchange_range_prep to prepare the two
321 * files prior to taking locks; and they must update the inode change and mod
322 * times of both files as part of the metadata update. The timestamp update
323 * and freshness checks must be done atomically as part of the data exchange
324 * operation to ensure correctness of the freshness check.
325 * xfs_exchange_range_finish must be called after the operation completes
326 * successfully but before locks are dropped.
327 */
328
329 /* Verify that we have security clearance to perform this operation. */
330 static int
xfs_exchange_range_verify_area(struct xfs_exchrange * fxr)331 xfs_exchange_range_verify_area(
332 struct xfs_exchrange *fxr)
333 {
334 int ret;
335
336 ret = remap_verify_area(fxr->file1, fxr->file1_offset, fxr->length,
337 true);
338 if (ret)
339 return ret;
340
341 return remap_verify_area(fxr->file2, fxr->file2_offset, fxr->length,
342 true);
343 }
344
345 /*
346 * Performs necessary checks before doing a range exchange, having stabilized
347 * mutable inode attributes via i_rwsem.
348 */
349 static inline int
xfs_exchange_range_checks(struct xfs_exchrange * fxr,unsigned int alloc_unit)350 xfs_exchange_range_checks(
351 struct xfs_exchrange *fxr,
352 unsigned int alloc_unit)
353 {
354 struct inode *inode1 = file_inode(fxr->file1);
355 struct inode *inode2 = file_inode(fxr->file2);
356 uint64_t allocmask = alloc_unit - 1;
357 int64_t test_len;
358 uint64_t blen;
359 loff_t size1, size2, tmp;
360 int error;
361
362 /* Don't touch certain kinds of inodes */
363 if (IS_IMMUTABLE(inode1) || IS_IMMUTABLE(inode2))
364 return -EPERM;
365 if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2))
366 return -ETXTBSY;
367
368 size1 = i_size_read(inode1);
369 size2 = i_size_read(inode2);
370
371 /* Ranges cannot start after EOF. */
372 if (fxr->file1_offset > size1 || fxr->file2_offset > size2)
373 return -EINVAL;
374
375 /*
376 * If the caller said to exchange to EOF, we set the length of the
377 * request large enough to cover everything to the end of both files.
378 */
379 if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
380 fxr->length = max_t(int64_t, size1 - fxr->file1_offset,
381 size2 - fxr->file2_offset);
382
383 error = xfs_exchange_range_verify_area(fxr);
384 if (error)
385 return error;
386 }
387
388 /*
389 * The start of both ranges must be aligned to the file allocation
390 * unit.
391 */
392 if (!IS_ALIGNED(fxr->file1_offset, alloc_unit) ||
393 !IS_ALIGNED(fxr->file2_offset, alloc_unit))
394 return -EINVAL;
395
396 /* Ensure offsets don't wrap. */
397 if (check_add_overflow(fxr->file1_offset, fxr->length, &tmp) ||
398 check_add_overflow(fxr->file2_offset, fxr->length, &tmp))
399 return -EINVAL;
400
401 /*
402 * We require both ranges to end within EOF, unless we're exchanging
403 * to EOF.
404 */
405 if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) &&
406 (fxr->file1_offset + fxr->length > size1 ||
407 fxr->file2_offset + fxr->length > size2))
408 return -EINVAL;
409
410 /*
411 * Make sure we don't hit any file size limits. If we hit any size
412 * limits such that test_length was adjusted, we abort the whole
413 * operation.
414 */
415 test_len = fxr->length;
416 error = generic_write_check_limits(fxr->file2, fxr->file2_offset,
417 &test_len);
418 if (error)
419 return error;
420 error = generic_write_check_limits(fxr->file1, fxr->file1_offset,
421 &test_len);
422 if (error)
423 return error;
424 if (test_len != fxr->length)
425 return -EINVAL;
426
427 /*
428 * If the user wanted us to exchange up to the infile's EOF, round up
429 * to the next allocation unit boundary for this check. Do the same
430 * for the outfile.
431 *
432 * Otherwise, reject the range length if it's not aligned to an
433 * allocation unit.
434 */
435 if (fxr->file1_offset + fxr->length == size1)
436 blen = ALIGN(size1, alloc_unit) - fxr->file1_offset;
437 else if (fxr->file2_offset + fxr->length == size2)
438 blen = ALIGN(size2, alloc_unit) - fxr->file2_offset;
439 else if (!IS_ALIGNED(fxr->length, alloc_unit))
440 return -EINVAL;
441 else
442 blen = fxr->length;
443
444 /* Don't allow overlapped exchanges within the same file. */
445 if (inode1 == inode2 &&
446 fxr->file2_offset + blen > fxr->file1_offset &&
447 fxr->file1_offset + blen > fxr->file2_offset)
448 return -EINVAL;
449
450 /*
451 * Ensure that we don't exchange a partial EOF block into the middle of
452 * another file.
453 */
454 if ((fxr->length & allocmask) == 0)
455 return 0;
456
457 blen = fxr->length;
458 if (fxr->file2_offset + blen < size2)
459 blen &= ~allocmask;
460
461 if (fxr->file1_offset + blen < size1)
462 blen &= ~allocmask;
463
464 return blen == fxr->length ? 0 : -EINVAL;
465 }
466
467 /*
468 * Check that the two inodes are eligible for range exchanges, the ranges make
469 * sense, and then flush all dirty data. Caller must ensure that the inodes
470 * have been locked against any other modifications.
471 */
472 static inline int
xfs_exchange_range_prep(struct xfs_exchrange * fxr,unsigned int alloc_unit)473 xfs_exchange_range_prep(
474 struct xfs_exchrange *fxr,
475 unsigned int alloc_unit)
476 {
477 struct inode *inode1 = file_inode(fxr->file1);
478 struct inode *inode2 = file_inode(fxr->file2);
479 bool same_inode = (inode1 == inode2);
480 int error;
481
482 /* Check that we don't violate system file offset limits. */
483 error = xfs_exchange_range_checks(fxr, alloc_unit);
484 if (error || fxr->length == 0)
485 return error;
486
487 /* Wait for the completion of any pending IOs on both files */
488 inode_dio_wait(inode1);
489 if (!same_inode)
490 inode_dio_wait(inode2);
491
492 error = filemap_write_and_wait_range(inode1->i_mapping,
493 fxr->file1_offset,
494 fxr->file1_offset + fxr->length - 1);
495 if (error)
496 return error;
497
498 error = filemap_write_and_wait_range(inode2->i_mapping,
499 fxr->file2_offset,
500 fxr->file2_offset + fxr->length - 1);
501 if (error)
502 return error;
503
504 /*
505 * If the files or inodes involved require synchronous writes, amend
506 * the request to force the filesystem to flush all data and metadata
507 * to disk after the operation completes.
508 */
509 if (((fxr->file1->f_flags | fxr->file2->f_flags) & O_SYNC) ||
510 IS_SYNC(inode1) || IS_SYNC(inode2))
511 fxr->flags |= XFS_EXCHANGE_RANGE_DSYNC;
512
513 return 0;
514 }
515
516 /*
517 * Finish a range exchange operation, if it was successful. Caller must ensure
518 * that the inodes are still locked against any other modifications.
519 */
520 static inline int
xfs_exchange_range_finish(struct xfs_exchrange * fxr)521 xfs_exchange_range_finish(
522 struct xfs_exchrange *fxr)
523 {
524 int error;
525
526 error = file_remove_privs(fxr->file1);
527 if (error)
528 return error;
529 if (file_inode(fxr->file1) == file_inode(fxr->file2))
530 return 0;
531
532 return file_remove_privs(fxr->file2);
533 }
534
535 /*
536 * Check the alignment of an exchange request when the allocation unit size
537 * isn't a power of two. The generic file-level helpers use (fast)
538 * bitmask-based alignment checks, but here we have to use slow long division.
539 */
540 static int
xfs_exchrange_check_rtalign(const struct xfs_exchrange * fxr,struct xfs_inode * ip1,struct xfs_inode * ip2,unsigned int alloc_unit)541 xfs_exchrange_check_rtalign(
542 const struct xfs_exchrange *fxr,
543 struct xfs_inode *ip1,
544 struct xfs_inode *ip2,
545 unsigned int alloc_unit)
546 {
547 uint64_t length = fxr->length;
548 uint64_t blen;
549 loff_t size1, size2;
550
551 size1 = i_size_read(VFS_I(ip1));
552 size2 = i_size_read(VFS_I(ip2));
553
554 /* The start of both ranges must be aligned to a rt extent. */
555 if (!isaligned_64(fxr->file1_offset, alloc_unit) ||
556 !isaligned_64(fxr->file2_offset, alloc_unit))
557 return -EINVAL;
558
559 if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
560 length = max_t(int64_t, size1 - fxr->file1_offset,
561 size2 - fxr->file2_offset);
562
563 /*
564 * If the user wanted us to exchange up to the infile's EOF, round up
565 * to the next rt extent boundary for this check. Do the same for the
566 * outfile.
567 *
568 * Otherwise, reject the range length if it's not rt extent aligned.
569 * We already confirmed the starting offsets' rt extent block
570 * alignment.
571 */
572 if (fxr->file1_offset + length == size1)
573 blen = roundup_64(size1, alloc_unit) - fxr->file1_offset;
574 else if (fxr->file2_offset + length == size2)
575 blen = roundup_64(size2, alloc_unit) - fxr->file2_offset;
576 else if (!isaligned_64(length, alloc_unit))
577 return -EINVAL;
578 else
579 blen = length;
580
581 /* Don't allow overlapped exchanges within the same file. */
582 if (ip1 == ip2 &&
583 fxr->file2_offset + blen > fxr->file1_offset &&
584 fxr->file1_offset + blen > fxr->file2_offset)
585 return -EINVAL;
586
587 /*
588 * Ensure that we don't exchange a partial EOF rt extent into the
589 * middle of another file.
590 */
591 if (isaligned_64(length, alloc_unit))
592 return 0;
593
594 blen = length;
595 if (fxr->file2_offset + length < size2)
596 blen = rounddown_64(blen, alloc_unit);
597
598 if (fxr->file1_offset + blen < size1)
599 blen = rounddown_64(blen, alloc_unit);
600
601 return blen == length ? 0 : -EINVAL;
602 }
603
604 /* Prepare two files to have their data exchanged. */
605 STATIC int
xfs_exchrange_prep(struct xfs_exchrange * fxr,struct xfs_inode * ip1,struct xfs_inode * ip2)606 xfs_exchrange_prep(
607 struct xfs_exchrange *fxr,
608 struct xfs_inode *ip1,
609 struct xfs_inode *ip2)
610 {
611 struct xfs_mount *mp = ip2->i_mount;
612 unsigned int alloc_unit = xfs_inode_alloc_unitsize(ip2);
613 int error;
614
615 trace_xfs_exchrange_prep(fxr, ip1, ip2);
616
617 /* Verify both files are either real-time or non-realtime */
618 if (XFS_IS_REALTIME_INODE(ip1) != XFS_IS_REALTIME_INODE(ip2))
619 return -EINVAL;
620
621 /* Check non-power of two alignment issues, if necessary. */
622 if (!is_power_of_2(alloc_unit)) {
623 error = xfs_exchrange_check_rtalign(fxr, ip1, ip2, alloc_unit);
624 if (error)
625 return error;
626
627 /*
628 * Do the generic file-level checks with the regular block
629 * alignment.
630 */
631 alloc_unit = mp->m_sb.sb_blocksize;
632 }
633
634 error = xfs_exchange_range_prep(fxr, alloc_unit);
635 if (error || fxr->length == 0)
636 return error;
637
638 if (fxr->flags & __XFS_EXCHANGE_RANGE_CHECK_FRESH2) {
639 error = xfs_exchrange_check_freshness(fxr, ip2);
640 if (error)
641 return error;
642 }
643
644 /* Attach dquots to both inodes before changing block maps. */
645 error = xfs_qm_dqattach(ip2);
646 if (error)
647 return error;
648 error = xfs_qm_dqattach(ip1);
649 if (error)
650 return error;
651
652 trace_xfs_exchrange_flush(fxr, ip1, ip2);
653
654 /* Flush the relevant ranges of both files. */
655 error = xfs_flush_unmap_range(ip2, fxr->file2_offset, fxr->length);
656 if (error)
657 return error;
658 error = xfs_flush_unmap_range(ip1, fxr->file1_offset, fxr->length);
659 if (error)
660 return error;
661
662 /*
663 * Cancel CoW fork preallocations for the ranges of both files. The
664 * prep function should have flushed all the dirty data, so the only
665 * CoW mappings remaining should be speculative.
666 */
667 if (xfs_inode_has_cow_data(ip1)) {
668 error = xfs_reflink_cancel_cow_range(ip1, fxr->file1_offset,
669 fxr->length, true);
670 if (error)
671 return error;
672 }
673
674 if (xfs_inode_has_cow_data(ip2)) {
675 error = xfs_reflink_cancel_cow_range(ip2, fxr->file2_offset,
676 fxr->length, true);
677 if (error)
678 return error;
679 }
680
681 return 0;
682 }
683
684 /*
685 * Exchange contents of files. This is the binding between the generic
686 * file-level concepts and the XFS inode-specific implementation.
687 */
688 STATIC int
xfs_exchrange_contents(struct xfs_exchrange * fxr)689 xfs_exchrange_contents(
690 struct xfs_exchrange *fxr)
691 {
692 struct inode *inode1 = file_inode(fxr->file1);
693 struct inode *inode2 = file_inode(fxr->file2);
694 struct xfs_inode *ip1 = XFS_I(inode1);
695 struct xfs_inode *ip2 = XFS_I(inode2);
696 struct xfs_mount *mp = ip1->i_mount;
697 int error;
698
699 if (!xfs_has_exchange_range(mp))
700 return -EOPNOTSUPP;
701
702 if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS |
703 XFS_EXCHANGE_RANGE_PRIV_FLAGS))
704 return -EINVAL;
705
706 if (xfs_is_shutdown(mp))
707 return -EIO;
708
709 /* Lock both files against IO */
710 error = xfs_ilock2_io_mmap(ip1, ip2);
711 if (error)
712 goto out_err;
713
714 /* Prepare and then exchange file contents. */
715 error = xfs_exchrange_prep(fxr, ip1, ip2);
716 if (error)
717 goto out_unlock;
718
719 error = xfs_exchrange_mappings(fxr, ip1, ip2);
720 if (error)
721 goto out_unlock;
722
723 /*
724 * Finish the exchange by removing special file privileges like any
725 * other file write would do. This may involve turning on support for
726 * logged xattrs if either file has security capabilities.
727 */
728 error = xfs_exchange_range_finish(fxr);
729 if (error)
730 goto out_unlock;
731
732 out_unlock:
733 xfs_iunlock2_io_mmap(ip1, ip2);
734 out_err:
735 if (error)
736 trace_xfs_exchrange_error(ip2, error, _RET_IP_);
737 return error;
738 }
739
740 /* Exchange parts of two files. */
741 static int
xfs_exchange_range(struct xfs_exchrange * fxr)742 xfs_exchange_range(
743 struct xfs_exchrange *fxr)
744 {
745 struct inode *inode1 = file_inode(fxr->file1);
746 struct inode *inode2 = file_inode(fxr->file2);
747 int ret;
748
749 BUILD_BUG_ON(XFS_EXCHANGE_RANGE_ALL_FLAGS &
750 XFS_EXCHANGE_RANGE_PRIV_FLAGS);
751
752 /* Both files must be on the same mount/filesystem. */
753 if (fxr->file1->f_path.mnt != fxr->file2->f_path.mnt)
754 return -EXDEV;
755
756 if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS |
757 __XFS_EXCHANGE_RANGE_CHECK_FRESH2))
758 return -EINVAL;
759
760 /* Userspace requests only honored for regular files. */
761 if (S_ISDIR(inode1->i_mode) || S_ISDIR(inode2->i_mode))
762 return -EISDIR;
763 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
764 return -EINVAL;
765
766 /* Both files must be opened for read and write. */
767 if (!(fxr->file1->f_mode & FMODE_READ) ||
768 !(fxr->file1->f_mode & FMODE_WRITE) ||
769 !(fxr->file2->f_mode & FMODE_READ) ||
770 !(fxr->file2->f_mode & FMODE_WRITE))
771 return -EBADF;
772
773 /* Neither file can be opened append-only. */
774 if ((fxr->file1->f_flags & O_APPEND) ||
775 (fxr->file2->f_flags & O_APPEND))
776 return -EBADF;
777
778 /*
779 * If we're not exchanging to EOF, we can check the areas before
780 * stabilizing both files' i_size.
781 */
782 if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)) {
783 ret = xfs_exchange_range_verify_area(fxr);
784 if (ret)
785 return ret;
786 }
787
788 /* Update cmtime if the fd/inode don't forbid it. */
789 if (!(fxr->file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1))
790 fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME1;
791 if (!(fxr->file2->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode2))
792 fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME2;
793
794 file_start_write(fxr->file2);
795 ret = xfs_exchrange_contents(fxr);
796 file_end_write(fxr->file2);
797 if (ret)
798 return ret;
799
800 fsnotify_modify(fxr->file1);
801 if (fxr->file2 != fxr->file1)
802 fsnotify_modify(fxr->file2);
803 return 0;
804 }
805
806 /* Collect exchange-range arguments from userspace. */
807 long
xfs_ioc_exchange_range(struct file * file,struct xfs_exchange_range __user * argp)808 xfs_ioc_exchange_range(
809 struct file *file,
810 struct xfs_exchange_range __user *argp)
811 {
812 struct xfs_exchrange fxr = {
813 .file2 = file,
814 };
815 struct xfs_exchange_range args;
816
817 if (copy_from_user(&args, argp, sizeof(args)))
818 return -EFAULT;
819 if (memchr_inv(&args.pad, 0, sizeof(args.pad)))
820 return -EINVAL;
821 if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
822 return -EINVAL;
823
824 fxr.file1_offset = args.file1_offset;
825 fxr.file2_offset = args.file2_offset;
826 fxr.length = args.length;
827 fxr.flags = args.flags;
828
829 CLASS(fd, file1)(args.file1_fd);
830 if (fd_empty(file1))
831 return -EBADF;
832 fxr.file1 = fd_file(file1);
833
834 return xfs_exchange_range(&fxr);
835 }
836
/*
 * Opaque freshness blob for XFS_IOC_COMMIT_RANGE.
 *
 * xfs_ioc_start_commit() fills this in from file2 and copies it out to the
 * file2_freshness area of struct xfs_commit_range; xfs_ioc_commit_range()
 * reads it back from the same area and validates it.  The two must therefore
 * be exactly the same size, which xfs_ioc_start_commit() enforces with a
 * BUILD_BUG_ON().
 */
struct xfs_commit_range_fresh {
	xfs_fsid_t	fsid;		/* m_fixedfsid */
	__u64		file2_ino;	/* inode number */
	__s64		file2_mtime;	/* modification time */
	__s64		file2_ctime;	/* change time */
	__s32		file2_mtime_nsec; /* mod time, nsec */
	__s32		file2_ctime_nsec; /* change time, nsec */
	__u32		file2_gen;	/* inode generation */
	__u32		magic;		/* XCR_FRESH_MAGIC */
};
/* Value stored in and required of the magic field above. */
#define XCR_FRESH_MAGIC	0x444F524B	/* DORK */
849
850 /* Set up a commitrange operation by sampling file2's write-related attrs */
851 long
xfs_ioc_start_commit(struct file * file,struct xfs_commit_range __user * argp)852 xfs_ioc_start_commit(
853 struct file *file,
854 struct xfs_commit_range __user *argp)
855 {
856 struct xfs_commit_range args = { };
857 struct kstat kstat = { };
858 struct xfs_commit_range_fresh *kern_f;
859 struct xfs_commit_range_fresh __user *user_f;
860 struct inode *inode2 = file_inode(file);
861 struct xfs_inode *ip2 = XFS_I(inode2);
862 const unsigned int lockflags = XFS_IOLOCK_SHARED |
863 XFS_MMAPLOCK_SHARED |
864 XFS_ILOCK_SHARED;
865
866 BUILD_BUG_ON(sizeof(struct xfs_commit_range_fresh) !=
867 sizeof(args.file2_freshness));
868
869 kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness;
870
871 memcpy(&kern_f->fsid, ip2->i_mount->m_fixedfsid, sizeof(xfs_fsid_t));
872
873 xfs_ilock(ip2, lockflags);
874 /* Force writing of a distinct ctime if any writes happen. */
875 fill_mg_cmtime(&kstat, STATX_CTIME | STATX_MTIME, inode2);
876 kern_f->file2_ctime = kstat.ctime.tv_sec;
877 kern_f->file2_ctime_nsec = kstat.ctime.tv_nsec;
878 kern_f->file2_mtime = kstat.mtime.tv_sec;
879 kern_f->file2_mtime_nsec = kstat.mtime.tv_nsec;
880 kern_f->file2_ino = ip2->i_ino;
881 kern_f->file2_gen = inode2->i_generation;
882 kern_f->magic = XCR_FRESH_MAGIC;
883 xfs_iunlock(ip2, lockflags);
884
885 user_f = (struct xfs_commit_range_fresh __user *)&argp->file2_freshness;
886 if (copy_to_user(user_f, kern_f, sizeof(*kern_f)))
887 return -EFAULT;
888
889 return 0;
890 }
891
892 /*
893 * Exchange file1 and file2 contents if file2 has not been written since the
894 * start commit operation.
895 */
896 long
xfs_ioc_commit_range(struct file * file,struct xfs_commit_range __user * argp)897 xfs_ioc_commit_range(
898 struct file *file,
899 struct xfs_commit_range __user *argp)
900 {
901 struct xfs_exchrange fxr = {
902 .file2 = file,
903 };
904 struct xfs_commit_range args;
905 struct xfs_commit_range_fresh *kern_f;
906 struct xfs_inode *ip2 = XFS_I(file_inode(file));
907 struct xfs_mount *mp = ip2->i_mount;
908
909 kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness;
910
911 if (copy_from_user(&args, argp, sizeof(args)))
912 return -EFAULT;
913 if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
914 return -EINVAL;
915 if (kern_f->magic != XCR_FRESH_MAGIC)
916 return -EBUSY;
917 if (memcmp(&kern_f->fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)))
918 return -EBUSY;
919
920 fxr.file1_offset = args.file1_offset;
921 fxr.file2_offset = args.file2_offset;
922 fxr.length = args.length;
923 fxr.flags = args.flags | __XFS_EXCHANGE_RANGE_CHECK_FRESH2;
924 fxr.file2_ino = kern_f->file2_ino;
925 fxr.file2_gen = kern_f->file2_gen;
926 fxr.file2_mtime.tv_sec = kern_f->file2_mtime;
927 fxr.file2_mtime.tv_nsec = kern_f->file2_mtime_nsec;
928 fxr.file2_ctime.tv_sec = kern_f->file2_ctime;
929 fxr.file2_ctime.tv_nsec = kern_f->file2_ctime_nsec;
930
931 CLASS(fd, file1)(args.file1_fd);
932 if (fd_empty(file1))
933 return -EBADF;
934 fxr.file1 = fd_file(file1);
935
936 return xfs_exchange_range(&fxr);
937 }
938