1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4 * All Rights Reserved.
5 */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_mount.h"
13 #include "xfs_inode.h"
14 #include "xfs_trans.h"
15 #include "xfs_inode_item.h"
16 #include "xfs_bmap.h"
17 #include "xfs_bmap_util.h"
18 #include "xfs_dir2.h"
19 #include "xfs_dir2_priv.h"
20 #include "xfs_ioctl.h"
21 #include "xfs_trace.h"
22 #include "xfs_log.h"
23 #include "xfs_icache.h"
24 #include "xfs_pnfs.h"
25 #include "xfs_iomap.h"
26 #include "xfs_reflink.h"
27 #include "xfs_file.h"
28 #include "xfs_aops.h"
29 #include "xfs_zone_alloc.h"
30
31 #include <linux/dax.h>
32 #include <linux/falloc.h>
33 #include <linux/backing-dev.h>
34 #include <linux/mman.h>
35 #include <linux/fadvise.h>
36 #include <linux/mount.h>
37
38 static const struct vm_operations_struct xfs_file_vm_ops;
39
40 /*
41 * Decide if the given file range is aligned to the size of the fundamental
42 * allocation unit for the file.
43 */
44 bool
45 xfs_is_falloc_aligned(
46 struct xfs_inode *ip,
47 loff_t pos,
48 long long int len)
49 {
50 unsigned int alloc_unit = xfs_inode_alloc_unitsize(ip);
51
52 if (!is_power_of_2(alloc_unit))
53 return isaligned_64(pos, alloc_unit) &&
54 isaligned_64(len, alloc_unit);
55
56 return !((pos | len) & (alloc_unit - 1));
57 }
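/*
 * Illustrative sketch (not upstream code): for the common power-of-two case
 * the bitwise test above is equivalent to checking both values modulo the
 * allocation unit.  Assuming a hypothetical 4096 byte allocation unit:
 *
 *	pos = 8192, len = 4096  ->  (8192 | 4096) & 4095 == 0  -> aligned
 *	pos = 8192, len = 6144  ->  (8192 | 6144) & 4095 != 0  -> not aligned
 *
 * The isaligned_64() branch covers non-power-of-two allocation units (e.g.
 * realtime extent sizes), where simple masking cannot be used.
 */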
58
59 /*
60 * Fsync operations on directories are much simpler than on regular files,
61 * as there is no file data to flush, and thus also no need for explicit
62 * cache flush operations, and there are no non-transaction metadata updates
63 * on directories either.
64 */
65 STATIC int
66 xfs_dir_fsync(
67 struct file *file,
68 loff_t start,
69 loff_t end,
70 int datasync)
71 {
72 struct xfs_inode *ip = XFS_I(file->f_mapping->host);
73
74 trace_xfs_dir_fsync(ip);
75 return xfs_log_force_inode(ip);
76 }
77
78 /*
79 * All metadata updates are logged, which means that we just have to push the
80 * journal to the required sequence number that holds the updates. We track
81 * datasync commits separately to full sync commits, and hence only need to
82 * select the correct sequence number for the log force here.
83 *
84 * We don't have to serialise against concurrent modifications, as we do not
85 * have to wait for modifications that have not yet completed. We define a
86 * transaction commit as completing when the commit sequence number is updated,
87 * hence if the sequence number has not been updated, the sync operation was
88 * run before the commit completed and we don't have to wait for it.
89 *
90 * If we have concurrent fsync/fdatasync() calls, the sequence numbers remain
91 * set on the log item until - at least - the journal flush completes. In
92 * reality, they are only cleared when the inode is fully unpinned (i.e.
93 * persistent in the journal and not dirty in the CIL), and so we rely on
94 * xfs_log_force_seq() either skipping sequences that have been persisted or
95 * waiting on sequences that are still in flight to correctly order concurrent
96 * sync operations.
97 */
98 static int
99 xfs_fsync_flush_log(
100 struct xfs_inode *ip,
101 bool datasync,
102 int *log_flushed)
103 {
104 struct xfs_inode_log_item *iip = ip->i_itemp;
105 xfs_csn_t seq = 0;
106
107 spin_lock(&iip->ili_lock);
108 if (datasync)
109 seq = iip->ili_datasync_seq;
110 else
111 seq = iip->ili_commit_seq;
112 spin_unlock(&iip->ili_lock);
113
114 if (!seq)
115 return 0;
116
117 return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
118 log_flushed);
119 }
120
121 STATIC int
122 xfs_file_fsync(
123 struct file *file,
124 loff_t start,
125 loff_t end,
126 int datasync)
127 {
128 struct xfs_inode *ip = XFS_I(file->f_mapping->host);
129 struct xfs_mount *mp = ip->i_mount;
130 int error, err2;
131 int log_flushed = 0;
132
133 trace_xfs_file_fsync(ip);
134
135 error = file_write_and_wait_range(file, start, end);
136 if (error)
137 return error;
138
139 if (xfs_is_shutdown(mp))
140 return -EIO;
141
142 xfs_iflags_clear(ip, XFS_ITRUNCATED);
143
144 /*
145 * If we have an RT and/or log subvolume we need to make sure to flush
146 * the write cache of the device used for file data first. This is to
147 * ensure newly written file data makes it to disk before logging the new
148 * inode size in case of an extending write.
149 */
150 if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp)
151 error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
152 else if (mp->m_logdev_targp != mp->m_ddev_targp)
153 error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
154
155 /*
156 * If the inode has an inode log item attached, it may need the journal
157 * flushed to persist any changes the log item might be tracking.
158 */
159 if (ip->i_itemp) {
160 err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
161 if (err2 && !error)
162 error = err2;
163 }
164
165 /*
166 * If we only have a single device, and the log force above was
167 * a no-op, we might have to flush the data device cache here.
168 * This can only happen for fdatasync/O_DSYNC if we were overwriting
169 * an already allocated file and thus do not have any metadata to
170 * commit.
171 */
172 if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
173 mp->m_logdev_targp == mp->m_ddev_targp) {
174 err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
175 if (err2 && !error)
176 error = err2;
177 }
178
179 return error;
180 }
181
182 static int
183 xfs_ilock_iocb(
184 struct kiocb *iocb,
185 unsigned int lock_mode)
186 {
187 struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
188
189 if (iocb->ki_flags & IOCB_NOWAIT) {
190 if (!xfs_ilock_nowait(ip, lock_mode))
191 return -EAGAIN;
192 } else {
193 xfs_ilock(ip, lock_mode);
194 }
195
196 return 0;
197 }
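/*
 * Hedged usage note (not from the upstream sources): IOCB_NOWAIT is set for
 * non-blocking submissions, e.g. preadv2()/pwritev2() with RWF_NOWAIT or
 * io_uring's initial non-blocking issue attempt.  In that case the trylock
 * above fails with -EAGAIN when the iolock is contended, so the caller can
 * retry the I/O from a context that is allowed to sleep.
 */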
198
199 static int
200 xfs_ilock_iocb_for_write(
201 struct kiocb *iocb,
202 unsigned int *lock_mode)
203 {
204 ssize_t ret;
205 struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
206
207 ret = xfs_ilock_iocb(iocb, *lock_mode);
208 if (ret)
209 return ret;
210
211 /*
212 * If a reflink remap is in progress we always need to take the iolock
213 * exclusively to wait for it to finish.
214 */
215 if (*lock_mode == XFS_IOLOCK_SHARED &&
216 xfs_iflags_test(ip, XFS_IREMAPPING)) {
217 xfs_iunlock(ip, *lock_mode);
218 *lock_mode = XFS_IOLOCK_EXCL;
219 return xfs_ilock_iocb(iocb, *lock_mode);
220 }
221
222 return 0;
223 }
224
225 STATIC ssize_t
226 xfs_file_dio_read(
227 struct kiocb *iocb,
228 struct iov_iter *to)
229 {
230 struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
231 ssize_t ret;
232
233 trace_xfs_file_direct_read(iocb, to);
234
235 if (!iov_iter_count(to))
236 return 0; /* skip atime */
237
238 file_accessed(iocb->ki_filp);
239
240 ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
241 if (ret)
242 return ret;
243 ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
244 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
245
246 return ret;
247 }
248
249 static noinline ssize_t
250 xfs_file_dax_read(
251 struct kiocb *iocb,
252 struct iov_iter *to)
253 {
254 struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
255 ssize_t ret = 0;
256
257 trace_xfs_file_dax_read(iocb, to);
258
259 if (!iov_iter_count(to))
260 return 0; /* skip atime */
261
262 ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
263 if (ret)
264 return ret;
265 ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
266 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
267
268 file_accessed(iocb->ki_filp);
269 return ret;
270 }
271
272 STATIC ssize_t
273 xfs_file_buffered_read(
274 struct kiocb *iocb,
275 struct iov_iter *to)
276 {
277 struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
278 ssize_t ret;
279
280 trace_xfs_file_buffered_read(iocb, to);
281
282 ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
283 if (ret)
284 return ret;
285 ret = generic_file_read_iter(iocb, to);
286 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
287
288 return ret;
289 }
290
291 STATIC ssize_t
292 xfs_file_read_iter(
293 struct kiocb *iocb,
294 struct iov_iter *to)
295 {
296 struct inode *inode = file_inode(iocb->ki_filp);
297 struct xfs_mount *mp = XFS_I(inode)->i_mount;
298 ssize_t ret = 0;
299
300 XFS_STATS_INC(mp, xs_read_calls);
301
302 if (xfs_is_shutdown(mp))
303 return -EIO;
304
305 if (IS_DAX(inode))
306 ret = xfs_file_dax_read(iocb, to);
307 else if (iocb->ki_flags & IOCB_DIRECT)
308 ret = xfs_file_dio_read(iocb, to);
309 else
310 ret = xfs_file_buffered_read(iocb, to);
311
312 if (ret > 0)
313 XFS_STATS_ADD(mp, xs_read_bytes, ret);
314 return ret;
315 }
316
317 STATIC ssize_t
318 xfs_file_splice_read(
319 struct file *in,
320 loff_t *ppos,
321 struct pipe_inode_info *pipe,
322 size_t len,
323 unsigned int flags)
324 {
325 struct inode *inode = file_inode(in);
326 struct xfs_inode *ip = XFS_I(inode);
327 struct xfs_mount *mp = ip->i_mount;
328 ssize_t ret = 0;
329
330 XFS_STATS_INC(mp, xs_read_calls);
331
332 if (xfs_is_shutdown(mp))
333 return -EIO;
334
335 trace_xfs_file_splice_read(ip, *ppos, len);
336
337 xfs_ilock(ip, XFS_IOLOCK_SHARED);
338 ret = filemap_splice_read(in, ppos, pipe, len, flags);
339 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
340 if (ret > 0)
341 XFS_STATS_ADD(mp, xs_read_bytes, ret);
342 return ret;
343 }
344
345 /*
346 * Take care of zeroing post-EOF blocks when they might exist.
347 *
348 * Returns 0 on success, a negative error on failure, or 1 if this
349 * function dropped the iolock and reacquired it exclusively and the caller
350 * needs to restart the write sanity checks.
351 */
352 static ssize_t
353 xfs_file_write_zero_eof(
354 struct kiocb *iocb,
355 struct iov_iter *from,
356 unsigned int *iolock,
357 size_t count,
358 bool *drained_dio,
359 struct xfs_zone_alloc_ctx *ac)
360 {
361 struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
362 loff_t isize;
363 int error;
364
365 /*
366 * We need to serialise against EOF updates that occur in IO completions
367 * here. We want to make sure that nobody is changing the size while
368 * we do this check until we have placed an IO barrier (i.e. hold
369 * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The
370 * spinlock effectively forms a memory barrier once we have
371 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
372 * hence be able to correctly determine if we need to run zeroing.
373 */
374 spin_lock(&ip->i_flags_lock);
375 isize = i_size_read(VFS_I(ip));
376 if (iocb->ki_pos <= isize) {
377 spin_unlock(&ip->i_flags_lock);
378 return 0;
379 }
380 spin_unlock(&ip->i_flags_lock);
381
382 if (iocb->ki_flags & IOCB_NOWAIT)
383 return -EAGAIN;
384
385 if (!*drained_dio) {
386 /*
387 * If zeroing is needed and we are currently holding the iolock
388 * shared, we need to upgrade it to exclusive, which means we have
389 * to redo all of the checks done so far.
390 */
391 if (*iolock == XFS_IOLOCK_SHARED) {
392 xfs_iunlock(ip, *iolock);
393 *iolock = XFS_IOLOCK_EXCL;
394 xfs_ilock(ip, *iolock);
395 iov_iter_reexpand(from, count);
396 }
397
398 /*
399 * We now have an IO submission barrier in place, but AIO can do
400 * EOF updates during IO completion and hence we now need to
401 * wait for all of them to drain. Non-AIO DIO will have drained
402 * before we are given the XFS_IOLOCK_EXCL, and so for most
403 * cases this wait is a no-op.
404 */
405 inode_dio_wait(VFS_I(ip));
406 *drained_dio = true;
407 return 1;
408 }
409
410 trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
411
412 xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
413 error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL);
414 xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
415
416 return error;
417 }
418
419 /*
420 * Common pre-write limit and setup checks.
421 *
422 * Called with the iolock held either shared or exclusive according to
423 * @iolock, and returns with it held. Might upgrade the iolock to exclusive
424 * if called for a direct write beyond i_size.
425 */
426 STATIC ssize_t
427 xfs_file_write_checks(
428 struct kiocb *iocb,
429 struct iov_iter *from,
430 unsigned int *iolock,
431 struct xfs_zone_alloc_ctx *ac)
432 {
433 struct inode *inode = iocb->ki_filp->f_mapping->host;
434 size_t count = iov_iter_count(from);
435 bool drained_dio = false;
436 ssize_t error;
437
438 restart:
439 error = generic_write_checks(iocb, from);
440 if (error <= 0)
441 return error;
442
443 if (iocb->ki_flags & IOCB_NOWAIT) {
444 error = break_layout(inode, false);
445 if (error == -EWOULDBLOCK)
446 error = -EAGAIN;
447 } else {
448 error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
449 }
450
451 if (error)
452 return error;
453
454 /*
455 * For changing security info in file_remove_privs() we need i_rwsem
456 * exclusively.
457 */
458 if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
459 xfs_iunlock(XFS_I(inode), *iolock);
460 *iolock = XFS_IOLOCK_EXCL;
461 error = xfs_ilock_iocb(iocb, *iolock);
462 if (error) {
463 *iolock = 0;
464 return error;
465 }
466 goto restart;
467 }
468
469 /*
470 * If the offset is beyond the size of the file, we need to zero all
471 * blocks that fall between the existing EOF and the start of this
472 * write.
473 *
474 * We can do an unlocked check for i_size here safely as I/O completion
475 * can only extend EOF. Truncate is locked out at this point, so the
476 * EOF can not move backwards, only forwards. Hence we only need to take
477 * the slow path when we are at or beyond the current EOF.
478 */
479 if (iocb->ki_pos > i_size_read(inode)) {
480 error = xfs_file_write_zero_eof(iocb, from, iolock, count,
481 &drained_dio, ac);
482 if (error == 1)
483 goto restart;
484 if (error)
485 return error;
486 }
487
488 return kiocb_modified(iocb);
489 }
490
491 static ssize_t
492 xfs_zoned_write_space_reserve(
493 struct xfs_mount *mp,
494 struct kiocb *iocb,
495 struct iov_iter *from,
496 unsigned int flags,
497 struct xfs_zone_alloc_ctx *ac)
498 {
499 loff_t count = iov_iter_count(from);
500 int error;
501
502 if (iocb->ki_flags & IOCB_NOWAIT)
503 flags |= XFS_ZR_NOWAIT;
504
505 /*
506 * Check the rlimit and LFS boundary first so that we don't over-reserve
507 * by possibly a lot.
508 *
509 * The generic write path will redo this check later, and it might have
510 * changed by then. If it got expanded we'll stick to our earlier
511 * smaller limit, and if it is decreased the new smaller limit will be
512 * used and our extra space reservation will be returned after finishing
513 * the write.
514 */
515 error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count);
516 if (error)
517 return error;
518
519 /*
520 * Sloppily round up count to file system blocks.
521 *
522 * This will often reserve an extra block, but that avoids having to look
523 * at the start offset, which isn't stable for O_APPEND until taking the
524 * iolock. Also we need to reserve a block each for zeroing the old
525 * EOF block and the new start block if they are unaligned.
526 *
527 * Any remaining block will be returned after the write.
528 */
529 return xfs_zoned_space_reserve(mp, XFS_B_TO_FSB(mp, count) + 1 + 2,
530 flags, ac);
531 }
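/*
 * Worked example of the reservation sizing above (illustration only, the
 * numbers are hypothetical): with a 4096 byte block size, a 10000 byte write
 * is rounded up by XFS_B_TO_FSB() to 3 blocks; adding 1 block for the not yet
 * stable start offset and 2 blocks for possibly zeroing the old EOF block and
 * the new start block gives a reservation of 6 blocks.  Whatever ends up
 * unused is handed back once the write completes.
 */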
532
533 static int
534 xfs_dio_write_end_io(
535 struct kiocb *iocb,
536 ssize_t size,
537 int error,
538 unsigned flags)
539 {
540 struct inode *inode = file_inode(iocb->ki_filp);
541 struct xfs_inode *ip = XFS_I(inode);
542 loff_t offset = iocb->ki_pos;
543 unsigned int nofs_flag;
544
545 ASSERT(!xfs_is_zoned_inode(ip) ||
546 !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW)));
547
548 trace_xfs_end_io_direct_write(ip, offset, size);
549
550 if (xfs_is_shutdown(ip->i_mount))
551 return -EIO;
552
553 if (error)
554 return error;
555 if (!size)
556 return 0;
557
558 /*
559 * Capture amount written on completion as we can't reliably account
560 * for it on submission.
561 */
562 XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
563
564 /*
565 * We can allocate memory here while doing writeback on behalf of
566 * memory reclaim. To avoid memory allocation deadlocks set the
567 * task-wide nofs context for the following operations.
568 */
569 nofs_flag = memalloc_nofs_save();
570
571 if (flags & IOMAP_DIO_COW) {
572 if (iocb->ki_flags & IOCB_ATOMIC)
573 error = xfs_reflink_end_atomic_cow(ip, offset, size);
574 else
575 error = xfs_reflink_end_cow(ip, offset, size);
576 if (error)
577 goto out;
578 }
579
580 /*
581 * Unwritten conversion updates the in-core isize after extent
582 * conversion but before updating the on-disk size. Updating isize any
583 * earlier allows a racing dio read to find unwritten extents before
584 * they are converted.
585 */
586 if (flags & IOMAP_DIO_UNWRITTEN) {
587 error = xfs_iomap_write_unwritten(ip, offset, size, true);
588 goto out;
589 }
590
591 /*
592 * We need to update the in-core inode size here so that we don't end up
593 * with the on-disk inode size being outside the in-core inode size. We
594 * have no other method of updating EOF for AIO, so always do it here
595 * if necessary.
596 *
597 * We need to lock the test/set EOF update as we can be racing with
598 * other IO completions here to update the EOF. Failing to serialise
599 * here can result in EOF moving backwards and Bad Things Happen when
600 * that occurs.
601 *
602 * As IO completion only ever extends EOF, we can do an unlocked check
603 * here to avoid taking the spinlock. If we land within the current EOF,
604 * then we do not need to do an extending update at all, and we don't
605 * need to take the lock to check this. If we race with an update moving
606 * EOF, then we'll either still be beyond EOF and need to take the lock,
607 * or we'll be within EOF and we don't need to take it at all.
608 */
609 if (offset + size <= i_size_read(inode))
610 goto out;
611
612 spin_lock(&ip->i_flags_lock);
613 if (offset + size > i_size_read(inode)) {
614 i_size_write(inode, offset + size);
615 spin_unlock(&ip->i_flags_lock);
616 error = xfs_setfilesize(ip, offset, size);
617 } else {
618 spin_unlock(&ip->i_flags_lock);
619 }
620
621 out:
622 memalloc_nofs_restore(nofs_flag);
623 return error;
624 }
625
626 static const struct iomap_dio_ops xfs_dio_write_ops = {
627 .end_io = xfs_dio_write_end_io,
628 };
629
630 static void
631 xfs_dio_zoned_submit_io(
632 const struct iomap_iter *iter,
633 struct bio *bio,
634 loff_t file_offset)
635 {
636 struct xfs_mount *mp = XFS_I(iter->inode)->i_mount;
637 struct xfs_zone_alloc_ctx *ac = iter->private;
638 xfs_filblks_t count_fsb;
639 struct iomap_ioend *ioend;
640
641 count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size);
642 if (count_fsb > ac->reserved_blocks) {
643 xfs_err(mp,
644 "allocation (%lld) larger than reservation (%lld).",
645 count_fsb, ac->reserved_blocks);
646 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
647 bio_io_error(bio);
648 return;
649 }
650 ac->reserved_blocks -= count_fsb;
651
652 bio->bi_end_io = xfs_end_bio;
653 ioend = iomap_init_ioend(iter->inode, bio, file_offset,
654 IOMAP_IOEND_DIRECT);
655 xfs_zone_alloc_and_submit(ioend, &ac->open_zone);
656 }
657
658 static const struct iomap_dio_ops xfs_dio_zoned_write_ops = {
659 .bio_set = &iomap_ioend_bioset,
660 .submit_io = xfs_dio_zoned_submit_io,
661 .end_io = xfs_dio_write_end_io,
662 };
663
664 /*
665 * Handle block aligned direct I/O writes.
666 */
667 static noinline ssize_t
668 xfs_file_dio_write_aligned(
669 struct xfs_inode *ip,
670 struct kiocb *iocb,
671 struct iov_iter *from,
672 const struct iomap_ops *ops,
673 const struct iomap_dio_ops *dops,
674 struct xfs_zone_alloc_ctx *ac)
675 {
676 unsigned int iolock = XFS_IOLOCK_SHARED;
677 ssize_t ret;
678
679 ret = xfs_ilock_iocb_for_write(iocb, &iolock);
680 if (ret)
681 return ret;
682 ret = xfs_file_write_checks(iocb, from, &iolock, ac);
683 if (ret)
684 goto out_unlock;
685
686 /*
687 * We don't need to hold the IOLOCK exclusively across the IO, so demote
688 * the iolock back to shared if we had to take the exclusive lock in
689 * xfs_file_write_checks() for other reasons.
690 */
691 if (iolock == XFS_IOLOCK_EXCL) {
692 xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
693 iolock = XFS_IOLOCK_SHARED;
694 }
695 trace_xfs_file_direct_write(iocb, from);
696 ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0);
697 out_unlock:
698 xfs_iunlock(ip, iolock);
699 return ret;
700 }
701
702 /*
703 * Handle block aligned direct I/O writes to zoned devices.
704 */
705 static noinline ssize_t
706 xfs_file_dio_write_zoned(
707 struct xfs_inode *ip,
708 struct kiocb *iocb,
709 struct iov_iter *from)
710 {
711 struct xfs_zone_alloc_ctx ac = { };
712 ssize_t ret;
713
714 ret = xfs_zoned_write_space_reserve(ip->i_mount, iocb, from, 0, &ac);
715 if (ret < 0)
716 return ret;
717 ret = xfs_file_dio_write_aligned(ip, iocb, from,
718 &xfs_zoned_direct_write_iomap_ops,
719 &xfs_dio_zoned_write_ops, &ac);
720 xfs_zoned_space_unreserve(ip->i_mount, &ac);
721 return ret;
722 }
723
724 /*
725 * Handle block atomic writes
726 *
727 * Two methods of atomic writes are supported:
728 * - REQ_ATOMIC-based, which would typically use some form of HW offload in the
729 * disk
730 * - COW-based, which uses a COW fork as a staging extent for data updates
731 * before atomically updating extent mappings for the range being written
732 *
733 */
734 static noinline ssize_t
735 xfs_file_dio_write_atomic(
736 struct xfs_inode *ip,
737 struct kiocb *iocb,
738 struct iov_iter *from)
739 {
740 unsigned int iolock = XFS_IOLOCK_SHARED;
741 ssize_t ret, ocount = iov_iter_count(from);
742 const struct iomap_ops *dops;
743
744 /*
745 * HW offload should be faster, so try that first if it is already
746 * known that the write length is not too large.
747 */
748 if (ocount > xfs_inode_buftarg(ip)->bt_awu_max)
749 dops = &xfs_atomic_write_cow_iomap_ops;
750 else
751 dops = &xfs_direct_write_iomap_ops;
752
753 retry:
754 ret = xfs_ilock_iocb_for_write(iocb, &iolock);
755 if (ret)
756 return ret;
757
758 ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
759 if (ret)
760 goto out_unlock;
761
762 /* Demote similar to xfs_file_dio_write_aligned() */
763 if (iolock == XFS_IOLOCK_EXCL) {
764 xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
765 iolock = XFS_IOLOCK_SHARED;
766 }
767
768 trace_xfs_file_direct_write(iocb, from);
769 ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops,
770 0, NULL, 0);
771
772 /*
773 * The retry mechanism is based on the ->iomap_begin method returning
774 * -ENOPROTOOPT, which happens when the REQ_ATOMIC-based write is not
775 * possible. The REQ_ATOMIC-based method will typically not be possible if
776 * the write spans multiple extents or the disk blocks are misaligned.
777 */
778 if (ret == -ENOPROTOOPT && dops == &xfs_direct_write_iomap_ops) {
779 xfs_iunlock(ip, iolock);
780 dops = &xfs_atomic_write_cow_iomap_ops;
781 goto retry;
782 }
783
784 out_unlock:
785 if (iolock)
786 xfs_iunlock(ip, iolock);
787 return ret;
788 }
789
790 /*
791 * Handle block unaligned direct I/O writes
792 *
793 * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
794 * them to be done in parallel with reads and other direct I/O writes. However,
795 * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
796 * to do sub-block zeroing and that requires serialisation against other direct
797 * I/O to the same block. In this case we need to serialise the submission of
798 * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
799 * In the case where sub-block zeroing is not required, we can do concurrent
800 * sub-block dios to the same block successfully.
801 *
802 * Optimistically submit the I/O using the shared lock first, but use the
803 * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
804 * if block allocation or partial block zeroing would be required. In that case
805 * we try again with the exclusive lock.
806 */
807 static noinline ssize_t
808 xfs_file_dio_write_unaligned(
809 struct xfs_inode *ip,
810 struct kiocb *iocb,
811 struct iov_iter *from)
812 {
813 size_t isize = i_size_read(VFS_I(ip));
814 size_t count = iov_iter_count(from);
815 unsigned int iolock = XFS_IOLOCK_SHARED;
816 unsigned int flags = IOMAP_DIO_OVERWRITE_ONLY;
817 ssize_t ret;
818
819 /*
820 * Extending writes need exclusivity because of the sub-block zeroing
821 * that the DIO code always does for partial tail blocks beyond EOF, so
822 * don't even bother trying the fast path in this case.
823 */
824 if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
825 if (iocb->ki_flags & IOCB_NOWAIT)
826 return -EAGAIN;
827 retry_exclusive:
828 iolock = XFS_IOLOCK_EXCL;
829 flags = IOMAP_DIO_FORCE_WAIT;
830 }
831
832 ret = xfs_ilock_iocb_for_write(iocb, &iolock);
833 if (ret)
834 return ret;
835
836 /*
837 * We can't properly handle unaligned direct I/O to reflink files yet,
838 * as we can't unshare a partial block.
839 */
840 if (xfs_is_cow_inode(ip)) {
841 trace_xfs_reflink_bounce_dio_write(iocb, from);
842 ret = -ENOTBLK;
843 goto out_unlock;
844 }
845
846 ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
847 if (ret)
848 goto out_unlock;
849
850 /*
851 * If we are doing exclusive unaligned I/O, this must be the only I/O
852 * in-flight. Otherwise we risk data corruption due to unwritten extent
853 * conversions from the AIO end_io handler. Wait for all other I/O to
854 * drain first.
855 */
856 if (flags & IOMAP_DIO_FORCE_WAIT)
857 inode_dio_wait(VFS_I(ip));
858
859 trace_xfs_file_direct_write(iocb, from);
860 ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
861 &xfs_dio_write_ops, flags, NULL, 0);
862
863 /*
864 * Retry unaligned I/O with exclusive blocking semantics if the DIO
865 * layer rejected it for mapping or locking reasons. If we are doing
866 * nonblocking user I/O, propagate the error.
867 */
868 if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
869 ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
870 xfs_iunlock(ip, iolock);
871 goto retry_exclusive;
872 }
873
874 out_unlock:
875 if (iolock)
876 xfs_iunlock(ip, iolock);
877 return ret;
878 }
879
880 static ssize_t
881 xfs_file_dio_write(
882 struct kiocb *iocb,
883 struct iov_iter *from)
884 {
885 struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
886 struct xfs_buftarg *target = xfs_inode_buftarg(ip);
887 size_t count = iov_iter_count(from);
888
889 /* direct I/O must be aligned to device logical sector size */
890 if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
891 return -EINVAL;
892
893 /*
894 * For always COW inodes we also must check the alignment of each
895 * individual iovec segment, as they could end up with different
896 * I/Os due to the way bio_iov_iter_get_pages works, and we'd
897 * then overwrite an already written block.
898 */
899 if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) ||
900 (xfs_is_always_cow_inode(ip) &&
901 (iov_iter_alignment(from) & ip->i_mount->m_blockmask)))
902 return xfs_file_dio_write_unaligned(ip, iocb, from);
903 if (xfs_is_zoned_inode(ip))
904 return xfs_file_dio_write_zoned(ip, iocb, from);
905 if (iocb->ki_flags & IOCB_ATOMIC)
906 return xfs_file_dio_write_atomic(ip, iocb, from);
907 return xfs_file_dio_write_aligned(ip, iocb, from,
908 &xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL);
909 }
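/*
 * Hedged illustration of the dispatch above (hypothetical numbers): with a
 * 512 byte logical sector size and 4096 byte filesystem blocks, a direct
 * write at pos = 512 for count = 1024 passes the sector check
 * ((512 | 1024) & 511 == 0) but fails the block alignment check
 * ((512 | 1024) & 4095 != 0), so it is routed through
 * xfs_file_dio_write_unaligned().  A write at pos = 4096 for count = 8192 is
 * block aligned and takes the zoned, atomic or plain aligned path depending
 * on the inode and iocb flags.
 */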
910
911 static noinline ssize_t
912 xfs_file_dax_write(
913 struct kiocb *iocb,
914 struct iov_iter *from)
915 {
916 struct inode *inode = iocb->ki_filp->f_mapping->host;
917 struct xfs_inode *ip = XFS_I(inode);
918 unsigned int iolock = XFS_IOLOCK_EXCL;
919 ssize_t ret, error = 0;
920 loff_t pos;
921
922 ret = xfs_ilock_iocb(iocb, iolock);
923 if (ret)
924 return ret;
925 ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
926 if (ret)
927 goto out;
928
929 pos = iocb->ki_pos;
930
931 trace_xfs_file_dax_write(iocb, from);
932 ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
933 if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
934 i_size_write(inode, iocb->ki_pos);
935 error = xfs_setfilesize(ip, pos, ret);
936 }
937 out:
938 if (iolock)
939 xfs_iunlock(ip, iolock);
940 if (error)
941 return error;
942
943 if (ret > 0) {
944 XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
945
946 /* Handle various SYNC-type writes */
947 ret = generic_write_sync(iocb, ret);
948 }
949 return ret;
950 }
951
952 STATIC ssize_t
953 xfs_file_buffered_write(
954 struct kiocb *iocb,
955 struct iov_iter *from)
956 {
957 struct inode *inode = iocb->ki_filp->f_mapping->host;
958 struct xfs_inode *ip = XFS_I(inode);
959 ssize_t ret;
960 bool cleared_space = false;
961 unsigned int iolock;
962
963 write_retry:
964 iolock = XFS_IOLOCK_EXCL;
965 ret = xfs_ilock_iocb(iocb, iolock);
966 if (ret)
967 return ret;
968
969 ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
970 if (ret)
971 goto out;
972
973 trace_xfs_file_buffered_write(iocb, from);
974 ret = iomap_file_buffered_write(iocb, from,
975 &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
976 NULL);
977
978 /*
979 * If we hit a space limit, try to free up some lingering preallocated
980 * space before returning an error. In the case of ENOSPC, first try to
981 * write back all dirty inodes to free up some of the excess reserved
982 * metadata space. This reduces the chances that the eofblocks scan
983 * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
984 * also behaves as a filter to prevent too many eofblocks scans from
985 * running at the same time. Use a synchronous scan to increase the
986 * effectiveness of the scan.
987 */
988 if (ret == -EDQUOT && !cleared_space) {
989 xfs_iunlock(ip, iolock);
990 xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
991 cleared_space = true;
992 goto write_retry;
993 } else if (ret == -ENOSPC && !cleared_space) {
994 struct xfs_icwalk icw = {0};
995
996 cleared_space = true;
997 xfs_flush_inodes(ip->i_mount);
998
999 xfs_iunlock(ip, iolock);
1000 icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
1001 xfs_blockgc_free_space(ip->i_mount, &icw);
1002 goto write_retry;
1003 }
1004
1005 out:
1006 if (iolock)
1007 xfs_iunlock(ip, iolock);
1008
1009 if (ret > 0) {
1010 XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
1011 /* Handle various SYNC-type writes */
1012 ret = generic_write_sync(iocb, ret);
1013 }
1014 return ret;
1015 }
1016
1017 STATIC ssize_t
1018 xfs_file_buffered_write_zoned(
1019 struct kiocb *iocb,
1020 struct iov_iter *from)
1021 {
1022 struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
1023 struct xfs_mount *mp = ip->i_mount;
1024 unsigned int iolock = XFS_IOLOCK_EXCL;
1025 bool cleared_space = false;
1026 struct xfs_zone_alloc_ctx ac = { };
1027 ssize_t ret;
1028
1029 ret = xfs_zoned_write_space_reserve(mp, iocb, from, XFS_ZR_GREEDY, &ac);
1030 if (ret < 0)
1031 return ret;
1032
1033 ret = xfs_ilock_iocb(iocb, iolock);
1034 if (ret)
1035 goto out_unreserve;
1036
1037 ret = xfs_file_write_checks(iocb, from, &iolock, &ac);
1038 if (ret)
1039 goto out_unlock;
1040
1041 /*
1042 * Truncate the iter to the length that we were actually able to
1043 * allocate blocks for. This needs to happen after
1044 * xfs_file_write_checks, because that assigns ki_pos for O_APPEND
1045 * writes.
1046 */
1047 iov_iter_truncate(from,
1048 XFS_FSB_TO_B(mp, ac.reserved_blocks) -
1049 (iocb->ki_pos & mp->m_blockmask));
1050 if (!iov_iter_count(from))
1051 goto out_unlock;
1052
1053 retry:
1054 trace_xfs_file_buffered_write(iocb, from);
1055 ret = iomap_file_buffered_write(iocb, from,
1056 &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
1057 &ac);
1058 if (ret == -ENOSPC && !cleared_space) {
1059 /*
1060 * Kick off writeback to convert delalloc space and release the
1061 * usually too pessimistic indirect block reservations.
1062 */
1063 xfs_flush_inodes(mp);
1064 cleared_space = true;
1065 goto retry;
1066 }
1067
1068 out_unlock:
1069 xfs_iunlock(ip, iolock);
1070 out_unreserve:
1071 xfs_zoned_space_unreserve(ip->i_mount, &ac);
1072 if (ret > 0) {
1073 XFS_STATS_ADD(mp, xs_write_bytes, ret);
1074 ret = generic_write_sync(iocb, ret);
1075 }
1076 return ret;
1077 }
1078
1079 STATIC ssize_t
1080 xfs_file_write_iter(
1081 struct kiocb *iocb,
1082 struct iov_iter *from)
1083 {
1084 struct inode *inode = iocb->ki_filp->f_mapping->host;
1085 struct xfs_inode *ip = XFS_I(inode);
1086 ssize_t ret;
1087 size_t ocount = iov_iter_count(from);
1088
1089 XFS_STATS_INC(ip->i_mount, xs_write_calls);
1090
1091 if (ocount == 0)
1092 return 0;
1093
1094 if (xfs_is_shutdown(ip->i_mount))
1095 return -EIO;
1096
1097 if (iocb->ki_flags & IOCB_ATOMIC) {
1098 if (ocount < xfs_get_atomic_write_min(ip))
1099 return -EINVAL;
1100
1101 if (ocount > xfs_get_atomic_write_max(ip))
1102 return -EINVAL;
1103
1104 ret = generic_atomic_write_valid(iocb, from);
1105 if (ret)
1106 return ret;
1107 }
1108
1109 if (IS_DAX(inode))
1110 return xfs_file_dax_write(iocb, from);
1111
1112 if (iocb->ki_flags & IOCB_DIRECT) {
1113 /*
1114 * Allow a directio write to fall back to a buffered
1115 * write *only* in the case that we're doing a reflink
1116 * CoW. In all other directio scenarios we do not
1117 * allow an operation to fall back to buffered mode.
1118 */
1119 ret = xfs_file_dio_write(iocb, from);
1120 if (ret != -ENOTBLK)
1121 return ret;
1122 }
1123
1124 if (xfs_is_zoned_inode(ip))
1125 return xfs_file_buffered_write_zoned(iocb, from);
1126 return xfs_file_buffered_write(iocb, from);
1127 }
1128
1129 /* Does this file, inode, or mount want synchronous writes? */
1130 static inline bool xfs_file_sync_writes(struct file *filp)
1131 {
1132 struct xfs_inode *ip = XFS_I(file_inode(filp));
1133
1134 if (xfs_has_wsync(ip->i_mount))
1135 return true;
1136 if (filp->f_flags & (__O_SYNC | O_DSYNC))
1137 return true;
1138 if (IS_SYNC(file_inode(filp)))
1139 return true;
1140
1141 return false;
1142 }
1143
1144 static int
1145 xfs_falloc_newsize(
1146 struct file *file,
1147 int mode,
1148 loff_t offset,
1149 loff_t len,
1150 loff_t *new_size)
1151 {
1152 struct inode *inode = file_inode(file);
1153
1154 if ((mode & FALLOC_FL_KEEP_SIZE) || offset + len <= i_size_read(inode))
1155 return 0;
1156 *new_size = offset + len;
1157 return inode_newsize_ok(inode, *new_size);
1158 }
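/*
 * Hypothetical example (not upstream code): for a 10000 byte file, an
 * fallocate at offset = 8192 for len = 8192 without FALLOC_FL_KEEP_SIZE
 * yields *new_size = 16384, which inode_newsize_ok() then validates against
 * limits such as RLIMIT_FSIZE and the superblock's s_maxbytes.
 */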
1159
1160 static int
1161 xfs_falloc_setsize(
1162 struct file *file,
1163 loff_t new_size)
1164 {
1165 struct iattr iattr = {
1166 .ia_valid = ATTR_SIZE,
1167 .ia_size = new_size,
1168 };
1169
1170 if (!new_size)
1171 return 0;
1172 return xfs_vn_setattr_size(file_mnt_idmap(file), file_dentry(file),
1173 &iattr);
1174 }
1175
1176 static int
1177 xfs_falloc_collapse_range(
1178 struct file *file,
1179 loff_t offset,
1180 loff_t len,
1181 struct xfs_zone_alloc_ctx *ac)
1182 {
1183 struct inode *inode = file_inode(file);
1184 loff_t new_size = i_size_read(inode) - len;
1185 int error;
1186
1187 if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
1188 return -EINVAL;
1189
1190 /*
1191 * There is no need to support a collapse range that overlaps EOF, as that
1192 * would effectively be a truncate operation
1193 */
1194 if (offset + len >= i_size_read(inode))
1195 return -EINVAL;
1196
1197 error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac);
1198 if (error)
1199 return error;
1200 return xfs_falloc_setsize(file, new_size);
1201 }
1202
1203 static int
1204 xfs_falloc_insert_range(
1205 struct file *file,
1206 loff_t offset,
1207 loff_t len)
1208 {
1209 struct inode *inode = file_inode(file);
1210 loff_t isize = i_size_read(inode);
1211 int error;
1212
1213 if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
1214 return -EINVAL;
1215
1216 /*
1217 * New inode size must not exceed ->s_maxbytes, accounting for
1218 * possible signed overflow.
1219 */
1220 if (inode->i_sb->s_maxbytes - isize < len)
1221 return -EFBIG;
1222
1223 /* Offset should be less than i_size */
1224 if (offset >= isize)
1225 return -EINVAL;
1226
1227 error = xfs_falloc_setsize(file, isize + len);
1228 if (error)
1229 return error;
1230
1231 /*
1232 * Perform hole insertion now that the file size has been updated so
1233 * that if we crash during the operation we don't leave shifted extents
1234 * past EOF and hence lose access to the data that is contained within
1235 * them.
1236 */
1237 return xfs_insert_file_space(XFS_I(inode), offset, len);
1238 }
1239
1240 /*
1241 * Punch a hole and prealloc the range. We use a hole punch rather than
1242 * unwritten extent conversion for two reasons:
1243 *
1244 * 1.) Hole punch handles partial block zeroing for us.
1245 * 2.) If prealloc returns ENOSPC, the file range is still zero-valued by
1246 * virtue of the hole punch.
1247 */
1248 static int
1249 xfs_falloc_zero_range(
1250 struct file *file,
1251 int mode,
1252 loff_t offset,
1253 loff_t len,
1254 struct xfs_zone_alloc_ctx *ac)
1255 {
1256 struct inode *inode = file_inode(file);
1257 unsigned int blksize = i_blocksize(inode);
1258 loff_t new_size = 0;
1259 int error;
1260
1261 trace_xfs_zero_file_space(XFS_I(inode));
1262
1263 error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
1264 if (error)
1265 return error;
1266
1267 error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
1268 if (error)
1269 return error;
1270
1271 len = round_up(offset + len, blksize) - round_down(offset, blksize);
1272 offset = round_down(offset, blksize);
1273 error = xfs_alloc_file_space(XFS_I(inode), offset, len);
1274 if (error)
1275 return error;
1276 return xfs_falloc_setsize(file, new_size);
1277 }
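/*
 * Rounding example for the preallocation above (hypothetical numbers): with
 * a 4096 byte block size, a zero range request at offset = 1000 for
 * len = 3000 first punches the byte range and then preallocates
 * round_up(4000, 4096) - round_down(1000, 4096) = 4096 bytes starting at
 * offset 0, i.e. the whole block containing the zeroed range.
 */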
1278
1279 static int
1280 xfs_falloc_unshare_range(
1281 struct file *file,
1282 int mode,
1283 loff_t offset,
1284 loff_t len)
1285 {
1286 struct inode *inode = file_inode(file);
1287 loff_t new_size = 0;
1288 int error;
1289
1290 error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
1291 if (error)
1292 return error;
1293
1294 error = xfs_reflink_unshare(XFS_I(inode), offset, len);
1295 if (error)
1296 return error;
1297
1298 error = xfs_alloc_file_space(XFS_I(inode), offset, len);
1299 if (error)
1300 return error;
1301 return xfs_falloc_setsize(file, new_size);
1302 }
1303
1304 static int
1305 xfs_falloc_allocate_range(
1306 struct file *file,
1307 int mode,
1308 loff_t offset,
1309 loff_t len)
1310 {
1311 struct inode *inode = file_inode(file);
1312 loff_t new_size = 0;
1313 int error;
1314
1315 /*
1316 * In always_cow mode we can't use preallocations and thus should not
1317 * create them.
1318 */
1319 if (xfs_is_always_cow_inode(XFS_I(inode)))
1320 return -EOPNOTSUPP;
1321
1322 error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
1323 if (error)
1324 return error;
1325
1326 error = xfs_alloc_file_space(XFS_I(inode), offset, len);
1327 if (error)
1328 return error;
1329 return xfs_falloc_setsize(file, new_size);
1330 }
1331
1332 #define XFS_FALLOC_FL_SUPPORTED \
1333 (FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE | \
1334 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | \
1335 FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE | \
1336 FALLOC_FL_UNSHARE_RANGE)
1337
1338 STATIC long
1339 __xfs_file_fallocate(
1340 struct file *file,
1341 int mode,
1342 loff_t offset,
1343 loff_t len,
1344 struct xfs_zone_alloc_ctx *ac)
1345 {
1346 struct inode *inode = file_inode(file);
1347 struct xfs_inode *ip = XFS_I(inode);
1348 long error;
1349 uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
1350
1351 xfs_ilock(ip, iolock);
1352 error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
1353 if (error)
1354 goto out_unlock;
1355
1356 /*
1357 * Must wait for all AIO to complete before we continue as AIO can
1358 * change the file size on completion without holding any locks we
1359 * currently hold. We must do this first because AIO can update both
1360 * the on disk and in memory inode sizes, and the operations that follow
1361 * require the in-memory size to be fully up-to-date.
1362 */
1363 inode_dio_wait(inode);
1364
1365 error = file_modified(file);
1366 if (error)
1367 goto out_unlock;
1368
1369 switch (mode & FALLOC_FL_MODE_MASK) {
1370 case FALLOC_FL_PUNCH_HOLE:
1371 error = xfs_free_file_space(ip, offset, len, ac);
1372 break;
1373 case FALLOC_FL_COLLAPSE_RANGE:
1374 error = xfs_falloc_collapse_range(file, offset, len, ac);
1375 break;
1376 case FALLOC_FL_INSERT_RANGE:
1377 error = xfs_falloc_insert_range(file, offset, len);
1378 break;
1379 case FALLOC_FL_ZERO_RANGE:
1380 error = xfs_falloc_zero_range(file, mode, offset, len, ac);
1381 break;
1382 case FALLOC_FL_UNSHARE_RANGE:
1383 error = xfs_falloc_unshare_range(file, mode, offset, len);
1384 break;
1385 case FALLOC_FL_ALLOCATE_RANGE:
1386 error = xfs_falloc_allocate_range(file, mode, offset, len);
1387 break;
1388 default:
1389 error = -EOPNOTSUPP;
1390 break;
1391 }
1392
1393 if (!error && xfs_file_sync_writes(file))
1394 error = xfs_log_force_inode(ip);
1395
1396 out_unlock:
1397 xfs_iunlock(ip, iolock);
1398 return error;
1399 }
1400
1401 static long
1402 xfs_file_zoned_fallocate(
1403 struct file *file,
1404 int mode,
1405 loff_t offset,
1406 loff_t len)
1407 {
1408 struct xfs_zone_alloc_ctx ac = { };
1409 struct xfs_inode *ip = XFS_I(file_inode(file));
1410 int error;
1411
1412 error = xfs_zoned_space_reserve(ip->i_mount, 2, XFS_ZR_RESERVED, &ac);
1413 if (error)
1414 return error;
1415 error = __xfs_file_fallocate(file, mode, offset, len, &ac);
1416 xfs_zoned_space_unreserve(ip->i_mount, &ac);
1417 return error;
1418 }
1419
1420 static long
1421 xfs_file_fallocate(
1422 struct file *file,
1423 int mode,
1424 loff_t offset,
1425 loff_t len)
1426 {
1427 struct inode *inode = file_inode(file);
1428
1429 if (!S_ISREG(inode->i_mode))
1430 return -EINVAL;
1431 if (mode & ~XFS_FALLOC_FL_SUPPORTED)
1432 return -EOPNOTSUPP;
1433
1434 /*
1435 * For zoned file systems, zeroing the first and last block of a hole
1436 * punch requires allocating a new block to rewrite the remaining data
1437 * and new zeroes out of place. Get a reservation for those before
1438 * taking the iolock. Dip into the reserved pool because we are
1439 * expected to be able to punch a hole even on a completely full
1440 * file system.
1441 */
1442 if (xfs_is_zoned_inode(XFS_I(inode)) &&
1443 (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
1444 FALLOC_FL_COLLAPSE_RANGE)))
1445 return xfs_file_zoned_fallocate(file, mode, offset, len);
1446 return __xfs_file_fallocate(file, mode, offset, len, NULL);
1447 }
1448
1449 STATIC int
1450 xfs_file_fadvise(
1451 struct file *file,
1452 loff_t start,
1453 loff_t end,
1454 int advice)
1455 {
1456 struct xfs_inode *ip = XFS_I(file_inode(file));
1457 int ret;
1458 int lockflags = 0;
1459
1460 /*
1461 * Operations creating pages in page cache need protection from hole
1462 * punching and similar ops
1463 */
1464 if (advice == POSIX_FADV_WILLNEED) {
1465 lockflags = XFS_IOLOCK_SHARED;
1466 xfs_ilock(ip, lockflags);
1467 }
1468 ret = generic_fadvise(file, start, end, advice);
1469 if (lockflags)
1470 xfs_iunlock(ip, lockflags);
1471 return ret;
1472 }
1473
1474 STATIC loff_t
1475 xfs_file_remap_range(
1476 struct file *file_in,
1477 loff_t pos_in,
1478 struct file *file_out,
1479 loff_t pos_out,
1480 loff_t len,
1481 unsigned int remap_flags)
1482 {
1483 struct inode *inode_in = file_inode(file_in);
1484 struct xfs_inode *src = XFS_I(inode_in);
1485 struct inode *inode_out = file_inode(file_out);
1486 struct xfs_inode *dest = XFS_I(inode_out);
1487 struct xfs_mount *mp = src->i_mount;
1488 loff_t remapped = 0;
1489 xfs_extlen_t cowextsize;
1490 int ret;
1491
1492 if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
1493 return -EINVAL;
1494
1495 if (!xfs_has_reflink(mp))
1496 return -EOPNOTSUPP;
1497
1498 if (xfs_is_shutdown(mp))
1499 return -EIO;
1500
1501 /* Prepare and then clone file data. */
1502 ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
1503 &len, remap_flags);
1504 if (ret || len == 0)
1505 return ret;
1506
1507 trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
1508
1509 ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
1510 &remapped);
1511 if (ret)
1512 goto out_unlock;
1513
1514 /*
1515 * Carry the cowextsize hint from src to dest if we're sharing the
1516 * entire source file to the entire destination file, the source file
1517 * has a cowextsize hint, and the destination file does not.
1518 */
1519 cowextsize = 0;
1520 if (pos_in == 0 && len == i_size_read(inode_in) &&
1521 (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
1522 pos_out == 0 && len >= i_size_read(inode_out) &&
1523 !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
1524 cowextsize = src->i_cowextsize;
1525
1526 ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
1527 remap_flags);
1528 if (ret)
1529 goto out_unlock;
1530
1531 if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
1532 xfs_log_force_inode(dest);
1533 out_unlock:
1534 xfs_iunlock2_remapping(src, dest);
1535 if (ret)
1536 trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
1537 /*
1538 * If the caller did not set CAN_SHORTEN, then it is not prepared to
1539 * handle partial results -- either the whole remap succeeds, or we
1540 * must say why it did not. In this case, any error should be returned
1541 * to the caller.
1542 */
1543 if (ret && remapped < len && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
1544 return ret;
1545 return remapped > 0 ? remapped : ret;
1546 }
1547
1548 STATIC int
1549 xfs_file_open(
1550 struct inode *inode,
1551 struct file *file)
1552 {
1553 if (xfs_is_shutdown(XFS_M(inode->i_sb)))
1554 return -EIO;
1555 file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
1556 if (xfs_get_atomic_write_min(XFS_I(inode)) > 0)
1557 file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
1558 return generic_file_open(inode, file);
1559 }
1560
1561 STATIC int
1562 xfs_dir_open(
1563 struct inode *inode,
1564 struct file *file)
1565 {
1566 struct xfs_inode *ip = XFS_I(inode);
1567 unsigned int mode;
1568 int error;
1569
1570 if (xfs_is_shutdown(ip->i_mount))
1571 return -EIO;
1572 error = generic_file_open(inode, file);
1573 if (error)
1574 return error;
1575
1576 /*
1577 * If there are any blocks, read-ahead block 0 as we're almost
1578 * certain to have the next operation be a read there.
1579 */
1580 mode = xfs_ilock_data_map_shared(ip);
1581 if (ip->i_df.if_nextents > 0)
1582 error = xfs_dir3_data_readahead(ip, 0, 0);
1583 xfs_iunlock(ip, mode);
1584 return error;
1585 }
1586
1587 /*
1588 * Don't bother propagating errors. We're just doing cleanup, and the caller
1589 * ignores the return value anyway.
1590 */
1591 STATIC int
1592 xfs_file_release(
1593 struct inode *inode,
1594 struct file *file)
1595 {
1596 struct xfs_inode *ip = XFS_I(inode);
1597 struct xfs_mount *mp = ip->i_mount;
1598
1599 /*
1600 * If this is a read-only mount or the file system has been shut down,
1601 * don't generate I/O.
1602 */
1603 if (xfs_is_readonly(mp) || xfs_is_shutdown(mp))
1604 return 0;
1605
1606 /*
1607 * If we previously truncated this file and removed old data in the
1608 * process, we want to initiate "early" writeout on the last close.
1609 * This is an attempt to combat the notorious NULL files problem which
1610 * is particularly noticeable from a truncate down, buffered (re-)write
1611 * (delalloc), followed by a crash. What we are effectively doing here
1612 * is significantly reducing the time window where we'd otherwise be
1613 * exposed to that problem.
1614 */
1615 if (xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED)) {
1616 xfs_iflags_clear(ip, XFS_EOFBLOCKS_RELEASED);
1617 if (ip->i_delayed_blks > 0)
1618 filemap_flush(inode->i_mapping);
1619 }
1620
1621 /*
1622 * XFS aggressively preallocates post-EOF space to generate contiguous
1623 * allocations for writers that append to the end of the file.
1624 *
1625 * To support workloads that close and reopen the file frequently, these
1626 * preallocations usually persist after a close unless it is the first
1627 * close for the inode. This is a tradeoff to generate tightly packed
1628 * data layouts for unpacking tarballs or similar archives that write
1629 * one file after another without going back to it while keeping the
1630 * preallocation for files that have recurring open/write/close cycles.
1631 *
1632 * This heuristic is skipped for inodes with the append-only flag as
1633 * that flag is rather pointless for inodes written only once.
1634 *
1635 * There is no point in freeing blocks here for open but unlinked files
1636 * as they will be taken care of by the inactivation path soon.
1637 *
1638 * When releasing a read-only context, don't flush data or trim post-EOF
1639 * blocks. This prevents open/read/close workloads from removing EOF
1640 * blocks that other writers depend upon to reduce fragmentation.
1641 *
1642 * Inodes on the zoned RT device never have preallocations, so skip
1643 * taking the locks below.
1644 */
1645 if (!inode->i_nlink ||
1646 !(file->f_mode & FMODE_WRITE) ||
1647 (ip->i_diflags & XFS_DIFLAG_APPEND) ||
1648 xfs_is_zoned_inode(ip))
1649 return 0;
1650
1651 /*
1652 * If we can't get the iolock just skip truncating the blocks past EOF
1653 * because we could deadlock with the mmap_lock otherwise. We'll get
1654 * another chance to drop them once the last reference to the inode is
1655 * dropped, so we'll never leak blocks permanently.
1656 */
1657 if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
1658 xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
1659 if (xfs_can_free_eofblocks(ip) &&
1660 !xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED))
1661 xfs_free_eofblocks(ip);
1662 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1663 }
1664
1665 return 0;
1666 }
1667
1668 STATIC int
1669 xfs_file_readdir(
1670 struct file *file,
1671 struct dir_context *ctx)
1672 {
1673 struct inode *inode = file_inode(file);
1674 xfs_inode_t *ip = XFS_I(inode);
1675 size_t bufsize;
1676
1677 /*
1678 * The Linux API doesn't pass the total size of the buffer
1679 * we read into down to the filesystem. With the filldir concept
1680 * it's not needed for correct information, but the XFS dir2 leaf
1681 * code wants an estimate of the buffer size to calculate its
1682 * readahead window and size the buffers used for mapping to
1683 * physical blocks.
1684 *
1685 * Try to give it an estimate that's good enough, maybe at some
1686 * point we can change the ->readdir prototype to include the
1687 * buffer size. For now we use the current glibc buffer size.
1688 */
1689 bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);
1690
1691 return xfs_readdir(NULL, ip, ctx, bufsize);
1692 }
1693
1694 STATIC loff_t
1695 xfs_file_llseek(
1696 struct file *file,
1697 loff_t offset,
1698 int whence)
1699 {
1700 struct inode *inode = file->f_mapping->host;
1701
1702 if (xfs_is_shutdown(XFS_I(inode)->i_mount))
1703 return -EIO;
1704
1705 switch (whence) {
1706 default:
1707 return generic_file_llseek(file, offset, whence);
1708 case SEEK_HOLE:
1709 offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
1710 break;
1711 case SEEK_DATA:
1712 offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
1713 break;
1714 }
1715
1716 if (offset < 0)
1717 return offset;
1718 return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1719 }
1720
1721 static inline vm_fault_t
1722 xfs_dax_fault_locked(
1723 struct vm_fault *vmf,
1724 unsigned int order,
1725 bool write_fault)
1726 {
1727 vm_fault_t ret;
1728 unsigned long pfn;
1729
1730 if (!IS_ENABLED(CONFIG_FS_DAX)) {
1731 ASSERT(0);
1732 return VM_FAULT_SIGBUS;
1733 }
1734 ret = dax_iomap_fault(vmf, order, &pfn, NULL,
1735 (write_fault && !vmf->cow_page) ?
1736 &xfs_dax_write_iomap_ops :
1737 &xfs_read_iomap_ops);
1738 if (ret & VM_FAULT_NEEDDSYNC)
1739 ret = dax_finish_sync_fault(vmf, order, pfn);
1740 return ret;
1741 }
1742
1743 static vm_fault_t
1744 xfs_dax_read_fault(
1745 struct vm_fault *vmf,
1746 unsigned int order)
1747 {
1748 struct xfs_inode *ip = XFS_I(file_inode(vmf->vma->vm_file));
1749 vm_fault_t ret;
1750
1751 trace_xfs_read_fault(ip, order);
1752
1753 xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
1754 ret = xfs_dax_fault_locked(vmf, order, false);
1755 xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
1756
1757 return ret;
1758 }
1759
1760 /*
1761 * Locking for serialisation of IO during page faults. This results in a lock
1762 * ordering of:
1763 *
1764 * mmap_lock (MM)
1765 * sb_start_pagefault(vfs, freeze)
1766 * invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
1767 * page_lock (MM)
1768 * i_lock (XFS - extent map serialisation)
1769 */
1770 static vm_fault_t
1771 __xfs_write_fault(
1772 struct vm_fault *vmf,
1773 unsigned int order,
1774 struct xfs_zone_alloc_ctx *ac)
1775 {
1776 struct inode *inode = file_inode(vmf->vma->vm_file);
1777 struct xfs_inode *ip = XFS_I(inode);
1778 unsigned int lock_mode = XFS_MMAPLOCK_SHARED;
1779 vm_fault_t ret;
1780
1781 trace_xfs_write_fault(ip, order);
1782
1783 sb_start_pagefault(inode->i_sb);
1784 file_update_time(vmf->vma->vm_file);
1785
1786 /*
1787 * Normally we only need the shared mmaplock, but if a reflink remap is
1788 * in progress we take the exclusive lock to wait for the remap to
1789 * finish before taking a write fault.
1790 */
1791 xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
1792 if (xfs_iflags_test(ip, XFS_IREMAPPING)) {
1793 xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
1794 xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
1795 lock_mode = XFS_MMAPLOCK_EXCL;
1796 }
1797
1798 if (IS_DAX(inode))
1799 ret = xfs_dax_fault_locked(vmf, order, true);
1800 else
1801 ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
1802 ac);
1803 xfs_iunlock(ip, lock_mode);
1804
1805 sb_end_pagefault(inode->i_sb);
1806 return ret;
1807 }
1808
1809 static vm_fault_t
1810 xfs_write_fault_zoned(
1811 struct vm_fault *vmf,
1812 unsigned int order)
1813 {
1814 struct xfs_inode *ip = XFS_I(file_inode(vmf->vma->vm_file));
1815 unsigned int len = folio_size(page_folio(vmf->page));
1816 struct xfs_zone_alloc_ctx ac = { };
1817 int error;
1818 vm_fault_t ret;
1819
1820 /*
1821 * This could over-allocate as it doesn't check for truncation.
1822 *
1823 * But as the overallocation is limited to less than a folio and will be
1824 * released instantly, that's just fine.
1825 */
1826 error = xfs_zoned_space_reserve(ip->i_mount,
1827 XFS_B_TO_FSB(ip->i_mount, len), 0, &ac);
1828 if (error < 0)
1829 return vmf_fs_error(error);
1830 ret = __xfs_write_fault(vmf, order, &ac);
1831 xfs_zoned_space_unreserve(ip->i_mount, &ac);
1832 return ret;
1833 }
1834
1835 static vm_fault_t
1836 xfs_write_fault(
1837 struct vm_fault *vmf,
1838 unsigned int order)
1839 {
1840 if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file))))
1841 return xfs_write_fault_zoned(vmf, order);
1842 return __xfs_write_fault(vmf, order, NULL);
1843 }
1844
1845 static inline bool
1846 xfs_is_write_fault(
1847 struct vm_fault *vmf)
1848 {
1849 return (vmf->flags & FAULT_FLAG_WRITE) &&
1850 (vmf->vma->vm_flags & VM_SHARED);
1851 }
1852
1853 static vm_fault_t
1854 xfs_filemap_fault(
1855 struct vm_fault *vmf)
1856 {
1857 struct inode *inode = file_inode(vmf->vma->vm_file);
1858
1859 /* DAX can shortcut the normal fault path on write faults! */
1860 if (IS_DAX(inode)) {
1861 if (xfs_is_write_fault(vmf))
1862 return xfs_write_fault(vmf, 0);
1863 return xfs_dax_read_fault(vmf, 0);
1864 }
1865
1866 trace_xfs_read_fault(XFS_I(inode), 0);
1867 return filemap_fault(vmf);
1868 }
1869
1870 static vm_fault_t
1871 xfs_filemap_huge_fault(
1872 struct vm_fault *vmf,
1873 unsigned int order)
1874 {
1875 if (!IS_DAX(file_inode(vmf->vma->vm_file)))
1876 return VM_FAULT_FALLBACK;
1877
1878 /* DAX can shortcut the normal fault path on write faults! */
1879 if (xfs_is_write_fault(vmf))
1880 return xfs_write_fault(vmf, order);
1881 return xfs_dax_read_fault(vmf, order);
1882 }
1883
1884 static vm_fault_t
1885 xfs_filemap_page_mkwrite(
1886 struct vm_fault *vmf)
1887 {
1888 return xfs_write_fault(vmf, 0);
1889 }
1890
1891 /*
1892 * pfn_mkwrite was originally intended to ensure we capture time stamp updates
1893 * on write faults. In reality, it needs to serialise against truncate and
1894 * prepare memory for writing, so handle it as a standard write fault.
1895 */
1896 static vm_fault_t
1897 xfs_filemap_pfn_mkwrite(
1898 struct vm_fault *vmf)
1899 {
1900 return xfs_write_fault(vmf, 0);
1901 }
1902
1903 static const struct vm_operations_struct xfs_file_vm_ops = {
1904 .fault = xfs_filemap_fault,
1905 .huge_fault = xfs_filemap_huge_fault,
1906 .map_pages = filemap_map_pages,
1907 .page_mkwrite = xfs_filemap_page_mkwrite,
1908 .pfn_mkwrite = xfs_filemap_pfn_mkwrite,
1909 };
1910
1911 STATIC int
1912 xfs_file_mmap_prepare(
1913 struct vm_area_desc *desc)
1914 {
1915 struct file *file = desc->file;
1916 struct inode *inode = file_inode(file);
1917 struct xfs_buftarg *target = xfs_inode_buftarg(XFS_I(inode));
1918
1919 /*
1920 * We don't support synchronous mappings for non-DAX files and
1921 * for DAX files if the underlying dax_device is not synchronous.
1922 */
1923 if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file),
1924 target->bt_daxdev))
1925 return -EOPNOTSUPP;
1926
1927 file_accessed(file);
1928 desc->vm_ops = &xfs_file_vm_ops;
1929 if (IS_DAX(inode))
1930 desc->vm_flags |= VM_HUGEPAGE;
1931 return 0;
1932 }
1933
1934 const struct file_operations xfs_file_operations = {
1935 .llseek = xfs_file_llseek,
1936 .read_iter = xfs_file_read_iter,
1937 .write_iter = xfs_file_write_iter,
1938 .splice_read = xfs_file_splice_read,
1939 .splice_write = iter_file_splice_write,
1940 .iopoll = iocb_bio_iopoll,
1941 .unlocked_ioctl = xfs_file_ioctl,
1942 #ifdef CONFIG_COMPAT
1943 .compat_ioctl = xfs_file_compat_ioctl,
1944 #endif
1945 .mmap_prepare = xfs_file_mmap_prepare,
1946 .open = xfs_file_open,
1947 .release = xfs_file_release,
1948 .fsync = xfs_file_fsync,
1949 .get_unmapped_area = thp_get_unmapped_area,
1950 .fallocate = xfs_file_fallocate,
1951 .fadvise = xfs_file_fadvise,
1952 .remap_file_range = xfs_file_remap_range,
1953 .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
1954 FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE |
1955 FOP_DONTCACHE,
1956 };
1957
1958 const struct file_operations xfs_dir_file_operations = {
1959 .open = xfs_dir_open,
1960 .read = generic_read_dir,
1961 .iterate_shared = xfs_file_readdir,
1962 .llseek = generic_file_llseek,
1963 .unlocked_ioctl = xfs_file_ioctl,
1964 #ifdef CONFIG_COMPAT
1965 .compat_ioctl = xfs_file_compat_ioctl,
1966 #endif
1967 .fsync = xfs_dir_fsync,
1968 };
1969