1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4 * All Rights Reserved.
5 */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_mount.h"
13 #include "xfs_inode.h"
14 #include "xfs_trans.h"
15 #include "xfs_inode_item.h"
16 #include "xfs_bmap.h"
17 #include "xfs_bmap_util.h"
18 #include "xfs_dir2.h"
19 #include "xfs_dir2_priv.h"
20 #include "xfs_ioctl.h"
21 #include "xfs_trace.h"
22 #include "xfs_log.h"
23 #include "xfs_icache.h"
24 #include "xfs_pnfs.h"
25 #include "xfs_iomap.h"
26 #include "xfs_reflink.h"
27 #include "xfs_file.h"
28 #include "xfs_aops.h"
29 #include "xfs_zone_alloc.h"
30 #include "xfs_error.h"
31 #include "xfs_errortag.h"
32
33 #include <linux/dax.h>
34 #include <linux/falloc.h>
35 #include <linux/backing-dev.h>
36 #include <linux/mman.h>
37 #include <linux/fadvise.h>
38 #include <linux/mount.h>
39
40 static const struct vm_operations_struct xfs_file_vm_ops;
41
42 /*
43 * Decide if the given file range is aligned to the size of the fundamental
44 * allocation unit for the file.
45 */
46 bool
47 xfs_is_falloc_aligned(
48 struct xfs_inode *ip,
49 loff_t pos,
50 long long int len)
51 {
52 unsigned int alloc_unit = xfs_inode_alloc_unitsize(ip);
53
54 if (!is_power_of_2(alloc_unit))
55 return isaligned_64(pos, alloc_unit) &&
56 isaligned_64(len, alloc_unit);
57
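	/*
	 * Power-of-two fast path: one mask test covers both values. For
	 * example, with a 4k allocation unit (mask 0xfff), pos = 8192 and
	 * len = 4096 give (pos | len) & 0xfff == 0, so the range is aligned.
	 */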
58 return !((pos | len) & (alloc_unit - 1));
59 }
60
61 /*
62 * Fsync operations on directories are much simpler than on regular files,
63 * as there is no file data to flush, and thus also no need for explicit
64 * cache flush operations, and there are no non-transaction metadata updates
65 * on directories either.
66 */
67 STATIC int
68 xfs_dir_fsync(
69 struct file *file,
70 loff_t start,
71 loff_t end,
72 int datasync)
73 {
74 struct xfs_inode *ip = XFS_I(file->f_mapping->host);
75
76 trace_xfs_dir_fsync(ip);
77 return xfs_log_force_inode(ip);
78 }
79
80 /*
81 * All metadata updates are logged, which means that we just have to push the
82 * journal to the required sequence number that holds the updates. We track
83 * datasync commits separately to full sync commits, and hence only need to
84 * select the correct sequence number for the log force here.
85 *
86 * We don't have to serialise against concurrent modifications, as we do not
87 * have to wait for modifications that have not yet completed. We define a
88 * transaction commit as completing when the commit sequence number is updated,
89 * hence if the sequence number has not been updated, the sync operation has been
90 * run before the commit completed and we don't have to wait for it.
91 *
92 * If we have concurrent fsync/fdatasync() calls, the sequence numbers remain
93 * set on the log item until - at least - the journal flush completes. In
94 * reality, they are only cleared when the inode is fully unpinned (i.e.
95 * persistent in the journal and not dirty in the CIL), and so we rely on
96 * xfs_log_force_seq() either skipping sequences that have been persisted or
97 * waiting on sequences that are still in flight to correctly order concurrent
98 * sync operations.
99 */
100 static int
101 xfs_fsync_flush_log(
102 struct xfs_inode *ip,
103 bool datasync,
104 int *log_flushed)
105 {
106 struct xfs_inode_log_item *iip = ip->i_itemp;
107 xfs_csn_t seq = 0;
108
109 spin_lock(&iip->ili_lock);
110 if (datasync)
111 seq = iip->ili_datasync_seq;
112 else
113 seq = iip->ili_commit_seq;
114 spin_unlock(&iip->ili_lock);
115
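	/*
	 * A sequence number of zero means no committed transaction affecting
	 * this inode is waiting to reach the journal, so there is nothing to
	 * force.
	 */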
116 if (!seq)
117 return 0;
118
119 return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
120 log_flushed);
121 }
122
123 STATIC int
124 xfs_file_fsync(
125 struct file *file,
126 loff_t start,
127 loff_t end,
128 int datasync)
129 {
130 struct xfs_inode *ip = XFS_I(file->f_mapping->host);
131 struct xfs_mount *mp = ip->i_mount;
132 int error, err2;
133 int log_flushed = 0;
134
135 trace_xfs_file_fsync(ip);
136
137 error = file_write_and_wait_range(file, start, end);
138 if (error)
139 return error;
140
141 if (xfs_is_shutdown(mp))
142 return -EIO;
143
144 xfs_iflags_clear(ip, XFS_ITRUNCATED);
145
146 /*
147 * If we have an RT and/or log subvolume we need to make sure to flush
148 * the write cache of the device used for file data first. This is to
149 * ensure newly written file data makes it to disk before logging the new
150 * inode size in case of an extending write.
151 */
152 if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp)
153 error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
154 else if (mp->m_logdev_targp != mp->m_ddev_targp)
155 error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
156
157 /*
158 * If the inode has an inode log item attached, it may need the journal
159 * flushed to persist any changes the log item might be tracking.
160 */
161 if (ip->i_itemp) {
162 err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
163 if (err2 && !error)
164 error = err2;
165 }
166
167 /*
168 * If we only have a single device, and the log force above was
169 * a no-op, we might have to flush the data device cache here.
170 * This can only happen for fdatasync/O_DSYNC if we were overwriting
171 * an already allocated file and thus do not have any metadata to
172 * commit.
173 */
174 if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
175 mp->m_logdev_targp == mp->m_ddev_targp) {
176 err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
177 if (err2 && !error)
178 error = err2;
179 }
180
181 return error;
182 }
183
184 static int
185 xfs_ilock_iocb(
186 struct kiocb *iocb,
187 unsigned int lock_mode)
188 {
189 struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
190
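	/*
	 * IOCB_NOWAIT callers (e.g. RWF_NOWAIT or io_uring nowait issue) must
	 * not sleep here, so only trylock and let them retry from a context
	 * that is allowed to block.
	 */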
191 if (iocb->ki_flags & IOCB_NOWAIT) {
192 if (!xfs_ilock_nowait(ip, lock_mode))
193 return -EAGAIN;
194 } else {
195 xfs_ilock(ip, lock_mode);
196 }
197
198 return 0;
199 }
200
201 static int
202 xfs_ilock_iocb_for_write(
203 struct kiocb *iocb,
204 unsigned int *lock_mode)
205 {
206 ssize_t ret;
207 struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
208
209 ret = xfs_ilock_iocb(iocb, *lock_mode);
210 if (ret)
211 return ret;
212
213 /*
214 * If a reflink remap is in progress we always need to take the iolock
215 * exclusively to wait for it to finish.
216 */
217 if (*lock_mode == XFS_IOLOCK_SHARED &&
218 xfs_iflags_test(ip, XFS_IREMAPPING)) {
219 xfs_iunlock(ip, *lock_mode);
220 *lock_mode = XFS_IOLOCK_EXCL;
221 return xfs_ilock_iocb(iocb, *lock_mode);
222 }
223
224 return 0;
225 }
226
227 STATIC ssize_t
228 xfs_file_dio_read(
229 struct kiocb *iocb,
230 struct iov_iter *to)
231 {
232 struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
233 ssize_t ret;
234
235 trace_xfs_file_direct_read(iocb, to);
236
237 if (!iov_iter_count(to))
238 return 0; /* skip atime */
239
240 file_accessed(iocb->ki_filp);
241
242 ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
243 if (ret)
244 return ret;
245 ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
246 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
247
248 return ret;
249 }
250
251 static noinline ssize_t
252 xfs_file_dax_read(
253 struct kiocb *iocb,
254 struct iov_iter *to)
255 {
256 struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
257 ssize_t ret = 0;
258
259 trace_xfs_file_dax_read(iocb, to);
260
261 if (!iov_iter_count(to))
262 return 0; /* skip atime */
263
264 ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
265 if (ret)
266 return ret;
267 ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
268 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
269
270 file_accessed(iocb->ki_filp);
271 return ret;
272 }
273
274 STATIC ssize_t
275 xfs_file_buffered_read(
276 struct kiocb *iocb,
277 struct iov_iter *to)
278 {
279 struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
280 ssize_t ret;
281
282 trace_xfs_file_buffered_read(iocb, to);
283
284 ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
285 if (ret)
286 return ret;
287 ret = generic_file_read_iter(iocb, to);
288 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
289
290 return ret;
291 }
292
293 STATIC ssize_t
294 xfs_file_read_iter(
295 struct kiocb *iocb,
296 struct iov_iter *to)
297 {
298 struct inode *inode = file_inode(iocb->ki_filp);
299 struct xfs_mount *mp = XFS_I(inode)->i_mount;
300 ssize_t ret = 0;
301
302 XFS_STATS_INC(mp, xs_read_calls);
303
304 if (xfs_is_shutdown(mp))
305 return -EIO;
306
307 if (IS_DAX(inode))
308 ret = xfs_file_dax_read(iocb, to);
309 else if (iocb->ki_flags & IOCB_DIRECT)
310 ret = xfs_file_dio_read(iocb, to);
311 else
312 ret = xfs_file_buffered_read(iocb, to);
313
314 if (ret > 0)
315 XFS_STATS_ADD(mp, xs_read_bytes, ret);
316 return ret;
317 }
318
319 STATIC ssize_t
320 xfs_file_splice_read(
321 struct file *in,
322 loff_t *ppos,
323 struct pipe_inode_info *pipe,
324 size_t len,
325 unsigned int flags)
326 {
327 struct inode *inode = file_inode(in);
328 struct xfs_inode *ip = XFS_I(inode);
329 struct xfs_mount *mp = ip->i_mount;
330 ssize_t ret = 0;
331
332 XFS_STATS_INC(mp, xs_read_calls);
333
334 if (xfs_is_shutdown(mp))
335 return -EIO;
336
337 trace_xfs_file_splice_read(ip, *ppos, len);
338
339 xfs_ilock(ip, XFS_IOLOCK_SHARED);
340 ret = filemap_splice_read(in, ppos, pipe, len, flags);
341 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
342 if (ret > 0)
343 XFS_STATS_ADD(mp, xs_read_bytes, ret);
344 return ret;
345 }
346
347 /*
348 * Take care of zeroing post-EOF blocks when they might exist.
349 *
350 * Returns 0 on success, a negative error on failure, or 1 if this
351 * function dropped the iolock and reacquired it exclusively and the caller
352 * needs to restart the write sanity checks.
353 */
354 static ssize_t
355 xfs_file_write_zero_eof(
356 struct kiocb *iocb,
357 struct iov_iter *from,
358 unsigned int *iolock,
359 size_t count,
360 bool *drained_dio,
361 struct xfs_zone_alloc_ctx *ac)
362 {
363 struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
364 loff_t isize;
365 int error;
366
367 /*
368 * We need to serialise against EOF updates that occur in IO completions
369 * here. We want to make sure that nobody is changing the size while
370 * we do this check until we have placed an IO barrier (i.e. hold
371 * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The
372 * spinlock effectively forms a memory barrier once we have
373 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
374 * hence be able to correctly determine if we need to run zeroing.
375 */
376 spin_lock(&ip->i_flags_lock);
377 isize = i_size_read(VFS_I(ip));
378 if (iocb->ki_pos <= isize) {
379 spin_unlock(&ip->i_flags_lock);
380 return 0;
381 }
382 spin_unlock(&ip->i_flags_lock);
383
384 if (iocb->ki_flags & IOCB_NOWAIT)
385 return -EAGAIN;
386
387 if (!*drained_dio) {
388 /*
389 * If zeroing is needed and we are currently holding the iolock
390 * shared, we need to update it to exclusive which implies
391 * having to redo all checks before.
392 */
393 if (*iolock == XFS_IOLOCK_SHARED) {
394 xfs_iunlock(ip, *iolock);
395 *iolock = XFS_IOLOCK_EXCL;
396 xfs_ilock(ip, *iolock);
397 iov_iter_reexpand(from, count);
398 }
399
400 /*
401 * We now have an IO submission barrier in place, but AIO can do
402 * EOF updates during IO completion and hence we now need to
403 * wait for all of them to drain. Non-AIO DIO will have drained
404 * before we are given the XFS_IOLOCK_EXCL, and so for most
405 * cases this wait is a no-op.
406 */
407 inode_dio_wait(VFS_I(ip));
408 *drained_dio = true;
409 return 1;
410 }
411
412 trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
413
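	/*
	 * The zeroing below dirties page cache between the old EOF and the new
	 * write position; hold the MMAPLOCK so page faults over that range
	 * cannot race with it.
	 */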
414 xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
415 error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL);
416 xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
417
418 return error;
419 }
420
421 /*
422 * Common pre-write limit and setup checks.
423 *
424 * Called with the iolock held either shared and exclusive according to
425 * @iolock, and returns with it held. Might upgrade the iolock to exclusive
426 * if called for a direct write beyond i_size.
427 */
428 STATIC ssize_t
429 xfs_file_write_checks(
430 struct kiocb *iocb,
431 struct iov_iter *from,
432 unsigned int *iolock,
433 struct xfs_zone_alloc_ctx *ac)
434 {
435 struct inode *inode = iocb->ki_filp->f_mapping->host;
436 size_t count = iov_iter_count(from);
437 bool drained_dio = false;
438 ssize_t error;
439
440 restart:
441 error = generic_write_checks(iocb, from);
442 if (error <= 0)
443 return error;
444
445 if (iocb->ki_flags & IOCB_NOWAIT) {
446 error = break_layout(inode, false);
447 if (error == -EWOULDBLOCK)
448 error = -EAGAIN;
449 } else {
450 error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
451 }
452
453 if (error)
454 return error;
455
456 /*
457 * For changing security info in file_remove_privs() we need i_rwsem
458 * exclusively.
459 */
460 if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
461 xfs_iunlock(XFS_I(inode), *iolock);
462 *iolock = XFS_IOLOCK_EXCL;
463 error = xfs_ilock_iocb(iocb, *iolock);
464 if (error) {
465 *iolock = 0;
466 return error;
467 }
468 goto restart;
469 }
470
471 /*
472 * If the offset is beyond the size of the file, we need to zero all
473 * blocks that fall between the existing EOF and the start of this
474 * write.
475 *
476 * We can do an unlocked check for i_size here safely as I/O completion
477 * can only extend EOF. Truncate is locked out at this point, so the
478 * EOF can not move backwards, only forwards. Hence we only need to take
479 * the slow path when we are at or beyond the current EOF.
480 */
481 if (iocb->ki_pos > i_size_read(inode)) {
482 error = xfs_file_write_zero_eof(iocb, from, iolock, count,
483 &drained_dio, ac);
484 if (error == 1)
485 goto restart;
486 if (error)
487 return error;
488 }
489
490 return kiocb_modified(iocb);
491 }
492
493 static ssize_t
494 xfs_zoned_write_space_reserve(
495 struct xfs_mount *mp,
496 struct kiocb *iocb,
497 struct iov_iter *from,
498 unsigned int flags,
499 struct xfs_zone_alloc_ctx *ac)
500 {
501 loff_t count = iov_iter_count(from);
502 int error;
503
504 if (iocb->ki_flags & IOCB_NOWAIT)
505 flags |= XFS_ZR_NOWAIT;
506
507 /*
508 * Check the rlimit and LFS boundary first so that we don't over-reserve
509 * by possibly a lot.
510 *
511 * The generic write path will redo this check later, and it might have
512 * changed by then. If it got expanded we'll stick to our earlier
513 * smaller limit, and if it is decreased the new smaller limit will be
514 * used and our extra space reservation will be returned after finishing
515 * the write.
516 */
517 error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count);
518 if (error)
519 return error;
520
521 /*
522 * Sloppily round up count to file system blocks.
523 *
524 * This will often reserve an extra block, but that avoids having to look
525 * at the start offset, which isn't stable for O_APPEND until taking the
526 * iolock. Also we need to reserve a block each for zeroing the old
527 * EOF block and the new start block if they are unaligned.
528 *
529 * Any remaining block will be returned after the write.
530 */
531 return xfs_zoned_space_reserve(mp, XFS_B_TO_FSB(mp, count) + 1 + 2,
532 flags, ac);
533 }
534
535 static int
536 xfs_dio_write_end_io(
537 struct kiocb *iocb,
538 ssize_t size,
539 int error,
540 unsigned flags)
541 {
542 struct inode *inode = file_inode(iocb->ki_filp);
543 struct xfs_inode *ip = XFS_I(inode);
544 loff_t offset = iocb->ki_pos;
545 unsigned int nofs_flag;
546
547 ASSERT(!xfs_is_zoned_inode(ip) ||
548 !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW)));
549
550 trace_xfs_end_io_direct_write(ip, offset, size);
551
552 if (xfs_is_shutdown(ip->i_mount))
553 return -EIO;
554
555 if (error)
556 return error;
557 if (!size)
558 return 0;
559
560 /*
561 * Capture amount written on completion as we can't reliably account
562 * for it on submission.
563 */
564 XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
565
566 /*
567 * We can allocate memory here while doing writeback on behalf of
568 * memory reclaim. To avoid memory allocation deadlocks set the
569 * task-wide nofs context for the following operations.
570 */
571 nofs_flag = memalloc_nofs_save();
572
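	/*
	 * COW completion remaps the extents staged in the COW fork over the
	 * range just written; the atomic variant performs the remap so that a
	 * failed atomic write is never partially visible.
	 */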
573 if (flags & IOMAP_DIO_COW) {
574 if (iocb->ki_flags & IOCB_ATOMIC)
575 error = xfs_reflink_end_atomic_cow(ip, offset, size);
576 else
577 error = xfs_reflink_end_cow(ip, offset, size);
578 if (error)
579 goto out;
580 }
581
582 /*
583 * Unwritten conversion updates the in-core isize after extent
584 * conversion but before updating the on-disk size. Updating isize any
585 * earlier allows a racing dio read to find unwritten extents before
586 * they are converted.
587 */
588 if (flags & IOMAP_DIO_UNWRITTEN) {
589 error = xfs_iomap_write_unwritten(ip, offset, size, true);
590 goto out;
591 }
592
593 /*
594 * We need to update the in-core inode size here so that we don't end up
595 * with the on-disk inode size being outside the in-core inode size. We
596 * have no other method of updating EOF for AIO, so always do it here
597 * if necessary.
598 *
599 * We need to lock the test/set EOF update as we can be racing with
600 * other IO completions here to update the EOF. Failing to serialise
601 * here can result in EOF moving backwards and Bad Things Happen when
602 * that occurs.
603 *
604 * As IO completion only ever extends EOF, we can do an unlocked check
605 * here to avoid taking the spinlock. If we land within the current EOF,
606 * then we do not need to do an extending update at all, and we don't
607 * need to take the lock to check this. If we race with an update moving
608 * EOF, then we'll either still be beyond EOF and need to take the lock,
609 * or we'll be within EOF and we don't need to take it at all.
610 */
611 if (offset + size <= i_size_read(inode))
612 goto out;
613
614 spin_lock(&ip->i_flags_lock);
615 if (offset + size > i_size_read(inode)) {
616 i_size_write(inode, offset + size);
617 spin_unlock(&ip->i_flags_lock);
618 error = xfs_setfilesize(ip, offset, size);
619 } else {
620 spin_unlock(&ip->i_flags_lock);
621 }
622
623 out:
624 memalloc_nofs_restore(nofs_flag);
625 return error;
626 }
627
628 static const struct iomap_dio_ops xfs_dio_write_ops = {
629 .end_io = xfs_dio_write_end_io,
630 };
631
632 static void
633 xfs_dio_zoned_submit_io(
634 const struct iomap_iter *iter,
635 struct bio *bio,
636 loff_t file_offset)
637 {
638 struct xfs_mount *mp = XFS_I(iter->inode)->i_mount;
639 struct xfs_zone_alloc_ctx *ac = iter->private;
640 xfs_filblks_t count_fsb;
641 struct iomap_ioend *ioend;
642
643 count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size);
644 if (count_fsb > ac->reserved_blocks) {
645 xfs_err(mp,
646 "allocation (%lld) larger than reservation (%lld).",
647 count_fsb, ac->reserved_blocks);
648 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
649 bio_io_error(bio);
650 return;
651 }
652 ac->reserved_blocks -= count_fsb;
653
654 bio->bi_end_io = xfs_end_bio;
655 ioend = iomap_init_ioend(iter->inode, bio, file_offset,
656 IOMAP_IOEND_DIRECT);
657 xfs_zone_alloc_and_submit(ioend, &ac->open_zone);
658 }
659
660 static const struct iomap_dio_ops xfs_dio_zoned_write_ops = {
661 .bio_set = &iomap_ioend_bioset,
662 .submit_io = xfs_dio_zoned_submit_io,
663 .end_io = xfs_dio_write_end_io,
664 };
665
666 /*
667 * Handle block aligned direct I/O writes.
668 */
669 static noinline ssize_t
670 xfs_file_dio_write_aligned(
671 struct xfs_inode *ip,
672 struct kiocb *iocb,
673 struct iov_iter *from,
674 const struct iomap_ops *ops,
675 const struct iomap_dio_ops *dops,
676 struct xfs_zone_alloc_ctx *ac)
677 {
678 unsigned int iolock = XFS_IOLOCK_SHARED;
679 unsigned int dio_flags = 0;
680 ssize_t ret;
681
682 /*
683 * For always COW inodes, each bio must be aligned to the file system
684 * block size and not just the device sector size because we need to
685 * allocate a block-aligned amount of space for each write.
686 */
687 if (xfs_is_always_cow_inode(ip))
688 dio_flags |= IOMAP_DIO_FSBLOCK_ALIGNED;
689
690 ret = xfs_ilock_iocb_for_write(iocb, &iolock);
691 if (ret)
692 return ret;
693 ret = xfs_file_write_checks(iocb, from, &iolock, ac);
694 if (ret)
695 goto out_unlock;
696
697 /*
698 * We don't need to hold the IOLOCK exclusively across the IO, so demote
699 * the iolock back to shared if we had to take the exclusive lock in
700 * xfs_file_write_checks() for other reasons.
701 */
702 if (iolock == XFS_IOLOCK_EXCL) {
703 xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
704 iolock = XFS_IOLOCK_SHARED;
705 }
706 trace_xfs_file_direct_write(iocb, from);
707 ret = iomap_dio_rw(iocb, from, ops, dops, dio_flags, ac, 0);
708 out_unlock:
709 xfs_iunlock(ip, iolock);
710 return ret;
711 }
712
713 /*
714 * Handle block aligned direct I/O writes to zoned devices.
715 */
716 static noinline ssize_t
717 xfs_file_dio_write_zoned(
718 struct xfs_inode *ip,
719 struct kiocb *iocb,
720 struct iov_iter *from)
721 {
722 struct xfs_zone_alloc_ctx ac = { };
723 ssize_t ret;
724
725 ret = xfs_zoned_write_space_reserve(ip->i_mount, iocb, from, 0, &ac);
726 if (ret < 0)
727 return ret;
728 ret = xfs_file_dio_write_aligned(ip, iocb, from,
729 &xfs_zoned_direct_write_iomap_ops,
730 &xfs_dio_zoned_write_ops, &ac);
731 xfs_zoned_space_unreserve(ip->i_mount, &ac);
732 return ret;
733 }
734
735 /*
736 * Handle block atomic writes
737 *
738 * Two methods of atomic writes are supported:
739 * - REQ_ATOMIC-based, which would typically use some form of HW offload in the
740 * disk
741 * - COW-based, which uses a COW fork as a staging extent for data updates
742 * before atomically updating extent mappings for the range being written
743 *
744 */
745 static noinline ssize_t
746 xfs_file_dio_write_atomic(
747 struct xfs_inode *ip,
748 struct kiocb *iocb,
749 struct iov_iter *from)
750 {
751 unsigned int iolock = XFS_IOLOCK_SHARED;
752 ssize_t ret, ocount = iov_iter_count(from);
753 const struct iomap_ops *dops;
754
755 /*
756 * HW offload should be faster, so try that first if it is already
757 * known that the write length is not too large.
758 */
759 if (ocount > xfs_inode_buftarg(ip)->bt_awu_max)
760 dops = &xfs_atomic_write_cow_iomap_ops;
761 else
762 dops = &xfs_direct_write_iomap_ops;
763
764 retry:
765 ret = xfs_ilock_iocb_for_write(iocb, &iolock);
766 if (ret)
767 return ret;
768
769 ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
770 if (ret)
771 goto out_unlock;
772
773 /* Demote similar to xfs_file_dio_write_aligned() */
774 if (iolock == XFS_IOLOCK_EXCL) {
775 xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
776 iolock = XFS_IOLOCK_SHARED;
777 }
778
779 trace_xfs_file_direct_write(iocb, from);
780 ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops,
781 0, NULL, 0);
782
783 /*
784 * The retry mechanism is based on the ->iomap_begin method returning
785 * -ENOPROTOOPT, which would be when the REQ_ATOMIC-based write is not
786 * possible. The REQ_ATOMIC-based method typically not be possible if
787 * the write spans multiple extents or the disk blocks are misaligned.
788 */
789 if (ret == -ENOPROTOOPT && dops == &xfs_direct_write_iomap_ops) {
790 xfs_iunlock(ip, iolock);
791 dops = &xfs_atomic_write_cow_iomap_ops;
792 goto retry;
793 }
794
795 out_unlock:
796 if (iolock)
797 xfs_iunlock(ip, iolock);
798 return ret;
799 }
800
801 /*
802 * Handle block unaligned direct I/O writes
803 *
804 * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
805 * them to be done in parallel with reads and other direct I/O writes. However,
806 * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
807 * to do sub-block zeroing and that requires serialisation against other direct
808 * I/O to the same block. In this case we need to serialise the submission of
809 * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
810 * In the case where sub-block zeroing is not required, we can do concurrent
811 * sub-block dios to the same block successfully.
812 *
813 * Optimistically submit the I/O using the shared lock first, but use the
814 * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
815 * if block allocation or partial block zeroing would be required. In that case
816 * we try again with the exclusive lock.
817 */
818 static noinline ssize_t
819 xfs_file_dio_write_unaligned(
820 struct xfs_inode *ip,
821 struct kiocb *iocb,
822 struct iov_iter *from)
823 {
824 size_t isize = i_size_read(VFS_I(ip));
825 size_t count = iov_iter_count(from);
826 unsigned int iolock = XFS_IOLOCK_SHARED;
827 unsigned int flags = IOMAP_DIO_OVERWRITE_ONLY;
828 ssize_t ret;
829
830 /*
831 * Extending writes need exclusivity because of the sub-block zeroing
832 * that the DIO code always does for partial tail blocks beyond EOF, so
833 * don't even bother trying the fast path in this case.
834 */
835 if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
836 if (iocb->ki_flags & IOCB_NOWAIT)
837 return -EAGAIN;
838 retry_exclusive:
839 iolock = XFS_IOLOCK_EXCL;
840 flags = IOMAP_DIO_FORCE_WAIT;
841 }
842
843 ret = xfs_ilock_iocb_for_write(iocb, &iolock);
844 if (ret)
845 return ret;
846
847 /*
848 * We can't properly handle unaligned direct I/O to reflink files yet,
849 * as we can't unshare a partial block.
850 */
851 if (xfs_is_cow_inode(ip)) {
852 trace_xfs_reflink_bounce_dio_write(iocb, from);
853 ret = -ENOTBLK;
854 goto out_unlock;
855 }
856
857 ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
858 if (ret)
859 goto out_unlock;
860
861 /*
862 * If we are doing exclusive unaligned I/O, this must be the only I/O
863 * in-flight. Otherwise we risk data corruption due to unwritten extent
864 * conversions from the AIO end_io handler. Wait for all other I/O to
865 * drain first.
866 */
867 if (flags & IOMAP_DIO_FORCE_WAIT)
868 inode_dio_wait(VFS_I(ip));
869
870 trace_xfs_file_direct_write(iocb, from);
871 ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
872 &xfs_dio_write_ops, flags, NULL, 0);
873
874 /*
875 * Retry unaligned I/O with exclusive blocking semantics if the DIO
876 * layer rejected it for mapping or locking reasons. If we are doing
877 * nonblocking user I/O, propagate the error.
878 */
879 if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
880 ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
881 xfs_iunlock(ip, iolock);
882 goto retry_exclusive;
883 }
884
885 out_unlock:
886 if (iolock)
887 xfs_iunlock(ip, iolock);
888 return ret;
889 }
890
891 static ssize_t
892 xfs_file_dio_write(
893 struct kiocb *iocb,
894 struct iov_iter *from)
895 {
896 struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
897 struct xfs_buftarg *target = xfs_inode_buftarg(ip);
898 size_t count = iov_iter_count(from);
899
900 /* direct I/O must be aligned to device logical sector size */
901 if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
902 return -EINVAL;
903
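	/*
	 * Example: with 512-byte logical sectors and 4k filesystem blocks, a
	 * 2k write at offset 1536 passes the sector check above but is block
	 * unaligned and thus takes the unaligned path below.
	 */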
904 if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
905 return xfs_file_dio_write_unaligned(ip, iocb, from);
906 if (xfs_is_zoned_inode(ip))
907 return xfs_file_dio_write_zoned(ip, iocb, from);
908 if (iocb->ki_flags & IOCB_ATOMIC)
909 return xfs_file_dio_write_atomic(ip, iocb, from);
910 return xfs_file_dio_write_aligned(ip, iocb, from,
911 &xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL);
912 }
913
914 static noinline ssize_t
915 xfs_file_dax_write(
916 struct kiocb *iocb,
917 struct iov_iter *from)
918 {
919 struct inode *inode = iocb->ki_filp->f_mapping->host;
920 struct xfs_inode *ip = XFS_I(inode);
921 unsigned int iolock = XFS_IOLOCK_EXCL;
922 ssize_t ret, error = 0;
923 loff_t pos;
924
925 ret = xfs_ilock_iocb(iocb, iolock);
926 if (ret)
927 return ret;
928 ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
929 if (ret)
930 goto out;
931
932 pos = iocb->ki_pos;
933
934 trace_xfs_file_dax_write(iocb, from);
935 ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
936 if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
937 i_size_write(inode, iocb->ki_pos);
938 error = xfs_setfilesize(ip, pos, ret);
939 }
940 out:
941 if (iolock)
942 xfs_iunlock(ip, iolock);
943 if (error)
944 return error;
945
946 if (ret > 0) {
947 XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
948
949 /* Handle various SYNC-type writes */
950 ret = generic_write_sync(iocb, ret);
951 }
952 return ret;
953 }
954
955 STATIC ssize_t
956 xfs_file_buffered_write(
957 struct kiocb *iocb,
958 struct iov_iter *from)
959 {
960 struct inode *inode = iocb->ki_filp->f_mapping->host;
961 struct xfs_inode *ip = XFS_I(inode);
962 ssize_t ret;
963 bool cleared_space = false;
964 unsigned int iolock;
965
966 write_retry:
967 iolock = XFS_IOLOCK_EXCL;
968 ret = xfs_ilock_iocb(iocb, iolock);
969 if (ret)
970 return ret;
971
972 ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
973 if (ret)
974 goto out;
975
976 trace_xfs_file_buffered_write(iocb, from);
977 ret = iomap_file_buffered_write(iocb, from,
978 &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
979 NULL);
980
981 /*
982 * If we hit a space limit, try to free up some lingering preallocated
983 * space before returning an error. In the case of ENOSPC, first try to
984 * write back all dirty inodes to free up some of the excess reserved
985 * metadata space. This reduces the chances that the eofblocks scan
986 * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
987 * also behaves as a filter to prevent too many eofblocks scans from
988 * running at the same time. Use a synchronous scan to increase the
989 * effectiveness of the scan.
990 */
991 if (ret == -EDQUOT && !cleared_space) {
992 xfs_iunlock(ip, iolock);
993 xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
994 cleared_space = true;
995 goto write_retry;
996 } else if (ret == -ENOSPC && !cleared_space) {
997 struct xfs_icwalk icw = {0};
998
999 cleared_space = true;
1000 xfs_flush_inodes(ip->i_mount);
1001
1002 xfs_iunlock(ip, iolock);
1003 icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
1004 xfs_blockgc_free_space(ip->i_mount, &icw);
1005 goto write_retry;
1006 }
1007
1008 out:
1009 if (iolock)
1010 xfs_iunlock(ip, iolock);
1011
1012 if (ret > 0) {
1013 XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
1014 /* Handle various SYNC-type writes */
1015 ret = generic_write_sync(iocb, ret);
1016 }
1017 return ret;
1018 }
1019
1020 STATIC ssize_t
1021 xfs_file_buffered_write_zoned(
1022 struct kiocb *iocb,
1023 struct iov_iter *from)
1024 {
1025 struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
1026 struct xfs_mount *mp = ip->i_mount;
1027 unsigned int iolock = XFS_IOLOCK_EXCL;
1028 bool cleared_space = false;
1029 struct xfs_zone_alloc_ctx ac = { };
1030 ssize_t ret;
1031
1032 ret = xfs_zoned_write_space_reserve(mp, iocb, from, XFS_ZR_GREEDY, &ac);
1033 if (ret < 0)
1034 return ret;
1035
1036 ret = xfs_ilock_iocb(iocb, iolock);
1037 if (ret)
1038 goto out_unreserve;
1039
1040 ret = xfs_file_write_checks(iocb, from, &iolock, &ac);
1041 if (ret)
1042 goto out_unlock;
1043
1044 /*
1045 * Truncate the iter to the length that we were actually able to
1046 * allocate blocks for. This needs to happen after
1047 * xfs_file_write_checks, because that assigns ki_pos for O_APPEND
1048 * writes.
1049 */
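	/*
	 * The reservation is in whole blocks; subtracting the offset within
	 * the first block converts it into the number of bytes that can be
	 * written from ki_pos without exceeding the reservation.
	 */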
1050 iov_iter_truncate(from,
1051 XFS_FSB_TO_B(mp, ac.reserved_blocks) -
1052 (iocb->ki_pos & mp->m_blockmask));
1053 if (!iov_iter_count(from))
1054 goto out_unlock;
1055
1056 retry:
1057 trace_xfs_file_buffered_write(iocb, from);
1058 ret = iomap_file_buffered_write(iocb, from,
1059 &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
1060 &ac);
1061 if (ret == -ENOSPC && !cleared_space) {
1062 /*
1063 * Kick off writeback to convert delalloc space and release the
1064 * usually too pessimistic indirect block reservations.
1065 */
1066 xfs_flush_inodes(mp);
1067 cleared_space = true;
1068 goto retry;
1069 }
1070
1071 out_unlock:
1072 xfs_iunlock(ip, iolock);
1073 out_unreserve:
1074 xfs_zoned_space_unreserve(ip->i_mount, &ac);
1075 if (ret > 0) {
1076 XFS_STATS_ADD(mp, xs_write_bytes, ret);
1077 ret = generic_write_sync(iocb, ret);
1078 }
1079 return ret;
1080 }
1081
1082 STATIC ssize_t
1083 xfs_file_write_iter(
1084 struct kiocb *iocb,
1085 struct iov_iter *from)
1086 {
1087 struct inode *inode = iocb->ki_filp->f_mapping->host;
1088 struct xfs_inode *ip = XFS_I(inode);
1089 ssize_t ret;
1090 size_t ocount = iov_iter_count(from);
1091
1092 XFS_STATS_INC(ip->i_mount, xs_write_calls);
1093
1094 if (ocount == 0)
1095 return 0;
1096
1097 if (xfs_is_shutdown(ip->i_mount))
1098 return -EIO;
1099
1100 if (iocb->ki_flags & IOCB_ATOMIC) {
1101 if (ocount < xfs_get_atomic_write_min(ip))
1102 return -EINVAL;
1103
1104 if (ocount > xfs_get_atomic_write_max(ip))
1105 return -EINVAL;
1106
1107 ret = generic_atomic_write_valid(iocb, from);
1108 if (ret)
1109 return ret;
1110 }
1111
1112 if (IS_DAX(inode))
1113 return xfs_file_dax_write(iocb, from);
1114
1115 if (iocb->ki_flags & IOCB_DIRECT) {
1116 /*
1117 * Allow a directio write to fall back to a buffered
1118 * write *only* in the case that we're doing a reflink
1119 * CoW. In all other directio scenarios we do not
1120 * allow an operation to fall back to buffered mode.
1121 */
1122 ret = xfs_file_dio_write(iocb, from);
1123 if (ret != -ENOTBLK)
1124 return ret;
1125 }
1126
1127 if (xfs_is_zoned_inode(ip))
1128 return xfs_file_buffered_write_zoned(iocb, from);
1129 return xfs_file_buffered_write(iocb, from);
1130 }
1131
1132 /* Does this file, inode, or mount want synchronous writes? */
1133 static inline bool xfs_file_sync_writes(struct file *filp)
1134 {
1135 struct xfs_inode *ip = XFS_I(file_inode(filp));
1136
1137 if (xfs_has_wsync(ip->i_mount))
1138 return true;
1139 if (filp->f_flags & (__O_SYNC | O_DSYNC))
1140 return true;
1141 if (IS_SYNC(file_inode(filp)))
1142 return true;
1143
1144 return false;
1145 }
1146
1147 static int
1148 xfs_falloc_newsize(
1149 struct file *file,
1150 int mode,
1151 loff_t offset,
1152 loff_t len,
1153 loff_t *new_size)
1154 {
1155 struct inode *inode = file_inode(file);
1156
1157 if ((mode & FALLOC_FL_KEEP_SIZE) || offset + len <= i_size_read(inode))
1158 return 0;
1159 *new_size = offset + len;
1160 return inode_newsize_ok(inode, *new_size);
1161 }
1162
1163 static int
1164 xfs_falloc_setsize(
1165 struct file *file,
1166 loff_t new_size)
1167 {
1168 struct iattr iattr = {
1169 .ia_valid = ATTR_SIZE,
1170 .ia_size = new_size,
1171 };
1172
1173 if (!new_size)
1174 return 0;
1175 return xfs_vn_setattr_size(file_mnt_idmap(file), file_dentry(file),
1176 &iattr);
1177 }
1178
1179 static int
1180 xfs_falloc_collapse_range(
1181 struct file *file,
1182 loff_t offset,
1183 loff_t len,
1184 struct xfs_zone_alloc_ctx *ac)
1185 {
1186 struct inode *inode = file_inode(file);
1187 loff_t new_size = i_size_read(inode) - len;
1188 int error;
1189
1190 if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
1191 return -EINVAL;
1192
1193 /*
1194 * Collapsing a range that reaches or extends beyond EOF is not allowed,
1195 * as that would effectively be a truncate operation
1196 */
1197 if (offset + len >= i_size_read(inode))
1198 return -EINVAL;
1199
1200 error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac);
1201 if (error)
1202 return error;
1203 return xfs_falloc_setsize(file, new_size);
1204 }
1205
1206 static int
1207 xfs_falloc_insert_range(
1208 struct file *file,
1209 loff_t offset,
1210 loff_t len)
1211 {
1212 struct inode *inode = file_inode(file);
1213 loff_t isize = i_size_read(inode);
1214 int error;
1215
1216 if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
1217 return -EINVAL;
1218
1219 /*
1220 * New inode size must not exceed ->s_maxbytes, accounting for
1221 * possible signed overflow.
1222 */
1223 if (inode->i_sb->s_maxbytes - isize < len)
1224 return -EFBIG;
1225
1226 /* Offset should be less than i_size */
1227 if (offset >= isize)
1228 return -EINVAL;
1229
1230 error = xfs_falloc_setsize(file, isize + len);
1231 if (error)
1232 return error;
1233
1234 /*
1235 * Perform hole insertion now that the file size has been updated so
1236 * that if we crash during the operation we don't leave shifted extents
1237 * past EOF and hence lose access to the data that is contained within
1238 * them.
1239 */
1240 return xfs_insert_file_space(XFS_I(inode), offset, len);
1241 }
1242
1243 /*
1244 * For various operations we need to zero up to one block at each end of
1245 * the affected range. For zoned file systems this will require a space
1246 * allocation, for which we need a reservation ahead of time.
1247 */
1248 #define XFS_ZONED_ZERO_EDGE_SPACE_RES 2
1249
1250 /*
1251 * Zero range implements a full zeroing mechanism but is only used in limited
1252 * situations. It is more efficient to allocate unwritten extents than to
1253 * perform zeroing here, so use an errortag to randomly force zeroing on DEBUG
1254 * kernels for added test coverage.
1255 *
1256 * On zoned file systems, the error is already injected by
1257 * xfs_file_zoned_fallocate, which then reserves the additional space needed.
1258 * We only check for this extra space reservation here.
1259 */
1260 static inline bool
1261 xfs_falloc_force_zero(
1262 struct xfs_inode *ip,
1263 struct xfs_zone_alloc_ctx *ac)
1264 {
1265 if (xfs_is_zoned_inode(ip)) {
1266 if (ac->reserved_blocks > XFS_ZONED_ZERO_EDGE_SPACE_RES) {
1267 ASSERT(IS_ENABLED(CONFIG_XFS_DEBUG));
1268 return true;
1269 }
1270 return false;
1271 }
1272 return XFS_TEST_ERROR(ip->i_mount, XFS_ERRTAG_FORCE_ZERO_RANGE);
1273 }
1274
1275 /*
1276 * Punch a hole and prealloc the range. We use a hole punch rather than
1277 * unwritten extent conversion for two reasons:
1278 *
1279 * 1.) Hole punch handles partial block zeroing for us.
1280 * 2.) If prealloc returns ENOSPC, the file range is still zero-valued by
1281 * virtue of the hole punch.
1282 */
1283 static int
1284 xfs_falloc_zero_range(
1285 struct file *file,
1286 int mode,
1287 loff_t offset,
1288 loff_t len,
1289 struct xfs_zone_alloc_ctx *ac)
1290 {
1291 struct inode *inode = file_inode(file);
1292 struct xfs_inode *ip = XFS_I(inode);
1293 unsigned int blksize = i_blocksize(inode);
1294 loff_t new_size = 0;
1295 int error;
1296
1297 trace_xfs_zero_file_space(ip);
1298
1299 error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
1300 if (error)
1301 return error;
1302
1303 if (xfs_falloc_force_zero(ip, ac)) {
1304 error = xfs_zero_range(ip, offset, len, ac, NULL);
1305 } else {
1306 error = xfs_free_file_space(ip, offset, len, ac);
1307 if (error)
1308 return error;
1309
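		/*
		 * The punch above zeroed any partial edge blocks; round the
		 * range out to block boundaries so the preallocation below
		 * covers those edge blocks as well.
		 */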
1310 len = round_up(offset + len, blksize) -
1311 round_down(offset, blksize);
1312 offset = round_down(offset, blksize);
1313 error = xfs_alloc_file_space(ip, offset, len);
1314 }
1315 if (error)
1316 return error;
1317 return xfs_falloc_setsize(file, new_size);
1318 }
1319
1320 static int
1321 xfs_falloc_unshare_range(
1322 struct file *file,
1323 int mode,
1324 loff_t offset,
1325 loff_t len)
1326 {
1327 struct inode *inode = file_inode(file);
1328 loff_t new_size = 0;
1329 int error;
1330
1331 error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
1332 if (error)
1333 return error;
1334
1335 error = xfs_reflink_unshare(XFS_I(inode), offset, len);
1336 if (error)
1337 return error;
1338
1339 error = xfs_alloc_file_space(XFS_I(inode), offset, len);
1340 if (error)
1341 return error;
1342 return xfs_falloc_setsize(file, new_size);
1343 }
1344
1345 static int
1346 xfs_falloc_allocate_range(
1347 struct file *file,
1348 int mode,
1349 loff_t offset,
1350 loff_t len)
1351 {
1352 struct inode *inode = file_inode(file);
1353 loff_t new_size = 0;
1354 int error;
1355
1356 /*
1357 * In always_cow mode we can't use preallocations and thus should not
1358 * create them.
1359 */
1360 if (xfs_is_always_cow_inode(XFS_I(inode)))
1361 return -EOPNOTSUPP;
1362
1363 error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
1364 if (error)
1365 return error;
1366
1367 error = xfs_alloc_file_space(XFS_I(inode), offset, len);
1368 if (error)
1369 return error;
1370 return xfs_falloc_setsize(file, new_size);
1371 }
1372
1373 #define XFS_FALLOC_FL_SUPPORTED \
1374 (FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE | \
1375 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | \
1376 FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE | \
1377 FALLOC_FL_UNSHARE_RANGE)
1378
1379 STATIC long
1380 __xfs_file_fallocate(
1381 struct file *file,
1382 int mode,
1383 loff_t offset,
1384 loff_t len,
1385 struct xfs_zone_alloc_ctx *ac)
1386 {
1387 struct inode *inode = file_inode(file);
1388 struct xfs_inode *ip = XFS_I(inode);
1389 long error;
1390 uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
1391
1392 xfs_ilock(ip, iolock);
1393 error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
1394 if (error)
1395 goto out_unlock;
1396
1397 /*
1398 * Must wait for all AIO to complete before we continue as AIO can
1399 * change the file size on completion without holding any locks we
1400 * currently hold. We must do this first because AIO can update both
1401 * the on disk and in memory inode sizes, and the operations that follow
1402 * require the in-memory size to be fully up-to-date.
1403 */
1404 inode_dio_wait(inode);
1405
1406 error = file_modified(file);
1407 if (error)
1408 goto out_unlock;
1409
1410 switch (mode & FALLOC_FL_MODE_MASK) {
1411 case FALLOC_FL_PUNCH_HOLE:
1412 error = xfs_free_file_space(ip, offset, len, ac);
1413 break;
1414 case FALLOC_FL_COLLAPSE_RANGE:
1415 error = xfs_falloc_collapse_range(file, offset, len, ac);
1416 break;
1417 case FALLOC_FL_INSERT_RANGE:
1418 error = xfs_falloc_insert_range(file, offset, len);
1419 break;
1420 case FALLOC_FL_ZERO_RANGE:
1421 error = xfs_falloc_zero_range(file, mode, offset, len, ac);
1422 break;
1423 case FALLOC_FL_UNSHARE_RANGE:
1424 error = xfs_falloc_unshare_range(file, mode, offset, len);
1425 break;
1426 case FALLOC_FL_ALLOCATE_RANGE:
1427 error = xfs_falloc_allocate_range(file, mode, offset, len);
1428 break;
1429 default:
1430 error = -EOPNOTSUPP;
1431 break;
1432 }
1433
1434 if (!error && xfs_file_sync_writes(file))
1435 error = xfs_log_force_inode(ip);
1436
1437 out_unlock:
1438 xfs_iunlock(ip, iolock);
1439 return error;
1440 }
1441
1442 static long
1443 xfs_file_zoned_fallocate(
1444 struct file *file,
1445 int mode,
1446 loff_t offset,
1447 loff_t len)
1448 {
1449 struct xfs_zone_alloc_ctx ac = { };
1450 struct xfs_inode *ip = XFS_I(file_inode(file));
1451 struct xfs_mount *mp = ip->i_mount;
1452 xfs_filblks_t count_fsb;
1453 int error;
1454
1455 /*
1456 * If full zeroing is forced by the error injection knob, we need a
1457 * space reservation that covers the entire range. See the comment in
1458 * xfs_zoned_write_space_reserve for the rationale for the calculation.
1459 * Otherwise just reserve space for the two boundary blocks.
1460 */
1461 count_fsb = XFS_ZONED_ZERO_EDGE_SPACE_RES;
1462 if ((mode & FALLOC_FL_MODE_MASK) == FALLOC_FL_ZERO_RANGE &&
1463 XFS_TEST_ERROR(mp, XFS_ERRTAG_FORCE_ZERO_RANGE))
1464 count_fsb += XFS_B_TO_FSB(mp, len) + 1;
1465
1466 error = xfs_zoned_space_reserve(mp, count_fsb, XFS_ZR_RESERVED, &ac);
1467 if (error)
1468 return error;
1469 error = __xfs_file_fallocate(file, mode, offset, len, &ac);
1470 xfs_zoned_space_unreserve(mp, &ac);
1471 return error;
1472 }
1473
1474 static long
1475 xfs_file_fallocate(
1476 struct file *file,
1477 int mode,
1478 loff_t offset,
1479 loff_t len)
1480 {
1481 struct inode *inode = file_inode(file);
1482
1483 if (!S_ISREG(inode->i_mode))
1484 return -EINVAL;
1485 if (mode & ~XFS_FALLOC_FL_SUPPORTED)
1486 return -EOPNOTSUPP;
1487
1488 /*
1489 * For zoned file systems, zeroing the first and last block of a hole
1490 * punch requires allocating a new block to rewrite the remaining data
1491 * and new zeroes out of place. Get a reservation for those before
1492 * taking the iolock. Dip into the reserved pool because we are
1493 * expected to be able to punch a hole even on a completely full
1494 * file system.
1495 */
1496 if (xfs_is_zoned_inode(XFS_I(inode)) &&
1497 (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
1498 FALLOC_FL_COLLAPSE_RANGE)))
1499 return xfs_file_zoned_fallocate(file, mode, offset, len);
1500 return __xfs_file_fallocate(file, mode, offset, len, NULL);
1501 }
1502
1503 STATIC int
1504 xfs_file_fadvise(
1505 struct file *file,
1506 loff_t start,
1507 loff_t end,
1508 int advice)
1509 {
1510 struct xfs_inode *ip = XFS_I(file_inode(file));
1511 int ret;
1512 int lockflags = 0;
1513
1514 /*
1515 * Operations creating pages in page cache need protection from hole
1516 * punching and similar ops
1517 */
1518 if (advice == POSIX_FADV_WILLNEED) {
1519 lockflags = XFS_IOLOCK_SHARED;
1520 xfs_ilock(ip, lockflags);
1521 }
1522 ret = generic_fadvise(file, start, end, advice);
1523 if (lockflags)
1524 xfs_iunlock(ip, lockflags);
1525 return ret;
1526 }
1527
1528 STATIC loff_t
1529 xfs_file_remap_range(
1530 struct file *file_in,
1531 loff_t pos_in,
1532 struct file *file_out,
1533 loff_t pos_out,
1534 loff_t len,
1535 unsigned int remap_flags)
1536 {
1537 struct inode *inode_in = file_inode(file_in);
1538 struct xfs_inode *src = XFS_I(inode_in);
1539 struct inode *inode_out = file_inode(file_out);
1540 struct xfs_inode *dest = XFS_I(inode_out);
1541 struct xfs_mount *mp = src->i_mount;
1542 loff_t remapped = 0;
1543 xfs_extlen_t cowextsize;
1544 int ret;
1545
1546 if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
1547 return -EINVAL;
1548
1549 if (!xfs_has_reflink(mp))
1550 return -EOPNOTSUPP;
1551
1552 if (xfs_is_shutdown(mp))
1553 return -EIO;
1554
1555 /* Prepare and then clone file data. */
1556 ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
1557 &len, remap_flags);
1558 if (ret || len == 0)
1559 return ret;
1560
1561 trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
1562
1563 ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
1564 &remapped);
1565 if (ret)
1566 goto out_unlock;
1567
1568 /*
1569 * Carry the cowextsize hint from src to dest if we're sharing the
1570 * entire source file to the entire destination file, the source file
1571 * has a cowextsize hint, and the destination file does not.
1572 */
1573 cowextsize = 0;
1574 if (pos_in == 0 && len == i_size_read(inode_in) &&
1575 (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
1576 pos_out == 0 && len >= i_size_read(inode_out) &&
1577 !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
1578 cowextsize = src->i_cowextsize;
1579
1580 ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
1581 remap_flags);
1582 if (ret)
1583 goto out_unlock;
1584
1585 if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
1586 xfs_log_force_inode(dest);
1587 out_unlock:
1588 xfs_iunlock2_remapping(src, dest);
1589 if (ret)
1590 trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
1591 /*
1592 * If the caller did not set CAN_SHORTEN, then it is not prepared to
1593 * handle partial results -- either the whole remap succeeds, or we
1594 * must say why it did not. In this case, any error should be returned
1595 * to the caller.
1596 */
1597 if (ret && remapped < len && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
1598 return ret;
1599 return remapped > 0 ? remapped : ret;
1600 }
1601
1602 STATIC int
1603 xfs_file_open(
1604 struct inode *inode,
1605 struct file *file)
1606 {
1607 if (xfs_is_shutdown(XFS_M(inode->i_sb)))
1608 return -EIO;
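	/*
	 * Advertise RWF_NOWAIT and O_DIRECT support up front; atomic writes
	 * are only advertised when the inode reports a non-zero minimum
	 * atomic write size.
	 */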
1609 file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
1610 if (xfs_get_atomic_write_min(XFS_I(inode)) > 0)
1611 file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
1612 return generic_file_open(inode, file);
1613 }
1614
1615 STATIC int
1616 xfs_dir_open(
1617 struct inode *inode,
1618 struct file *file)
1619 {
1620 struct xfs_inode *ip = XFS_I(inode);
1621 unsigned int mode;
1622 int error;
1623
1624 if (xfs_is_shutdown(ip->i_mount))
1625 return -EIO;
1626 error = generic_file_open(inode, file);
1627 if (error)
1628 return error;
1629
1630 /*
1631 * If there are any blocks, read-ahead block 0 as we're almost
1632 * certain to have the next operation be a read there.
1633 */
1634 mode = xfs_ilock_data_map_shared(ip);
1635 if (ip->i_df.if_nextents > 0)
1636 error = xfs_dir3_data_readahead(ip, 0, 0);
1637 xfs_iunlock(ip, mode);
1638 return error;
1639 }
1640
1641 /*
1642 * Don't bother propagating errors. We're just doing cleanup, and the caller
1643 * ignores the return value anyway.
1644 */
1645 STATIC int
1646 xfs_file_release(
1647 struct inode *inode,
1648 struct file *file)
1649 {
1650 struct xfs_inode *ip = XFS_I(inode);
1651 struct xfs_mount *mp = ip->i_mount;
1652
1653 /*
1654 * If this is a read-only mount or the file system has been shut down,
1655 * don't generate I/O.
1656 */
1657 if (xfs_is_readonly(mp) || xfs_is_shutdown(mp))
1658 return 0;
1659
1660 /*
1661 * If we previously truncated this file and removed old data in the
1662 * process, we want to initiate "early" writeout on the last close.
1663 * This is an attempt to combat the notorious NULL files problem which
1664 * is particularly noticeable from a truncate down, buffered (re-)write
1665 * (delalloc), followed by a crash. What we are effectively doing here
1666 * is significantly reducing the time window where we'd otherwise be
1667 * exposed to that problem.
1668 */
1669 if (xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED)) {
1670 xfs_iflags_clear(ip, XFS_EOFBLOCKS_RELEASED);
1671 if (ip->i_delayed_blks > 0)
1672 filemap_flush(inode->i_mapping);
1673 }
1674
1675 /*
1676 * XFS aggressively preallocates post-EOF space to generate contiguous
1677 * allocations for writers that append to the end of the file.
1678 *
1679 * To support workloads that close and reopen the file frequently, these
1680 * preallocations usually persist after a close unless it is the first
1681 * close for the inode. This is a tradeoff to generate tightly packed
1682 * data layouts for unpacking tarballs or similar archives that write
1683 * one file after another without going back to it while keeping the
1684 * preallocation for files that have recurring open/write/close cycles.
1685 *
1686 * This heuristic is skipped for inodes with the append-only flag as
1687 * that flag is rather pointless for inodes written only once.
1688 *
1689 * There is no point in freeing blocks here for open but unlinked files
1690 * as they will be taken care of by the inactivation path soon.
1691 *
1692 * When releasing a read-only context, don't flush data or trim post-EOF
1693 * blocks. This avoids open/read/close workloads from removing EOF
1694 * blocks that other writers depend upon to reduce fragmentation.
1695 *
1696 * Inodes on the zoned RT device never have preallocations, so skip
1697 * taking the locks below.
1698 */
1699 if (!inode->i_nlink ||
1700 !(file->f_mode & FMODE_WRITE) ||
1701 (ip->i_diflags & XFS_DIFLAG_APPEND) ||
1702 xfs_is_zoned_inode(ip))
1703 return 0;
1704
1705 /*
1706 * If we can't get the iolock just skip truncating the blocks past EOF
1707 * because we could deadlock with the mmap_lock otherwise. We'll get
1708 * another chance to drop them once the last reference to the inode is
1709 * dropped, so we'll never leak blocks permanently.
1710 */
1711 if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
1712 xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
1713 if (xfs_can_free_eofblocks(ip) &&
1714 !xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED))
1715 xfs_free_eofblocks(ip);
1716 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1717 }
1718
1719 return 0;
1720 }
1721
1722 STATIC int
1723 xfs_file_readdir(
1724 struct file *file,
1725 struct dir_context *ctx)
1726 {
1727 struct inode *inode = file_inode(file);
1728 xfs_inode_t *ip = XFS_I(inode);
1729 size_t bufsize;
1730
1731 /*
1732 * The Linux API doesn't pass the total size of the buffer
1733 * we read into down to the filesystem. With the filldir concept
1734 * it's not needed for correct information, but the XFS dir2 leaf
1735 * code wants an estimate of the buffer size to calculate its
1736 * readahead window and size the buffers used for mapping to
1737 * physical blocks.
1738 *
1739 * Try to give it an estimate that's good enough, maybe at some
1740 * point we can change the ->readdir prototype to include the
1741 * buffer size. For now we use the current glibc buffer size.
1742 */
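	/*
	 * Clamp the estimate to the directory's on-disk size so that small
	 * directories don't claim a buffer larger than the directory itself.
	 */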
1743 bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);
1744
1745 return xfs_readdir(NULL, ip, ctx, bufsize);
1746 }
1747
1748 STATIC loff_t
1749 xfs_file_llseek(
1750 struct file *file,
1751 loff_t offset,
1752 int whence)
1753 {
1754 struct inode *inode = file->f_mapping->host;
1755
1756 if (xfs_is_shutdown(XFS_I(inode)->i_mount))
1757 return -EIO;
1758
1759 switch (whence) {
1760 default:
1761 return generic_file_llseek(file, offset, whence);
1762 case SEEK_HOLE:
1763 offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
1764 break;
1765 case SEEK_DATA:
1766 offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
1767 break;
1768 }
1769
1770 if (offset < 0)
1771 return offset;
1772 return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1773 }
1774
1775 static inline vm_fault_t
1776 xfs_dax_fault_locked(
1777 struct vm_fault *vmf,
1778 unsigned int order,
1779 bool write_fault)
1780 {
1781 vm_fault_t ret;
1782 unsigned long pfn;
1783
1784 if (!IS_ENABLED(CONFIG_FS_DAX)) {
1785 ASSERT(0);
1786 return VM_FAULT_SIGBUS;
1787 }
1788 ret = dax_iomap_fault(vmf, order, &pfn, NULL,
1789 (write_fault && !vmf->cow_page) ?
1790 &xfs_dax_write_iomap_ops :
1791 &xfs_read_iomap_ops);
1792 if (ret & VM_FAULT_NEEDDSYNC)
1793 ret = dax_finish_sync_fault(vmf, order, pfn);
1794 return ret;
1795 }
1796
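/*
 * DAX read faults only need the shared MMAPLOCK to serialise against truncate
 * and other operations that take the MMAPLOCK exclusively.
 */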
static vm_fault_t
xfs_dax_read_fault(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
	vm_fault_t		ret;

	trace_xfs_read_fault(ip, order);

	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
	ret = xfs_dax_fault_locked(vmf, order, false);
	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);

	return ret;
}

/*
 * Locking for serialisation of IO during page faults.  This results in a lock
 * ordering of:
 *
 * mmap_lock (MM)
 *   sb_start_pagefault(vfs, freeze)
 *     invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
 *       page_lock (MM)
 *         i_lock (XFS - extent map serialisation)
 */
static vm_fault_t
__xfs_write_fault(
	struct vm_fault		*vmf,
	unsigned int		order,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = file_inode(vmf->vma->vm_file);
	struct xfs_inode	*ip = XFS_I(inode);
	unsigned int		lock_mode = XFS_MMAPLOCK_SHARED;
	vm_fault_t		ret;

	trace_xfs_write_fault(ip, order);

	sb_start_pagefault(inode->i_sb);
	file_update_time(vmf->vma->vm_file);

	/*
	 * Normally we only need the shared mmaplock, but if a reflink remap is
	 * in progress we take the exclusive lock to wait for the remap to
	 * finish before taking a write fault.
	 */
	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
	if (xfs_iflags_test(ip, XFS_IREMAPPING)) {
		xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
		lock_mode = XFS_MMAPLOCK_EXCL;
	}

	if (IS_DAX(inode))
		ret = xfs_dax_fault_locked(vmf, order, true);
	else
		ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
				ac);
	xfs_iunlock(ip, lock_mode);

	sb_end_pagefault(inode->i_sb);
	return ret;
}

static vm_fault_t
xfs_write_fault_zoned(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
	unsigned int		len = folio_size(page_folio(vmf->page));
	struct xfs_zone_alloc_ctx ac = { };
	int			error;
	vm_fault_t		ret;

	/*
	 * This could over-allocate as it doesn't check for truncation.
	 *
	 * But as the over-allocation is limited to less than a folio and will
	 * be released instantly, that's just fine.
	 */
	error = xfs_zoned_space_reserve(ip->i_mount,
			XFS_B_TO_FSB(ip->i_mount, len), 0, &ac);
	if (error < 0)
		return vmf_fs_error(error);
	ret = __xfs_write_fault(vmf, order, &ac);
	xfs_zoned_space_unreserve(ip->i_mount, &ac);
	return ret;
}

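/*
 * Write faults on zoned RT inodes need a space reservation held across the
 * fault, so dispatch them separately; everything else takes the plain write
 * fault path.
 */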
static vm_fault_t
xfs_write_fault(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file))))
		return xfs_write_fault_zoned(vmf, order);
	return __xfs_write_fault(vmf, order, NULL);
}

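/*
 * Only a write to a shared mapping is a write fault as far as the filesystem
 * is concerned; writes to MAP_PRIVATE mappings are satisfied from anonymous
 * CoW pages by the MM and never reach the file.
 */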
static inline bool
xfs_is_write_fault(
	struct vm_fault		*vmf)
{
	return (vmf->flags & FAULT_FLAG_WRITE) &&
	       (vmf->vma->vm_flags & VM_SHARED);
}

static vm_fault_t
xfs_filemap_fault(
	struct vm_fault		*vmf)
{
	struct inode		*inode = file_inode(vmf->vma->vm_file);

	/* DAX can shortcut the normal fault path on write faults! */
	if (IS_DAX(inode)) {
		if (xfs_is_write_fault(vmf))
			return xfs_write_fault(vmf, 0);
		return xfs_dax_read_fault(vmf, 0);
	}

	trace_xfs_read_fault(XFS_I(inode), 0);
	return filemap_fault(vmf);
}

static vm_fault_t
xfs_filemap_huge_fault(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	if (!IS_DAX(file_inode(vmf->vma->vm_file)))
		return VM_FAULT_FALLBACK;

	/* DAX can shortcut the normal fault path on write faults! */
	if (xfs_is_write_fault(vmf))
		return xfs_write_fault(vmf, order);
	return xfs_dax_read_fault(vmf, order);
}

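/*
 * A previously read-only page in a shared writable mapping is about to be
 * written to; treat it as an order-0 write fault.
 */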
static vm_fault_t
xfs_filemap_page_mkwrite(
	struct vm_fault		*vmf)
{
	return xfs_write_fault(vmf, 0);
}

/*
 * pfn_mkwrite was originally intended to ensure we capture time stamp updates
 * on write faults.  In reality, it needs to serialise against truncate and
 * prepare memory for writing, so handle it as a standard write fault.
 */
static vm_fault_t
xfs_filemap_pfn_mkwrite(
	struct vm_fault		*vmf)
{
	return xfs_write_fault(vmf, 0);
}

static const struct vm_operations_struct xfs_file_vm_ops = {
	.fault		= xfs_filemap_fault,
	.huge_fault	= xfs_filemap_huge_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= xfs_filemap_page_mkwrite,
	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
};

STATIC int
xfs_file_mmap_prepare(
	struct vm_area_desc	*desc)
{
	struct file		*file = desc->file;
	struct inode		*inode = file_inode(file);
	struct xfs_buftarg	*target = xfs_inode_buftarg(XFS_I(inode));

	/*
	 * We don't support synchronous mappings for non-DAX files, nor for
	 * DAX files if the underlying dax_device is not synchronous.
	 */
	if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file),
			target->bt_daxdev))
		return -EOPNOTSUPP;

	file_accessed(file);
	desc->vm_ops = &xfs_file_vm_ops;
	if (IS_DAX(inode))
		desc->vm_flags |= VM_HUGEPAGE;
	return 0;
}

const struct file_operations xfs_file_operations = {
	.llseek		= xfs_file_llseek,
	.read_iter	= xfs_file_read_iter,
	.write_iter	= xfs_file_write_iter,
	.splice_read	= xfs_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.iopoll		= iocb_bio_iopoll,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.mmap_prepare	= xfs_file_mmap_prepare,
	.open		= xfs_file_open,
	.release	= xfs_file_release,
	.fsync		= xfs_file_fsync,
	.get_unmapped_area = thp_get_unmapped_area,
	.fallocate	= xfs_file_fallocate,
	.fadvise	= xfs_file_fadvise,
	.remap_file_range = xfs_file_remap_range,
	.fop_flags	= FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
			  FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE |
			  FOP_DONTCACHE,
};

const struct file_operations xfs_dir_file_operations = {
	.open		= xfs_dir_open,
	.read		= generic_read_dir,
	.iterate_shared	= xfs_file_readdir,
	.llseek		= generic_file_llseek,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.fsync		= xfs_dir_fsync,
};
