1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4 * All Rights Reserved.
5 */
6 #include "xfs_platform.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_mount.h"
13 #include "xfs_inode.h"
14 #include "xfs_trans.h"
15 #include "xfs_inode_item.h"
16 #include "xfs_bmap.h"
17 #include "xfs_bmap_util.h"
18 #include "xfs_dir2.h"
19 #include "xfs_dir2_priv.h"
20 #include "xfs_ioctl.h"
21 #include "xfs_trace.h"
22 #include "xfs_log.h"
23 #include "xfs_icache.h"
24 #include "xfs_pnfs.h"
25 #include "xfs_iomap.h"
26 #include "xfs_reflink.h"
27 #include "xfs_file.h"
28 #include "xfs_aops.h"
29 #include "xfs_zone_alloc.h"
30 #include "xfs_error.h"
31 #include "xfs_errortag.h"
32
33 #include <linux/dax.h>
34 #include <linux/falloc.h>
35 #include <linux/backing-dev.h>
36 #include <linux/mman.h>
37 #include <linux/fadvise.h>
38 #include <linux/mount.h>
39 #include <linux/filelock.h>
40
41 static const struct vm_operations_struct xfs_file_vm_ops;
42
43 /*
44 * Decide if the given file range is aligned to the size of the fundamental
45 * allocation unit for the file.
46 */
47 bool
xfs_is_falloc_aligned(struct xfs_inode * ip,loff_t pos,long long int len)48 xfs_is_falloc_aligned(
49 struct xfs_inode *ip,
50 loff_t pos,
51 long long int len)
52 {
53 unsigned int alloc_unit = xfs_inode_alloc_unitsize(ip);
54
55 if (!is_power_of_2(alloc_unit))
56 return isaligned_64(pos, alloc_unit) &&
57 isaligned_64(len, alloc_unit);
58
59 return !((pos | len) & (alloc_unit - 1));
60 }
61
62 /*
63 * Fsync operations on directories are much simpler than on regular files,
64 * as there is no file data to flush, and thus also no need for explicit
65 * cache flush operations, and there are no non-transaction metadata updates
66 * on directories either.
67 */
68 STATIC int
xfs_dir_fsync(struct file * file,loff_t start,loff_t end,int datasync)69 xfs_dir_fsync(
70 struct file *file,
71 loff_t start,
72 loff_t end,
73 int datasync)
74 {
75 struct xfs_inode *ip = XFS_I(file->f_mapping->host);
76
77 trace_xfs_dir_fsync(ip);
78 return xfs_log_force_inode(ip);
79 }
80
81 /*
82 * All metadata updates are logged, which means that we just have to push the
83 * journal to the required sequence number than holds the updates. We track
84 * datasync commits separately to full sync commits, and hence only need to
85 * select the correct sequence number for the log force here.
86 *
87 * We don't have to serialise against concurrent modifications, as we do not
88 * have to wait for modifications that have not yet completed. We define a
89 * transaction commit as completing when the commit sequence number is updated,
90 * hence if the sequence number has not updated, the sync operation has been
91 * run before the commit completed and we don't have to wait for it.
92 *
93 * If we have concurrent fsync/fdatasync() calls, the sequence numbers remain
94 * set on the log item until - at least - the journal flush completes. In
95 * reality, they are only cleared when the inode is fully unpinned (i.e.
96 * persistent in the journal and not dirty in the CIL), and so we rely on
97 * xfs_log_force_seq() either skipping sequences that have been persisted or
98 * waiting on sequences that are still in flight to correctly order concurrent
99 * sync operations.
100 */
101 static int
xfs_fsync_flush_log(struct xfs_inode * ip,bool datasync,int * log_flushed)102 xfs_fsync_flush_log(
103 struct xfs_inode *ip,
104 bool datasync,
105 int *log_flushed)
106 {
107 struct xfs_inode_log_item *iip = ip->i_itemp;
108 xfs_csn_t seq = 0;
109
110 spin_lock(&iip->ili_lock);
111 if (datasync)
112 seq = iip->ili_datasync_seq;
113 else
114 seq = iip->ili_commit_seq;
115 spin_unlock(&iip->ili_lock);
116
117 if (!seq)
118 return 0;
119
120 return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
121 log_flushed);
122 }
123
/*
 * fsync for regular files: write back dirty page cache data, flush the
 * relevant device write caches, and force the journal far enough to make
 * this inode's logged changes persistent.
 */
STATIC int
xfs_file_fsync(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			datasync)
{
	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
	struct xfs_mount	*mp = ip->i_mount;
	int			error, err2;
	int			log_flushed = 0;

	trace_xfs_file_fsync(ip);

	/* Write back and wait on the dirty data range first. */
	error = file_write_and_wait_range(file, start, end);
	if (error)
		return error;

	if (xfs_is_shutdown(mp))
		return -EIO;

	xfs_iflags_clear(ip, XFS_ITRUNCATED);

	/*
	 * If we have an RT and/or log subvolume we need to make sure to flush
	 * the write cache the device used for file data first.  This is to
	 * ensure newly written file data make it to disk before logging the new
	 * inode size in case of an extending write.
	 */
	if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp)
		error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
	else if (mp->m_logdev_targp != mp->m_ddev_targp)
		error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);

	/*
	 * If the inode has an inode log item attached, it may need the journal
	 * flushed to persist any changes the log item might be tracking.
	 */
	if (ip->i_itemp) {
		err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
		/* keep the first error, but don't lose later ones either */
		if (err2 && !error)
			error = err2;
	}

	/*
	 * If we only have a single device, and the log force above was
	 * a no-op we might have to flush the data device cache here.
	 * This can only happen for fdatasync/O_DSYNC if we were overwriting
	 * an already allocated file and thus do not have any metadata to
	 * commit.
	 */
	if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
	    mp->m_logdev_targp == mp->m_ddev_targp) {
		err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
		if (err2 && !error)
			error = err2;
	}

	return error;
}
184
185 static int
xfs_ilock_iocb(struct kiocb * iocb,unsigned int lock_mode)186 xfs_ilock_iocb(
187 struct kiocb *iocb,
188 unsigned int lock_mode)
189 {
190 struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
191
192 if (iocb->ki_flags & IOCB_NOWAIT) {
193 if (!xfs_ilock_nowait(ip, lock_mode))
194 return -EAGAIN;
195 } else {
196 xfs_ilock(ip, lock_mode);
197 }
198
199 return 0;
200 }
201
202 static int
xfs_ilock_iocb_for_write(struct kiocb * iocb,unsigned int * lock_mode)203 xfs_ilock_iocb_for_write(
204 struct kiocb *iocb,
205 unsigned int *lock_mode)
206 {
207 ssize_t ret;
208 struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
209
210 ret = xfs_ilock_iocb(iocb, *lock_mode);
211 if (ret)
212 return ret;
213
214 /*
215 * If a reflink remap is in progress we always need to take the iolock
216 * exclusively to wait for it to finish.
217 */
218 if (*lock_mode == XFS_IOLOCK_SHARED &&
219 xfs_iflags_test(ip, XFS_IREMAPPING)) {
220 xfs_iunlock(ip, *lock_mode);
221 *lock_mode = XFS_IOLOCK_EXCL;
222 return xfs_ilock_iocb(iocb, *lock_mode);
223 }
224
225 return 0;
226 }
227
/*
 * Bounce buffering dio reads need a user context to copy back the data.
 * Use an ioend to provide that.
 */
static void
xfs_dio_read_bounce_submit_io(
	const struct iomap_iter	*iter,
	struct bio		*bio,
	loff_t			file_offset)
{
	/* Attach an ioend so completion is processed via xfs_end_bio. */
	iomap_init_ioend(iter->inode, bio, file_offset, IOMAP_IOEND_DIRECT);
	bio->bi_end_io = xfs_end_bio;
	submit_bio(bio);
}
242
/* dio read ops routing completion through an ioend for bounce buffer copyback */
static const struct iomap_dio_ops xfs_dio_read_bounce_ops = {
	.submit_io	= xfs_dio_read_bounce_submit_io,
	.bio_set	= &iomap_ioend_bioset,
};
247
248 STATIC ssize_t
xfs_file_dio_read(struct kiocb * iocb,struct iov_iter * to)249 xfs_file_dio_read(
250 struct kiocb *iocb,
251 struct iov_iter *to)
252 {
253 struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
254 unsigned int dio_flags = 0;
255 const struct iomap_dio_ops *dio_ops = NULL;
256 ssize_t ret;
257
258 trace_xfs_file_direct_read(iocb, to);
259
260 if (!iov_iter_count(to))
261 return 0; /* skip atime */
262
263 file_accessed(iocb->ki_filp);
264
265 ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
266 if (ret)
267 return ret;
268 if (mapping_stable_writes(iocb->ki_filp->f_mapping)) {
269 dio_ops = &xfs_dio_read_bounce_ops;
270 dio_flags |= IOMAP_DIO_BOUNCE;
271 }
272 ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, dio_ops, dio_flags,
273 NULL, 0);
274 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
275
276 return ret;
277 }
278
279 static noinline ssize_t
xfs_file_dax_read(struct kiocb * iocb,struct iov_iter * to)280 xfs_file_dax_read(
281 struct kiocb *iocb,
282 struct iov_iter *to)
283 {
284 struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
285 ssize_t ret = 0;
286
287 trace_xfs_file_dax_read(iocb, to);
288
289 if (!iov_iter_count(to))
290 return 0; /* skip atime */
291
292 ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
293 if (ret)
294 return ret;
295 ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
296 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
297
298 file_accessed(iocb->ki_filp);
299 return ret;
300 }
301
302 STATIC ssize_t
xfs_file_buffered_read(struct kiocb * iocb,struct iov_iter * to)303 xfs_file_buffered_read(
304 struct kiocb *iocb,
305 struct iov_iter *to)
306 {
307 struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
308 ssize_t ret;
309
310 trace_xfs_file_buffered_read(iocb, to);
311
312 ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
313 if (ret)
314 return ret;
315 ret = generic_file_read_iter(iocb, to);
316 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
317
318 return ret;
319 }
320
321 STATIC ssize_t
xfs_file_read_iter(struct kiocb * iocb,struct iov_iter * to)322 xfs_file_read_iter(
323 struct kiocb *iocb,
324 struct iov_iter *to)
325 {
326 struct inode *inode = file_inode(iocb->ki_filp);
327 struct xfs_mount *mp = XFS_I(inode)->i_mount;
328 ssize_t ret = 0;
329
330 XFS_STATS_INC(mp, xs_read_calls);
331
332 if (xfs_is_shutdown(mp))
333 return -EIO;
334
335 if (IS_DAX(inode))
336 ret = xfs_file_dax_read(iocb, to);
337 else if (iocb->ki_flags & IOCB_DIRECT)
338 ret = xfs_file_dio_read(iocb, to);
339 else
340 ret = xfs_file_buffered_read(iocb, to);
341
342 if (ret > 0)
343 XFS_STATS_ADD(mp, xs_read_bytes, ret);
344 return ret;
345 }
346
347 STATIC ssize_t
xfs_file_splice_read(struct file * in,loff_t * ppos,struct pipe_inode_info * pipe,size_t len,unsigned int flags)348 xfs_file_splice_read(
349 struct file *in,
350 loff_t *ppos,
351 struct pipe_inode_info *pipe,
352 size_t len,
353 unsigned int flags)
354 {
355 struct inode *inode = file_inode(in);
356 struct xfs_inode *ip = XFS_I(inode);
357 struct xfs_mount *mp = ip->i_mount;
358 ssize_t ret = 0;
359
360 XFS_STATS_INC(mp, xs_read_calls);
361
362 if (xfs_is_shutdown(mp))
363 return -EIO;
364
365 trace_xfs_file_splice_read(ip, *ppos, len);
366
367 xfs_ilock(ip, XFS_IOLOCK_SHARED);
368 ret = filemap_splice_read(in, ppos, pipe, len, flags);
369 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
370 if (ret > 0)
371 XFS_STATS_ADD(mp, xs_read_bytes, ret);
372 return ret;
373 }
374
/*
 * Take care of zeroing post-EOF blocks when they might exist.
 *
 * Returns 0 on success, a negative error for a failure, or 1 if this
 * function dropped the iolock and reacquired it exclusively and the caller
 * needs to restart the write sanity checks.
 */
static ssize_t
xfs_file_write_zero_eof(
	struct kiocb		*iocb,
	struct iov_iter		*from,
	unsigned int		*iolock,
	size_t			count,
	bool			*drained_dio,
	struct xfs_zone_alloc_ctx *ac)
{
	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
	loff_t			isize;
	int			error;

	/*
	 * We need to serialise against EOF updates that occur in IO completions
	 * here. We want to make sure that nobody is changing the size while
	 * we do this check until we have placed an IO barrier (i.e. hold
	 * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The
	 * spinlock effectively forms a memory barrier once we have
	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
	 * hence be able to correctly determine if we need to run zeroing.
	 */
	spin_lock(&ip->i_flags_lock);
	isize = i_size_read(VFS_I(ip));
	if (iocb->ki_pos <= isize) {
		/* Write starts at or before EOF: no zeroing required. */
		spin_unlock(&ip->i_flags_lock);
		return 0;
	}
	spin_unlock(&ip->i_flags_lock);

	/* Zeroing can block on locks and I/O, so NOWAIT writes must bail. */
	if (iocb->ki_flags & IOCB_NOWAIT)
		return -EAGAIN;

	if (!*drained_dio) {
		/*
		 * If zeroing is needed and we are currently holding the iolock
		 * shared, we need to update it to exclusive which implies
		 * having to redo all checks before.
		 */
		if (*iolock == XFS_IOLOCK_SHARED) {
			xfs_iunlock(ip, *iolock);
			*iolock = XFS_IOLOCK_EXCL;
			xfs_ilock(ip, *iolock);
			/* undo any iterator consumption before restarting */
			iov_iter_reexpand(from, count);
		}

		/*
		 * We now have an IO submission barrier in place, but AIO can do
		 * EOF updates during IO completion and hence we now need to
		 * wait for all of them to drain. Non-AIO DIO will have drained
		 * before we are given the XFS_IOLOCK_EXCL, and so for most
		 * cases this wait is a no-op.
		 */
		inode_dio_wait(VFS_I(ip));
		*drained_dio = true;
		return 1;	/* tell the caller to redo its checks */
	}

	trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);

	/* Zero the range between the old EOF and the start of the write. */
	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
	error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL);
	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);

	return error;
}
448
/*
 * Common pre-write limit and setup checks.
 *
 * Called with the iolock held either shared or exclusive according to
 * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
 * if called for a direct write beyond i_size.  If the exclusive re-lock
 * fails, *@iolock is set to zero so the caller knows the lock is no longer
 * held.
 */
STATIC ssize_t
xfs_file_write_checks(
	struct kiocb		*iocb,
	struct iov_iter		*from,
	unsigned int		*iolock,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	size_t			count = iov_iter_count(from);
	bool			drained_dio = false;
	ssize_t			error;

restart:
	/* Generic VFS limit checks (rlimit, LFS bounds, O_APPEND position). */
	error = generic_write_checks(iocb, from);
	if (error <= 0)
		return error;

	/* Break pNFS layouts; map -EWOULDBLOCK to -EAGAIN for NOWAIT I/O. */
	if (iocb->ki_flags & IOCB_NOWAIT) {
		error = break_layout(inode, false);
		if (error == -EWOULDBLOCK)
			error = -EAGAIN;
	} else {
		error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
	}

	if (error)
		return error;

	/*
	 * For changing security info in file_remove_privs() we need i_rwsem
	 * exclusively.
	 */
	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
		xfs_iunlock(XFS_I(inode), *iolock);
		*iolock = XFS_IOLOCK_EXCL;
		error = xfs_ilock_iocb(iocb, *iolock);
		if (error) {
			/* lock not held on return; signal that to the caller */
			*iolock = 0;
			return error;
		}
		goto restart;
	}

	/*
	 * If the offset is beyond the size of the file, we need to zero all
	 * blocks that fall between the existing EOF and the start of this
	 * write.
	 *
	 * We can do an unlocked check for i_size here safely as I/O completion
	 * can only extend EOF. Truncate is locked out at this point, so the
	 * EOF can not move backwards, only forwards. Hence we only need to take
	 * the slow path when we are at or beyond the current EOF.
	 */
	if (iocb->ki_pos > i_size_read(inode)) {
		/* A return of 1 means the iolock was cycled: redo all checks. */
		error = xfs_file_write_zero_eof(iocb, from, iolock, count,
				&drained_dio, ac);
		if (error == 1)
			goto restart;
		if (error)
			return error;
	}

	return kiocb_modified(iocb);
}
520
521 static ssize_t
xfs_zoned_write_space_reserve(struct xfs_mount * mp,struct kiocb * iocb,struct iov_iter * from,unsigned int flags,struct xfs_zone_alloc_ctx * ac)522 xfs_zoned_write_space_reserve(
523 struct xfs_mount *mp,
524 struct kiocb *iocb,
525 struct iov_iter *from,
526 unsigned int flags,
527 struct xfs_zone_alloc_ctx *ac)
528 {
529 loff_t count = iov_iter_count(from);
530 int error;
531
532 if (iocb->ki_flags & IOCB_NOWAIT)
533 flags |= XFS_ZR_NOWAIT;
534
535 /*
536 * Check the rlimit and LFS boundary first so that we don't over-reserve
537 * by possibly a lot.
538 *
539 * The generic write path will redo this check later, and it might have
540 * changed by then. If it got expanded we'll stick to our earlier
541 * smaller limit, and if it is decreased the new smaller limit will be
542 * used and our extra space reservation will be returned after finishing
543 * the write.
544 */
545 error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count);
546 if (error)
547 return error;
548
549 /*
550 * Sloppily round up count to file system blocks.
551 *
552 * This will often reserve an extra block, but that avoids having to look
553 * at the start offset, which isn't stable for O_APPEND until taking the
554 * iolock. Also we need to reserve a block each for zeroing the old
555 * EOF block and the new start block if they are unaligned.
556 *
557 * Any remaining block will be returned after the write.
558 */
559 return xfs_zoned_space_reserve(mp, XFS_B_TO_FSB(mp, count) + 1 + 2,
560 flags, ac);
561 }
562
/*
 * Direct I/O write completion: finish COW remapping and unwritten extent
 * conversion for the range just written, and extend the in-core/on-disk
 * inode size if the write went beyond the old EOF.
 */
static int
xfs_dio_write_end_io(
	struct kiocb		*iocb,
	ssize_t			size,
	int			error,
	unsigned		flags)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_inode	*ip = XFS_I(inode);
	loff_t			offset = iocb->ki_pos;
	unsigned int		nofs_flag;

	/* Zoned inodes must not see unwritten or COW mappings here. */
	ASSERT(!xfs_is_zoned_inode(ip) ||
	       !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW)));

	trace_xfs_end_io_direct_write(ip, offset, size);

	if (xfs_is_shutdown(ip->i_mount))
		return -EIO;

	if (error)
		return error;
	if (!size)
		return 0;

	/*
	 * Capture amount written on completion as we can't reliably account
	 * for it on submission.
	 */
	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);

	/*
	 * We can allocate memory here while doing writeback on behalf of
	 * memory reclaim.  To avoid memory allocation deadlocks set the
	 * task-wide nofs context for the following operations.
	 */
	nofs_flag = memalloc_nofs_save();

	if (flags & IOMAP_DIO_COW) {
		if (iocb->ki_flags & IOCB_ATOMIC)
			error = xfs_reflink_end_atomic_cow(ip, offset, size);
		else
			error = xfs_reflink_end_cow(ip, offset, size);
		if (error)
			goto out;
	}

	/*
	 * Unwritten conversion updates the in-core isize after extent
	 * conversion but before updating the on-disk size. Updating isize any
	 * earlier allows a racing dio read to find unwritten extents before
	 * they are converted.
	 */
	if (flags & IOMAP_DIO_UNWRITTEN) {
		error = xfs_iomap_write_unwritten(ip, offset, size, true);
		goto out;
	}

	/*
	 * We need to update the in-core inode size here so that we don't end up
	 * with the on-disk inode size being outside the in-core inode size. We
	 * have no other method of updating EOF for AIO, so always do it here
	 * if necessary.
	 *
	 * We need to lock the test/set EOF update as we can be racing with
	 * other IO completions here to update the EOF. Failing to serialise
	 * here can result in EOF moving backwards and Bad Things Happen when
	 * that occurs.
	 *
	 * As IO completion only ever extends EOF, we can do an unlocked check
	 * here to avoid taking the spinlock. If we land within the current EOF,
	 * then we do not need to do an extending update at all, and we don't
	 * need to take the lock to check this. If we race with an update moving
	 * EOF, then we'll either still be beyond EOF and need to take the lock,
	 * or we'll be within EOF and we don't need to take it at all.
	 */
	if (offset + size <= i_size_read(inode))
		goto out;

	spin_lock(&ip->i_flags_lock);
	if (offset + size > i_size_read(inode)) {
		i_size_write(inode, offset + size);
		spin_unlock(&ip->i_flags_lock);
		error = xfs_setfilesize(ip, offset, size);
	} else {
		spin_unlock(&ip->i_flags_lock);
	}

out:
	memalloc_nofs_restore(nofs_flag);
	return error;
}
655
/* Completion handling for regular (non-zoned) direct I/O writes. */
static const struct iomap_dio_ops xfs_dio_write_ops = {
	.end_io		= xfs_dio_write_end_io,
};
659
/*
 * Submit a direct write bio to a zoned device: charge the bio against the
 * zone space reservation made before the write started, then hand it to the
 * zone allocator for placement and submission.
 */
static void
xfs_dio_zoned_submit_io(
	const struct iomap_iter	*iter,
	struct bio		*bio,
	loff_t			file_offset)
{
	struct xfs_mount	*mp = XFS_I(iter->inode)->i_mount;
	struct xfs_zone_alloc_ctx *ac = iter->private;
	xfs_filblks_t		count_fsb;
	struct iomap_ioend	*ioend;

	/*
	 * Consuming more blocks than were reserved indicates in-core
	 * accounting corruption: shut down and fail the bio.
	 */
	count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size);
	if (count_fsb > ac->reserved_blocks) {
		xfs_err(mp,
"allocation (%lld) larger than reservation (%lld).",
			count_fsb, ac->reserved_blocks);
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
		bio_io_error(bio);
		return;
	}
	ac->reserved_blocks -= count_fsb;

	/* Complete through xfs_end_bio via an ioend. */
	bio->bi_end_io = xfs_end_bio;
	ioend = iomap_init_ioend(iter->inode, bio, file_offset,
			IOMAP_IOEND_DIRECT);
	xfs_zone_alloc_and_submit(ioend, &ac->open_zone);
}
687
/* Submission and completion handling for zoned direct I/O writes. */
static const struct iomap_dio_ops xfs_dio_zoned_write_ops = {
	.bio_set	= &iomap_ioend_bioset,
	.submit_io	= xfs_dio_zoned_submit_io,
	.end_io		= xfs_dio_write_end_io,
};
693
/*
 * Handle block aligned direct I/O writes.
 *
 * @ops/@dops select the regular or zoned iomap implementations; @ac carries
 * the zone space reservation for zoned writes and is NULL otherwise.
 */
static noinline ssize_t
xfs_file_dio_write_aligned(
	struct xfs_inode	*ip,
	struct kiocb		*iocb,
	struct iov_iter		*from,
	const struct iomap_ops	*ops,
	const struct iomap_dio_ops *dops,
	struct xfs_zone_alloc_ctx *ac)
{
	unsigned int		iolock = XFS_IOLOCK_SHARED;
	unsigned int		dio_flags = 0;
	ssize_t			ret;

	/*
	 * For always COW inodes, each bio must be aligned to the file system
	 * block size and not just the device sector size because we need to
	 * allocate a block-aligned amount of space for each write.
	 */
	if (xfs_is_always_cow_inode(ip))
		dio_flags |= IOMAP_DIO_FSBLOCK_ALIGNED;

	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
	if (ret)
		return ret;
	ret = xfs_file_write_checks(iocb, from, &iolock, ac);
	if (ret)
		goto out_unlock;

	/*
	 * We don't need to hold the IOLOCK exclusively across the IO, so demote
	 * the iolock back to shared if we had to take the exclusive lock in
	 * xfs_file_write_checks() for other reasons.
	 */
	if (iolock == XFS_IOLOCK_EXCL) {
		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
		iolock = XFS_IOLOCK_SHARED;
	}
	if (mapping_stable_writes(iocb->ki_filp->f_mapping))
		dio_flags |= IOMAP_DIO_BOUNCE;
	trace_xfs_file_direct_write(iocb, from);
	ret = iomap_dio_rw(iocb, from, ops, dops, dio_flags, ac, 0);
out_unlock:
	xfs_iunlock(ip, iolock);
	return ret;
}
742
743 /*
744 * Handle block aligned direct I/O writes to zoned devices.
745 */
746 static noinline ssize_t
xfs_file_dio_write_zoned(struct xfs_inode * ip,struct kiocb * iocb,struct iov_iter * from)747 xfs_file_dio_write_zoned(
748 struct xfs_inode *ip,
749 struct kiocb *iocb,
750 struct iov_iter *from)
751 {
752 struct xfs_zone_alloc_ctx ac = { };
753 ssize_t ret;
754
755 ret = xfs_zoned_write_space_reserve(ip->i_mount, iocb, from, 0, &ac);
756 if (ret < 0)
757 return ret;
758 ret = xfs_file_dio_write_aligned(ip, iocb, from,
759 &xfs_zoned_direct_write_iomap_ops,
760 &xfs_dio_zoned_write_ops, &ac);
761 xfs_zoned_space_unreserve(ip->i_mount, &ac);
762 return ret;
763 }
764
/*
 * Handle block atomic writes
 *
 * Two methods of atomic writes are supported:
 * - REQ_ATOMIC-based, which would typically use some form of HW offload in the
 *   disk
 * - COW-based, which uses a COW fork as a staging extent for data updates
 *   before atomically updating extent mappings for the range being written
 *
 */
static noinline ssize_t
xfs_file_dio_write_atomic(
	struct xfs_inode	*ip,
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	unsigned int		iolock = XFS_IOLOCK_SHARED;
	ssize_t			ret, ocount = iov_iter_count(from);
	unsigned int		dio_flags = 0;
	const struct iomap_ops	*dops;

	/*
	 * HW offload should be faster, so try that first if it is already
	 * known that the write length is not too large.
	 */
	if (ocount > xfs_inode_buftarg(ip)->bt_awu_max)
		dops = &xfs_atomic_write_cow_iomap_ops;
	else
		dops = &xfs_direct_write_iomap_ops;

retry:
	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
	if (ret)
		return ret;

	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
	if (ret)
		goto out_unlock;

	/* Demote similar to xfs_file_dio_write_aligned() */
	if (iolock == XFS_IOLOCK_EXCL) {
		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
		iolock = XFS_IOLOCK_SHARED;
	}

	trace_xfs_file_direct_write(iocb, from);
	if (mapping_stable_writes(iocb->ki_filp->f_mapping))
		dio_flags |= IOMAP_DIO_BOUNCE;
	ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops, dio_flags,
			NULL, 0);

	/*
	 * The retry mechanism is based on the ->iomap_begin method returning
	 * -ENOPROTOOPT, which would be when the REQ_ATOMIC-based write is not
	 * possible.  The REQ_ATOMIC-based method would typically not be
	 * possible if the write spans multiple extents or the disk blocks are
	 * misaligned.
	 */
	if (ret == -ENOPROTOOPT && dops == &xfs_direct_write_iomap_ops) {
		xfs_iunlock(ip, iolock);
		dops = &xfs_atomic_write_cow_iomap_ops;
		goto retry;
	}

out_unlock:
	/* iolock may be zero if xfs_file_write_checks() dropped it on error */
	if (iolock)
		xfs_iunlock(ip, iolock);
	return ret;
}
833
/*
 * Handle block unaligned direct I/O writes
 *
 * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
 * them to be done in parallel with reads and other direct I/O writes. However,
 * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
 * to do sub-block zeroing and that requires serialisation against other direct
 * I/O to the same block. In this case we need to serialise the submission of
 * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
 * In the case where sub-block zeroing is not required, we can do concurrent
 * sub-block dios to the same block successfully.
 *
 * Optimistically submit the I/O using the shared lock first, but use the
 * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
 * if block allocation or partial block zeroing would be required. In that case
 * we try again with the exclusive lock.
 */
static noinline ssize_t
xfs_file_dio_write_unaligned(
	struct xfs_inode	*ip,
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	size_t			isize = i_size_read(VFS_I(ip));
	size_t			count = iov_iter_count(from);
	unsigned int		iolock = XFS_IOLOCK_SHARED;
	unsigned int		flags = IOMAP_DIO_OVERWRITE_ONLY;
	ssize_t			ret;

	/*
	 * Extending writes need exclusivity because of the sub-block zeroing
	 * that the DIO code always does for partial tail blocks beyond EOF, so
	 * don't even bother trying the fast path in this case.
	 */
	if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
		/* The exclusive path blocks, which NOWAIT I/O must not do. */
		if (iocb->ki_flags & IOCB_NOWAIT)
			return -EAGAIN;
retry_exclusive:
		iolock = XFS_IOLOCK_EXCL;
		flags = IOMAP_DIO_FORCE_WAIT;
	}

	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
	if (ret)
		return ret;

	/*
	 * We can't properly handle unaligned direct I/O to reflink files yet,
	 * as we can't unshare a partial block.
	 */
	if (xfs_is_cow_inode(ip)) {
		trace_xfs_reflink_bounce_dio_write(iocb, from);
		/* NOTE(review): -ENOTBLK presumably makes the caller fall
		 * back to buffered I/O - confirm against the write path. */
		ret = -ENOTBLK;
		goto out_unlock;
	}

	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
	if (ret)
		goto out_unlock;

	/*
	 * If we are doing exclusive unaligned I/O, this must be the only I/O
	 * in-flight. Otherwise we risk data corruption due to unwritten extent
	 * conversions from the AIO end_io handler. Wait for all other I/O to
	 * drain first.
	 */
	if (flags & IOMAP_DIO_FORCE_WAIT)
		inode_dio_wait(VFS_I(ip));

	if (mapping_stable_writes(iocb->ki_filp->f_mapping))
		flags |= IOMAP_DIO_BOUNCE;

	trace_xfs_file_direct_write(iocb, from);
	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
			&xfs_dio_write_ops, flags, NULL, 0);

	/*
	 * Retry unaligned I/O with exclusive blocking semantics if the DIO
	 * layer rejected it for mapping or locking reasons. If we are doing
	 * nonblocking user I/O, propagate the error.
	 */
	if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
		ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
		xfs_iunlock(ip, iolock);
		goto retry_exclusive;
	}

out_unlock:
	/* iolock may be zero if xfs_file_write_checks() dropped it on error */
	if (iolock)
		xfs_iunlock(ip, iolock);
	return ret;
}
926
927 static ssize_t
xfs_file_dio_write(struct kiocb * iocb,struct iov_iter * from)928 xfs_file_dio_write(
929 struct kiocb *iocb,
930 struct iov_iter *from)
931 {
932 struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
933 struct xfs_buftarg *target = xfs_inode_buftarg(ip);
934 size_t count = iov_iter_count(from);
935
936 /* direct I/O must be aligned to device logical sector size */
937 if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
938 return -EINVAL;
939
940 if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
941 return xfs_file_dio_write_unaligned(ip, iocb, from);
942 if (xfs_is_zoned_inode(ip))
943 return xfs_file_dio_write_zoned(ip, iocb, from);
944 if (iocb->ki_flags & IOCB_ATOMIC)
945 return xfs_file_dio_write_atomic(ip, iocb, from);
946 return xfs_file_dio_write_aligned(ip, iocb, from,
947 &xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL);
948 }
949
/*
 * DAX write path: always takes the iolock exclusively, and updates the
 * on-disk file size directly for extending writes since there is no I/O
 * completion handler to do it for us.
 */
static noinline ssize_t
xfs_file_dax_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	unsigned int		iolock = XFS_IOLOCK_EXCL;
	ssize_t			ret, error = 0;
	loff_t			pos;

	ret = xfs_ilock_iocb(iocb, iolock);
	if (ret)
		return ret;
	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
	if (ret)
		goto out;

	/* Remember the start offset for the file size update below. */
	pos = iocb->ki_pos;

	trace_xfs_file_dax_write(iocb, from);
	ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
		/* Extend the in-core size and log the new on-disk size. */
		i_size_write(inode, iocb->ki_pos);
		error = xfs_setfilesize(ip, pos, ret);
	}
out:
	/* iolock may be zero if xfs_file_write_checks() dropped it on error */
	if (iolock)
		xfs_iunlock(ip, iolock);
	if (error)
		return error;

	if (ret > 0) {
		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);

		/* Handle various SYNC-type writes */
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}
990
/*
 * Buffered write path: writes through the page cache with the iolock held
 * exclusively, retrying once after freeing quota or filesystem space when
 * the write fails with -EDQUOT or -ENOSPC.
 */
STATIC ssize_t
xfs_file_buffered_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	bool			cleared_space = false;
	unsigned int		iolock;

write_retry:
	iolock = XFS_IOLOCK_EXCL;
	ret = xfs_ilock_iocb(iocb, iolock);
	if (ret)
		return ret;

	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
	if (ret)
		goto out;

	trace_xfs_file_buffered_write(iocb, from);
	ret = iomap_file_buffered_write(iocb, from,
			&xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
			NULL);

	/*
	 * If we hit a space limit, try to free up some lingering preallocated
	 * space before returning an error. In the case of ENOSPC, first try to
	 * write back all dirty inodes to free up some of the excess reserved
	 * metadata space. This reduces the chances that the eofblocks scan
	 * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
	 * also behaves as a filter to prevent too many eofblocks scans from
	 * running at the same time. Use a synchronous scan to increase the
	 * effectiveness of the scan.
	 */
	if (ret == -EDQUOT && !cleared_space) {
		xfs_iunlock(ip, iolock);
		xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
		cleared_space = true;
		goto write_retry;
	} else if (ret == -ENOSPC && !cleared_space) {
		struct xfs_icwalk	icw = {0};

		/* only retry once so a persistent ENOSPC is reported */
		cleared_space = true;
		xfs_flush_inodes(ip->i_mount);

		xfs_iunlock(ip, iolock);
		icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
		xfs_blockgc_free_space(ip->i_mount, &icw);
		goto write_retry;
	}

out:
	/* iolock may be zero if xfs_file_write_checks() dropped it on error */
	if (iolock)
		xfs_iunlock(ip, iolock);

	if (ret > 0) {
		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
		/* Handle various SYNC-type writes */
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}
1055
1056 STATIC ssize_t
xfs_file_buffered_write_zoned(struct kiocb * iocb,struct iov_iter * from)1057 xfs_file_buffered_write_zoned(
1058 struct kiocb *iocb,
1059 struct iov_iter *from)
1060 {
1061 struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
1062 struct xfs_mount *mp = ip->i_mount;
1063 unsigned int iolock = XFS_IOLOCK_EXCL;
1064 bool cleared_space = false;
1065 struct xfs_zone_alloc_ctx ac = { };
1066 ssize_t ret;
1067
1068 ret = xfs_zoned_write_space_reserve(mp, iocb, from, XFS_ZR_GREEDY, &ac);
1069 if (ret < 0)
1070 return ret;
1071
1072 ret = xfs_ilock_iocb(iocb, iolock);
1073 if (ret)
1074 goto out_unreserve;
1075
1076 ret = xfs_file_write_checks(iocb, from, &iolock, &ac);
1077 if (ret)
1078 goto out_unlock;
1079
1080 /*
1081 * Truncate the iter to the length that we were actually able to
1082 * allocate blocks for. This needs to happen after
1083 * xfs_file_write_checks, because that assigns ki_pos for O_APPEND
1084 * writes.
1085 */
1086 iov_iter_truncate(from,
1087 XFS_FSB_TO_B(mp, ac.reserved_blocks) -
1088 (iocb->ki_pos & mp->m_blockmask));
1089 if (!iov_iter_count(from))
1090 goto out_unlock;
1091
1092 retry:
1093 trace_xfs_file_buffered_write(iocb, from);
1094 ret = iomap_file_buffered_write(iocb, from,
1095 &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
1096 &ac);
1097 if (ret == -ENOSPC && !cleared_space) {
1098 /*
1099 * Kick off writeback to convert delalloc space and release the
1100 * usually too pessimistic indirect block reservations.
1101 */
1102 xfs_flush_inodes(mp);
1103 cleared_space = true;
1104 goto retry;
1105 }
1106
1107 out_unlock:
1108 xfs_iunlock(ip, iolock);
1109 out_unreserve:
1110 xfs_zoned_space_unreserve(ip->i_mount, &ac);
1111 if (ret > 0) {
1112 XFS_STATS_ADD(mp, xs_write_bytes, ret);
1113 ret = generic_write_sync(iocb, ret);
1114 }
1115 return ret;
1116 }
1117
/*
 * ->write_iter: validate the request, then dispatch to the DAX, direct I/O
 * or buffered write path.
 */
STATIC ssize_t
xfs_file_write_iter(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	size_t			ocount = iov_iter_count(from);

	XFS_STATS_INC(ip->i_mount, xs_write_calls);

	if (ocount == 0)
		return 0;

	if (xfs_is_shutdown(ip->i_mount))
		return -EIO;

	/* Atomic writes must be within the [min, max] sizes for this inode. */
	if (iocb->ki_flags & IOCB_ATOMIC) {
		if (ocount < xfs_get_atomic_write_min(ip))
			return -EINVAL;

		if (ocount > xfs_get_atomic_write_max(ip))
			return -EINVAL;

		ret = generic_atomic_write_valid(iocb, from);
		if (ret)
			return ret;
	}

	if (IS_DAX(inode))
		return xfs_file_dax_write(iocb, from);

	if (iocb->ki_flags & IOCB_DIRECT) {
		/*
		 * Allow a directio write to fall back to a buffered
		 * write *only* in the case that we're doing a reflink
		 * CoW. In all other directio scenarios we do not
		 * allow an operation to fall back to buffered mode.
		 */
		ret = xfs_file_dio_write(iocb, from);
		if (ret != -ENOTBLK)
			return ret;
	}

	if (xfs_is_zoned_inode(ip))
		return xfs_file_buffered_write_zoned(iocb, from);
	return xfs_file_buffered_write(iocb, from);
}
1167
1168 /* Does this file, inode, or mount want synchronous writes? */
xfs_file_sync_writes(struct file * filp)1169 static inline bool xfs_file_sync_writes(struct file *filp)
1170 {
1171 struct xfs_inode *ip = XFS_I(file_inode(filp));
1172
1173 if (xfs_has_wsync(ip->i_mount))
1174 return true;
1175 if (filp->f_flags & (__O_SYNC | O_DSYNC))
1176 return true;
1177 if (IS_SYNC(file_inode(filp)))
1178 return true;
1179
1180 return false;
1181 }
1182
1183 static int
xfs_falloc_newsize(struct file * file,int mode,loff_t offset,loff_t len,loff_t * new_size)1184 xfs_falloc_newsize(
1185 struct file *file,
1186 int mode,
1187 loff_t offset,
1188 loff_t len,
1189 loff_t *new_size)
1190 {
1191 struct inode *inode = file_inode(file);
1192
1193 if ((mode & FALLOC_FL_KEEP_SIZE) || offset + len <= i_size_read(inode))
1194 return 0;
1195 *new_size = offset + len;
1196 return inode_newsize_ok(inode, *new_size);
1197 }
1198
1199 static int
xfs_falloc_setsize(struct file * file,loff_t new_size)1200 xfs_falloc_setsize(
1201 struct file *file,
1202 loff_t new_size)
1203 {
1204 struct iattr iattr = {
1205 .ia_valid = ATTR_SIZE,
1206 .ia_size = new_size,
1207 };
1208
1209 if (!new_size)
1210 return 0;
1211 return xfs_vn_setattr_size(file_mnt_idmap(file), file_dentry(file),
1212 &iattr);
1213 }
1214
/*
 * FALLOC_FL_COLLAPSE_RANGE: remove [offset, offset + len) from the file and
 * shift the data beyond it downwards, shrinking the file by len bytes.
 */
static int
xfs_falloc_collapse_range(
	struct file		*file,
	loff_t			offset,
	loff_t			len,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = file_inode(file);
	/* Must be computed before the extent shift changes anything. */
	loff_t			new_size = i_size_read(inode) - len;
	int			error;

	/* Both range bounds must be allocation-unit aligned. */
	if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
		return -EINVAL;

	/*
	 * A collapse range must not reach or cross EOF, in which case it
	 * would effectively be a truncate operation.
	 */
	if (offset + len >= i_size_read(inode))
		return -EINVAL;

	error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac);
	if (error)
		return error;
	return xfs_falloc_setsize(file, new_size);
}
1241
/*
 * FALLOC_FL_INSERT_RANGE: grow the file by len bytes and shift existing
 * data upwards to open a hole at [offset, offset + len).
 */
static int
xfs_falloc_insert_range(
	struct file		*file,
	loff_t			offset,
	loff_t			len)
{
	struct inode		*inode = file_inode(file);
	loff_t			isize = i_size_read(inode);
	int			error;

	/* Both range bounds must be allocation-unit aligned. */
	if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
		return -EINVAL;

	/*
	 * New inode size must not exceed ->s_maxbytes, accounting for
	 * possible signed overflow.
	 */
	if (inode->i_sb->s_maxbytes - isize < len)
		return -EFBIG;

	/* Offset should be less than i_size */
	if (offset >= isize)
		return -EINVAL;

	error = xfs_falloc_setsize(file, isize + len);
	if (error)
		return error;

	/*
	 * Perform hole insertion now that the file size has been updated so
	 * that if we crash during the operation we don't leave shifted extents
	 * past EOF and hence losing access to the data that is contained within
	 * them.
	 */
	return xfs_insert_file_space(XFS_I(inode), offset, len);
}
1278
1279 /*
1280 * For various operations we need to zero up to one block at each end of
1281 * the affected range. For zoned file systems this will require a space
1282 * allocation, for which we need a reservation ahead of time.
1283 */
1284 #define XFS_ZONED_ZERO_EDGE_SPACE_RES 2
1285
1286 /*
1287 * Zero range implements a full zeroing mechanism but is only used in limited
1288 * situations. It is more efficient to allocate unwritten extents than to
1289 * perform zeroing here, so use an errortag to randomly force zeroing on DEBUG
1290 * kernels for added test coverage.
1291 *
1292 * On zoned file systems, the error is already injected by
1293 * xfs_file_zoned_fallocate, which then reserves the additional space needed.
1294 * We only check for this extra space reservation here.
1295 */
1296 static inline bool
xfs_falloc_force_zero(struct xfs_inode * ip,struct xfs_zone_alloc_ctx * ac)1297 xfs_falloc_force_zero(
1298 struct xfs_inode *ip,
1299 struct xfs_zone_alloc_ctx *ac)
1300 {
1301 if (xfs_is_zoned_inode(ip)) {
1302 if (ac->reserved_blocks > XFS_ZONED_ZERO_EDGE_SPACE_RES) {
1303 ASSERT(IS_ENABLED(CONFIG_XFS_DEBUG));
1304 return true;
1305 }
1306 return false;
1307 }
1308 return XFS_TEST_ERROR(ip->i_mount, XFS_ERRTAG_FORCE_ZERO_RANGE);
1309 }
1310
1311 /*
1312 * Punch a hole and prealloc the range. We use a hole punch rather than
1313 * unwritten extent conversion for two reasons:
1314 *
1315 * 1.) Hole punch handles partial block zeroing for us.
1316 * 2.) If prealloc returns ENOSPC, the file range is still zero-valued by
1317 * virtue of the hole punch.
1318 */
static int
xfs_falloc_zero_range(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = file_inode(file);
	struct xfs_inode	*ip = XFS_I(inode);
	unsigned int		blksize = i_blocksize(inode);
	loff_t			new_size = 0;
	int			error;

	trace_xfs_zero_file_space(ip);

	/* Work out (and validate) the post-operation file size, if growing. */
	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
	if (error)
		return error;

	if (xfs_falloc_force_zero(ip, ac)) {
		/* Error injection path: physically zero the whole range. */
		error = xfs_zero_range(ip, offset, len, ac, NULL);
	} else {
		error = xfs_free_file_space(ip, offset, len, ac);
		if (error)
			return error;

		/*
		 * Round the range out to block boundaries for the
		 * preallocation; the hole punch above took care of zeroing
		 * the partial blocks at the edges.
		 */
		len = round_up(offset + len, blksize) -
			round_down(offset, blksize);
		offset = round_down(offset, blksize);
		error = xfs_alloc_file_space(ip, offset, len);
	}
	if (error)
		return error;
	return xfs_falloc_setsize(file, new_size);
}
1355
/*
 * FALLOC_FL_UNSHARE_RANGE: break any shared (reflinked) extents in the
 * range and back the whole range with allocated space.
 */
static int
xfs_falloc_unshare_range(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct inode		*inode = file_inode(file);
	loff_t			new_size = 0;
	int			error;

	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
	if (error)
		return error;

	error = xfs_reflink_unshare(XFS_I(inode), offset, len);
	if (error)
		return error;

	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
	if (error)
		return error;
	return xfs_falloc_setsize(file, new_size);
}
1380
/*
 * FALLOC_FL_ALLOCATE_RANGE: preallocate space for the range, growing the
 * file size unless FALLOC_FL_KEEP_SIZE was given.
 */
static int
xfs_falloc_allocate_range(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct inode		*inode = file_inode(file);
	loff_t			new_size = 0;
	int			error;

	/*
	 * If always_cow mode we can't use preallocations and thus should not
	 * create them.
	 */
	if (xfs_is_always_cow_inode(XFS_I(inode)))
		return -EOPNOTSUPP;

	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
	if (error)
		return error;

	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
	if (error)
		return error;
	return xfs_falloc_setsize(file, new_size);
}
1408
1409 #define XFS_FALLOC_FL_SUPPORTED \
1410 (FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE | \
1411 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | \
1412 FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE | \
1413 FALLOC_FL_UNSHARE_RANGE)
1414
/*
 * Common fallocate implementation.  Takes the iolock and mmaplock
 * exclusively for the whole operation, breaks leases/layouts, and forces
 * the log afterwards for sync-write files and mounts.  On zoned file
 * systems the caller supplies a pre-reserved zone allocation context.
 */
STATIC long
__xfs_file_fallocate(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = file_inode(file);
	struct xfs_inode	*ip = XFS_I(inode);
	long			error;
	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;

	xfs_ilock(ip, iolock);
	error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
	if (error)
		goto out_unlock;

	/*
	 * Must wait for all AIO to complete before we continue as AIO can
	 * change the file size on completion without holding any locks we
	 * currently hold. We must do this first because AIO can update both
	 * the on disk and in memory inode sizes, and the operations that follow
	 * require the in-memory size to be fully up-to-date.
	 */
	inode_dio_wait(inode);

	error = file_modified(file);
	if (error)
		goto out_unlock;

	switch (mode & FALLOC_FL_MODE_MASK) {
	case FALLOC_FL_PUNCH_HOLE:
		error = xfs_free_file_space(ip, offset, len, ac);
		break;
	case FALLOC_FL_COLLAPSE_RANGE:
		error = xfs_falloc_collapse_range(file, offset, len, ac);
		break;
	case FALLOC_FL_INSERT_RANGE:
		error = xfs_falloc_insert_range(file, offset, len);
		break;
	case FALLOC_FL_ZERO_RANGE:
		error = xfs_falloc_zero_range(file, mode, offset, len, ac);
		break;
	case FALLOC_FL_UNSHARE_RANGE:
		error = xfs_falloc_unshare_range(file, mode, offset, len);
		break;
	case FALLOC_FL_ALLOCATE_RANGE:
		error = xfs_falloc_allocate_range(file, mode, offset, len);
		break;
	default:
		error = -EOPNOTSUPP;
		break;
	}

	if (!error && xfs_file_sync_writes(file))
		error = xfs_log_force_inode(ip);

out_unlock:
	xfs_iunlock(ip, iolock);
	return error;
}
1477
/*
 * Fallocate wrapper for zoned file systems: reserve the space needed for
 * any block zeroing performed by the operation before taking the iolock.
 */
static long
xfs_file_zoned_fallocate(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct xfs_zone_alloc_ctx ac = { };
	struct xfs_inode	*ip = XFS_I(file_inode(file));
	struct xfs_mount	*mp = ip->i_mount;
	xfs_filblks_t		count_fsb;
	int			error;

	/*
	 * If full zeroing is forced by the error injection knob, we need a
	 * space reservation that covers the entire range. See the comment in
	 * xfs_zoned_write_space_reserve for the rationale for the calculation.
	 * Otherwise just reserve space for the two boundary blocks.
	 */
	count_fsb = XFS_ZONED_ZERO_EDGE_SPACE_RES;
	if ((mode & FALLOC_FL_MODE_MASK) == FALLOC_FL_ZERO_RANGE &&
	    XFS_TEST_ERROR(mp, XFS_ERRTAG_FORCE_ZERO_RANGE))
		count_fsb += XFS_B_TO_FSB(mp, len) + 1;

	error = xfs_zoned_space_reserve(mp, count_fsb, XFS_ZR_RESERVED, &ac);
	if (error)
		return error;
	error = __xfs_file_fallocate(file, mode, offset, len, &ac);
	xfs_zoned_space_unreserve(mp, &ac);
	return error;
}
1509
/*
 * ->fallocate entry point: validate the inode and mode, then dispatch to
 * the zoned or common implementation.
 */
static long
xfs_file_fallocate(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct inode		*inode = file_inode(file);

	if (!S_ISREG(inode->i_mode))
		return -EINVAL;
	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
		return -EOPNOTSUPP;

	/*
	 * For zoned file systems, zeroing the first and last block of a hole
	 * punch requires allocating a new block to rewrite the remaining data
	 * and new zeroes out of place. Get a reservations for those before
	 * taking the iolock. Dip into the reserved pool because we are
	 * expected to be able to punch a hole even on a completely full
	 * file system.
	 */
	if (xfs_is_zoned_inode(XFS_I(inode)) &&
	    (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
		     FALLOC_FL_COLLAPSE_RANGE)))
		return xfs_file_zoned_fallocate(file, mode, offset, len);
	return __xfs_file_fallocate(file, mode, offset, len, NULL);
}
1538
/*
 * ->fadvise: wrap generic_fadvise, taking the shared iolock for advice
 * that populates the page cache.
 */
STATIC int
xfs_file_fadvise(
	struct file	*file,
	loff_t		start,
	loff_t		end,
	int		advice)
{
	struct xfs_inode *ip = XFS_I(file_inode(file));
	int ret;
	int lockflags = 0;

	/*
	 * Operations creating pages in page cache need protection from hole
	 * punching and similar ops
	 */
	if (advice == POSIX_FADV_WILLNEED) {
		lockflags = XFS_IOLOCK_SHARED;
		xfs_ilock(ip, lockflags);
	}
	ret = generic_fadvise(file, start, end, advice);
	if (lockflags)
		xfs_iunlock(ip, lockflags);
	return ret;
}
1563
/*
 * ->remap_file_range: clone (reflink) or deduplicate a range of data from
 * file_in to file_out.  Returns the number of bytes remapped, or a
 * negative errno.
 */
STATIC loff_t
xfs_file_remap_range(
	struct file		*file_in,
	loff_t			pos_in,
	struct file		*file_out,
	loff_t			pos_out,
	loff_t			len,
	unsigned int		remap_flags)
{
	struct inode		*inode_in = file_inode(file_in);
	struct xfs_inode	*src = XFS_I(inode_in);
	struct inode		*inode_out = file_inode(file_out);
	struct xfs_inode	*dest = XFS_I(inode_out);
	struct xfs_mount	*mp = src->i_mount;
	loff_t			remapped = 0;
	xfs_extlen_t		cowextsize;
	int			ret;

	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
		return -EINVAL;

	if (!xfs_has_reflink(mp))
		return -EOPNOTSUPP;

	if (xfs_is_shutdown(mp))
		return -EIO;

	/* Prepare and then clone file data. */
	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
			&len, remap_flags);
	if (ret || len == 0)
		return ret;

	/* From here on both inodes are locked; unlock via out_unlock. */
	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);

	ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
			&remapped);
	if (ret)
		goto out_unlock;

	/*
	 * Carry the cowextsize hint from src to dest if we're sharing the
	 * entire source file to the entire destination file, the source file
	 * has a cowextsize hint, and the destination file does not.
	 */
	cowextsize = 0;
	if (pos_in == 0 && len == i_size_read(inode_in) &&
	    (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
	    pos_out == 0 && len >= i_size_read(inode_out) &&
	    !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
		cowextsize = src->i_cowextsize;

	ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
			remap_flags);
	if (ret)
		goto out_unlock;

	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
		xfs_log_force_inode(dest);
out_unlock:
	xfs_iunlock2_remapping(src, dest);
	if (ret)
		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
	/*
	 * If the caller did not set CAN_SHORTEN, then it is not prepared to
	 * handle partial results -- either the whole remap succeeds, or we
	 * must say why it did not. In this case, any error should be returned
	 * to the caller.
	 */
	if (ret && remapped < len && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
		return ret;
	return remapped > 0 ? remapped : ret;
}
1637
/*
 * ->open for regular files: advertise the I/O modes this file supports
 * (nowait, O_DIRECT, and atomic writes where the inode allows them).
 */
STATIC int
xfs_file_open(
	struct inode	*inode,
	struct file	*file)
{
	if (xfs_is_shutdown(XFS_M(inode->i_sb)))
		return -EIO;
	file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
	if (xfs_get_atomic_write_min(XFS_I(inode)) > 0)
		file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
	return generic_file_open(inode, file);
}
1650
/* ->open for directories: kick off readahead of the first data block. */
STATIC int
xfs_dir_open(
	struct inode	*inode,
	struct file	*file)
{
	struct xfs_inode *ip = XFS_I(inode);
	unsigned int	mode;
	int		error;

	if (xfs_is_shutdown(ip->i_mount))
		return -EIO;
	error = generic_file_open(inode, file);
	if (error)
		return error;

	/*
	 * If there are any blocks, read-ahead block 0 as we're almost
	 * certain to have the next operation be a read there.
	 */
	mode = xfs_ilock_data_map_shared(ip);
	if (ip->i_df.if_nextents > 0)
		error = xfs_dir3_data_readahead(ip, 0, 0);
	xfs_iunlock(ip, mode);
	return error;
}
1676
1677 /*
1678 * Don't bother propagating errors. We're just doing cleanup, and the caller
1679 * ignores the return value anyway.
1680 */
STATIC int
xfs_file_release(
	struct inode		*inode,
	struct file		*file)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;

	/*
	 * If this is a read-only mount or the file system has been shut down,
	 * don't generate I/O.
	 */
	if (xfs_is_readonly(mp) || xfs_is_shutdown(mp))
		return 0;

	/*
	 * If we previously truncated this file and removed old data in the
	 * process, we want to initiate "early" writeout on the last close.
	 * This is an attempt to combat the notorious NULL files problem which
	 * is particularly noticeable from a truncate down, buffered (re-)write
	 * (delalloc), followed by a crash. What we are effectively doing here
	 * is significantly reducing the time window where we'd otherwise be
	 * exposed to that problem.
	 */
	if (xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED)) {
		xfs_iflags_clear(ip, XFS_EOFBLOCKS_RELEASED);
		if (ip->i_delayed_blks > 0)
			filemap_flush(inode->i_mapping);
	}

	/*
	 * XFS aggressively preallocates post-EOF space to generate contiguous
	 * allocations for writers that append to the end of the file.
	 *
	 * To support workloads that close and reopen the file frequently, these
	 * preallocations usually persist after a close unless it is the first
	 * close for the inode. This is a tradeoff to generate tightly packed
	 * data layouts for unpacking tarballs or similar archives that write
	 * one file after another without going back to it while keeping the
	 * preallocation for files that have recurring open/write/close cycles.
	 *
	 * This heuristic is skipped for inodes with the append-only flag as
	 * that flag is rather pointless for inodes written only once.
	 *
	 * There is no point in freeing blocks here for open but unlinked files
	 * as they will be taken care of by the inactivation path soon.
	 *
	 * When releasing a read-only context, don't flush data or trim post-EOF
	 * blocks. This avoids open/read/close workloads from removing EOF
	 * blocks that other writers depend upon to reduce fragmentation.
	 *
	 * Inodes on the zoned RT device never have preallocations, so skip
	 * taking the locks below.
	 */
	if (!inode->i_nlink ||
	    !(file->f_mode & FMODE_WRITE) ||
	    (ip->i_diflags & XFS_DIFLAG_APPEND) ||
	    xfs_is_zoned_inode(ip))
		return 0;

	/*
	 * If we can't get the iolock just skip truncating the blocks past EOF
	 * because we could deadlock with the mmap_lock otherwise. We'll get
	 * another chance to drop them once the last reference to the inode is
	 * dropped, so we'll never leak blocks permanently.
	 */
	if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
	    xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
		/* test_and_set ensures only one closer frees the blocks. */
		if (xfs_can_free_eofblocks(ip) &&
		    !xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED))
			xfs_free_eofblocks(ip);
		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
	}

	return 0;
}
1757
/* ->iterate_shared: emit directory entries via xfs_readdir. */
STATIC int
xfs_file_readdir(
	struct file	*file,
	struct dir_context *ctx)
{
	struct inode	*inode = file_inode(file);
	xfs_inode_t	*ip = XFS_I(inode);
	size_t		bufsize;

	/*
	 * The Linux API doesn't pass down the total size of the buffer
	 * we read into down to the filesystem. With the filldir concept
	 * it's not needed for correct information, but the XFS dir2 leaf
	 * code wants an estimate of the buffer size to calculate it's
	 * readahead window and size the buffers used for mapping to
	 * physical blocks.
	 *
	 * Try to give it an estimate that's good enough, maybe at some
	 * point we can change the ->readdir prototype to include the
	 * buffer size. For now we use the current glibc buffer size.
	 */
	bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);

	return xfs_readdir(NULL, ip, ctx, bufsize);
}
1783
1784 STATIC loff_t
xfs_file_llseek(struct file * file,loff_t offset,int whence)1785 xfs_file_llseek(
1786 struct file *file,
1787 loff_t offset,
1788 int whence)
1789 {
1790 struct inode *inode = file->f_mapping->host;
1791
1792 if (xfs_is_shutdown(XFS_I(inode)->i_mount))
1793 return -EIO;
1794
1795 switch (whence) {
1796 default:
1797 return generic_file_llseek(file, offset, whence);
1798 case SEEK_HOLE:
1799 offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
1800 break;
1801 case SEEK_DATA:
1802 offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
1803 break;
1804 }
1805
1806 if (offset < 0)
1807 return offset;
1808 return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1809 }
1810
/*
 * Common DAX fault handling, called with the MMAPLOCK held.  Uses the
 * write iomap ops only for real write faults (not CoW page faults), and
 * completes any synchronous fault the iomap layer requested.
 */
static inline vm_fault_t
xfs_dax_fault_locked(
	struct vm_fault		*vmf,
	unsigned int		order,
	bool			write_fault)
{
	vm_fault_t		ret;
	unsigned long		pfn;

	/* Callers only reach this on DAX inodes, which require FS_DAX. */
	if (!IS_ENABLED(CONFIG_FS_DAX)) {
		ASSERT(0);
		return VM_FAULT_SIGBUS;
	}
	ret = dax_iomap_fault(vmf, order, &pfn, NULL,
			(write_fault && !vmf->cow_page) ?
				&xfs_dax_write_iomap_ops :
				&xfs_read_iomap_ops);
	if (ret & VM_FAULT_NEEDDSYNC)
		ret = dax_finish_sync_fault(vmf, order, pfn);
	return ret;
}
1832
/* Read fault on a DAX file, run under the shared MMAPLOCK. */
static vm_fault_t
xfs_dax_read_fault(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
	vm_fault_t		ret;

	trace_xfs_read_fault(ip, order);

	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
	ret = xfs_dax_fault_locked(vmf, order, false);
	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);

	return ret;
}
1849
1850 /*
1851 * Locking for serialisation of IO during page faults. This results in a lock
1852 * ordering of:
1853 *
1854 * mmap_lock (MM)
1855 * sb_start_pagefault(vfs, freeze)
1856 * invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
1857 * page_lock (MM)
1858 * i_lock (XFS - extent map serialisation)
1859 */
/*
 * Handle a write fault: account against the superblock freeze counter,
 * update the file times, then run the fault under the MMAPLOCK per the
 * locking order documented above.
 */
static vm_fault_t
__xfs_write_fault(
	struct vm_fault		*vmf,
	unsigned int		order,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = file_inode(vmf->vma->vm_file);
	struct xfs_inode	*ip = XFS_I(inode);
	unsigned int		lock_mode = XFS_MMAPLOCK_SHARED;
	vm_fault_t		ret;

	trace_xfs_write_fault(ip, order);

	sb_start_pagefault(inode->i_sb);
	file_update_time(vmf->vma->vm_file);

	/*
	 * Normally we only need the shared mmaplock, but if a reflink remap is
	 * in progress we take the exclusive lock to wait for the remap to
	 * finish before taking a write fault.
	 */
	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
	if (xfs_iflags_test(ip, XFS_IREMAPPING)) {
		xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
		lock_mode = XFS_MMAPLOCK_EXCL;
	}

	if (IS_DAX(inode))
		ret = xfs_dax_fault_locked(vmf, order, true);
	else
		ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
				ac);
	xfs_iunlock(ip, lock_mode);

	sb_end_pagefault(inode->i_sb);
	return ret;
}
1898
/*
 * Write fault on a zoned file system: reserve space covering the whole
 * faulting folio before taking the fault.
 */
static vm_fault_t
xfs_write_fault_zoned(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
	unsigned int		len = folio_size(page_folio(vmf->page));
	struct xfs_zone_alloc_ctx ac = { };
	int			error;
	vm_fault_t		ret;

	/*
	 * This could over-allocate as it doesn't check for truncation.
	 *
	 * But as the overallocation is limited to less than a folio and will
	 * be released instantly that's just fine.
	 */
	error = xfs_zoned_space_reserve(ip->i_mount,
			XFS_B_TO_FSB(ip->i_mount, len), 0, &ac);
	if (error < 0)
		return vmf_fs_error(error);
	ret = __xfs_write_fault(vmf, order, &ac);
	xfs_zoned_space_unreserve(ip->i_mount, &ac);
	return ret;
}
1924
1925 static vm_fault_t
xfs_write_fault(struct vm_fault * vmf,unsigned int order)1926 xfs_write_fault(
1927 struct vm_fault *vmf,
1928 unsigned int order)
1929 {
1930 if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file))))
1931 return xfs_write_fault_zoned(vmf, order);
1932 return __xfs_write_fault(vmf, order, NULL);
1933 }
1934
1935 static inline bool
xfs_is_write_fault(struct vm_fault * vmf)1936 xfs_is_write_fault(
1937 struct vm_fault *vmf)
1938 {
1939 return (vmf->flags & FAULT_FLAG_WRITE) &&
1940 (vmf->vma->vm_flags & VM_SHARED);
1941 }
1942
1943 static vm_fault_t
xfs_filemap_fault(struct vm_fault * vmf)1944 xfs_filemap_fault(
1945 struct vm_fault *vmf)
1946 {
1947 struct inode *inode = file_inode(vmf->vma->vm_file);
1948
1949 /* DAX can shortcut the normal fault path on write faults! */
1950 if (IS_DAX(inode)) {
1951 if (xfs_is_write_fault(vmf))
1952 return xfs_write_fault(vmf, 0);
1953 return xfs_dax_read_fault(vmf, 0);
1954 }
1955
1956 trace_xfs_read_fault(XFS_I(inode), 0);
1957 return filemap_fault(vmf);
1958 }
1959
/* Huge (order > 0) faults are only supported on DAX files. */
static vm_fault_t
xfs_filemap_huge_fault(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	if (!IS_DAX(file_inode(vmf->vma->vm_file)))
		return VM_FAULT_FALLBACK;

	/* DAX can shortcut the normal fault path on write faults! */
	if (xfs_is_write_fault(vmf))
		return xfs_write_fault(vmf, order);
	return xfs_dax_read_fault(vmf, order);
}
1973
/* Write fault on an already-mapped page: a standard write fault. */
static vm_fault_t
xfs_filemap_page_mkwrite(
	struct vm_fault		*vmf)
{
	return xfs_write_fault(vmf, 0);
}
1980
1981 /*
1982 * pfn_mkwrite was originally intended to ensure we capture time stamp updates
1983 * on write faults. In reality, it needs to serialise against truncate and
 * prepare memory for writing, so handle it as a standard write fault.
1985 */
static vm_fault_t
xfs_filemap_pfn_mkwrite(
	struct vm_fault		*vmf)
{
	/* Same handling as ->page_mkwrite; see the comment above. */
	return xfs_write_fault(vmf, 0);
}
1992
/* VM operations for mmap'ed XFS files. */
static const struct vm_operations_struct xfs_file_vm_ops = {
	.fault		= xfs_filemap_fault,
	.huge_fault	= xfs_filemap_huge_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= xfs_filemap_page_mkwrite,
	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
};
2000
/*
 * ->mmap_prepare: validate the mapping request and install the XFS VM
 * operations on the to-be-created VMA.
 */
STATIC int
xfs_file_mmap_prepare(
	struct vm_area_desc	*desc)
{
	struct file		*file = desc->file;
	struct inode		*inode = file_inode(file);
	struct xfs_buftarg	*target = xfs_inode_buftarg(XFS_I(inode));

	/*
	 * We don't support synchronous mappings for non-DAX files and
	 * for DAX files if underneath dax_device is not synchronous.
	 */
	if (!daxdev_mapping_supported(desc, file_inode(file),
			target->bt_daxdev))
		return -EOPNOTSUPP;

	file_accessed(file);
	desc->vm_ops = &xfs_file_vm_ops;
	/* DAX mappings are eligible for huge pages. */
	if (IS_DAX(inode))
		vma_desc_set_flags(desc, VMA_HUGEPAGE_BIT);
	return 0;
}
2023
/* File operations vector for regular XFS files. */
const struct file_operations xfs_file_operations = {
	.llseek		= xfs_file_llseek,
	.read_iter	= xfs_file_read_iter,
	.write_iter	= xfs_file_write_iter,
	.splice_read	= xfs_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.iopoll		= iocb_bio_iopoll,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.mmap_prepare	= xfs_file_mmap_prepare,
	.open		= xfs_file_open,
	.release	= xfs_file_release,
	.fsync		= xfs_file_fsync,
	.get_unmapped_area = thp_get_unmapped_area,
	.fallocate	= xfs_file_fallocate,
	.fadvise	= xfs_file_fadvise,
	.remap_file_range = xfs_file_remap_range,
	.fop_flags	= FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
			  FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE |
			  FOP_DONTCACHE,
	.setlease	= generic_setlease,
};
2048
/* File operations vector for XFS directories. */
const struct file_operations xfs_dir_file_operations = {
	.open		= xfs_dir_open,
	.read		= generic_read_dir,
	.iterate_shared	= xfs_file_readdir,
	.llseek		= generic_file_llseek,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.fsync		= xfs_dir_fsync,
	.setlease	= generic_setlease,
};
2061