xref: /linux/fs/xfs/xfs_file.c (revision 56feb532bb927ae1c26726e2e7c0de95f54a3d67)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4  * All Rights Reserved.
5  */
6 #include "xfs_platform.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_mount.h"
13 #include "xfs_inode.h"
14 #include "xfs_trans.h"
15 #include "xfs_inode_item.h"
16 #include "xfs_bmap.h"
17 #include "xfs_bmap_util.h"
18 #include "xfs_dir2.h"
19 #include "xfs_dir2_priv.h"
20 #include "xfs_ioctl.h"
21 #include "xfs_trace.h"
22 #include "xfs_log.h"
23 #include "xfs_icache.h"
24 #include "xfs_pnfs.h"
25 #include "xfs_iomap.h"
26 #include "xfs_reflink.h"
27 #include "xfs_file.h"
28 #include "xfs_aops.h"
29 #include "xfs_zone_alloc.h"
30 #include "xfs_error.h"
31 #include "xfs_errortag.h"
32 
33 #include <linux/dax.h>
34 #include <linux/falloc.h>
35 #include <linux/backing-dev.h>
36 #include <linux/mman.h>
37 #include <linux/fadvise.h>
38 #include <linux/mount.h>
39 #include <linux/filelock.h>
40 
41 static const struct vm_operations_struct xfs_file_vm_ops;
42 
43 /*
44  * Decide if the given file range is aligned to the size of the fundamental
45  * allocation unit for the file.
46  */
47 bool
48 xfs_is_falloc_aligned(
49 	struct xfs_inode	*ip,
50 	loff_t			pos,
51 	long long int		len)
52 {
53 	unsigned int		alloc_unit = xfs_inode_alloc_unitsize(ip);
54 
55 	if (!is_power_of_2(alloc_unit))
56 		return isaligned_64(pos, alloc_unit) &&
57 		       isaligned_64(len, alloc_unit);
58 
59 	return !((pos | len) & (alloc_unit - 1));
60 }
61 
62 /*
63  * Fsync operations on directories are much simpler than on regular files,
64  * as there is no file data to flush, and thus also no need for explicit
65  * cache flush operations, and there are no non-transaction metadata updates
66  * on directories either.
67  */
68 STATIC int
69 xfs_dir_fsync(
70 	struct file		*file,
71 	loff_t			start,
72 	loff_t			end,
73 	int			datasync)
74 {
75 	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
76 
77 	trace_xfs_dir_fsync(ip);
78 	return xfs_log_force_inode(ip);
79 }
80 
81 /*
82  * All metadata updates are logged, which means that we just have to push the
83  * journal to the required sequence number that holds the updates. We track
84  * datasync commits separately from full sync commits, and hence only need to
85  * select the correct sequence number for the log force here.
86  *
87  * We don't have to serialise against concurrent modifications, as we do not
88  * have to wait for modifications that have not yet completed. We define a
89  * transaction commit as completing when the commit sequence number is updated,
90  * hence if the sequence number has not changed, the sync operation has been
91  * run before the commit completed and we don't have to wait for it.
92  *
93  * If we have concurrent fsync/fdatasync() calls, the sequence numbers remain
94  * set on the log item until - at least - the journal flush completes. In
95  * reality, they are only cleared when the inode is fully unpinned (i.e.
96  * persistent in the journal and not dirty in the CIL), and so we rely on
97  * xfs_log_force_seq() either skipping sequences that have been persisted or
98  * waiting on sequences that are still in flight to correctly order concurrent
99  * sync operations.
100  */
101 static int
102 xfs_fsync_flush_log(
103 	struct xfs_inode	*ip,
104 	bool			datasync,
105 	int			*log_flushed)
106 {
107 	struct xfs_inode_log_item *iip = ip->i_itemp;
108 	xfs_csn_t		seq = 0;
109 
110 	spin_lock(&iip->ili_lock);
111 	if (datasync)
112 		seq = iip->ili_datasync_seq;
113 	else
114 		seq = iip->ili_commit_seq;
115 	spin_unlock(&iip->ili_lock);
116 
117 	if (!seq)
118 		return 0;
119 
120 	return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
121 					  log_flushed);
122 }
123 
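/*
 * Flush all dirty file data in the given range and, if needed, force the log
 * and the relevant device write caches so that both data and metadata are on
 * stable storage before returning.
 */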
124 STATIC int
125 xfs_file_fsync(
126 	struct file		*file,
127 	loff_t			start,
128 	loff_t			end,
129 	int			datasync)
130 {
131 	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
132 	struct xfs_mount	*mp = ip->i_mount;
133 	int			error, err2;
134 	int			log_flushed = 0;
135 
136 	trace_xfs_file_fsync(ip);
137 
138 	error = file_write_and_wait_range(file, start, end);
139 	if (error)
140 		return error;
141 
142 	if (xfs_is_shutdown(mp))
143 		return -EIO;
144 
145 	xfs_iflags_clear(ip, XFS_ITRUNCATED);
146 
147 	/*
148 	 * If we have an RT and/or log subvolume we need to make sure to flush
149 	 * the write cache of the device used for file data first.  This is to
150 	 * ensure newly written file data makes it to disk before logging the new
151 	 * inode size in case of an extending write.
152 	 */
153 	if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp)
154 		error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
155 	else if (mp->m_logdev_targp != mp->m_ddev_targp)
156 		error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
157 
158 	/*
159 	 * If the inode has an inode log item attached, it may need the journal
160 	 * flushed to persist any changes the log item might be tracking.
161 	 */
162 	if (ip->i_itemp) {
163 		err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
164 		if (err2 && !error)
165 			error = err2;
166 	}
167 
168 	/*
169 	 * If we only have a single device, and the log force above was
170 	 * a no-op, we might have to flush the data device cache here.
171 	 * This can only happen for fdatasync/O_DSYNC if we were overwriting
172 	 * an already allocated file and thus do not have any metadata to
173 	 * commit.
174 	 */
175 	if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
176 	    mp->m_logdev_targp == mp->m_ddev_targp) {
177 		err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
178 		if (err2 && !error)
179 			error = err2;
180 	}
181 
182 	return error;
183 }
184 
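/*
 * Take the inode iolock in the given mode.  For IOCB_NOWAIT I/O only trylock
 * and return -EAGAIN if the lock is not immediately available.
 */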
185 static int
186 xfs_ilock_iocb(
187 	struct kiocb		*iocb,
188 	unsigned int		lock_mode)
189 {
190 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
191 
192 	if (iocb->ki_flags & IOCB_NOWAIT) {
193 		if (!xfs_ilock_nowait(ip, lock_mode))
194 			return -EAGAIN;
195 	} else {
196 		xfs_ilock(ip, lock_mode);
197 	}
198 
199 	return 0;
200 }
201 
202 static int
203 xfs_ilock_iocb_for_write(
204 	struct kiocb		*iocb,
205 	unsigned int		*lock_mode)
206 {
207 	ssize_t			ret;
208 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
209 
210 	ret = xfs_ilock_iocb(iocb, *lock_mode);
211 	if (ret)
212 		return ret;
213 
214 	/*
215 	 * If a reflink remap is in progress we always need to take the iolock
216 	 * exclusively to wait for it to finish.
217 	 */
218 	if (*lock_mode == XFS_IOLOCK_SHARED &&
219 	    xfs_iflags_test(ip, XFS_IREMAPPING)) {
220 		xfs_iunlock(ip, *lock_mode);
221 		*lock_mode = XFS_IOLOCK_EXCL;
222 		return xfs_ilock_iocb(iocb, *lock_mode);
223 	}
224 
225 	return 0;
226 }
227 
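/*
 * Direct I/O reads: take the iolock shared and let the iomap direct I/O code
 * perform the transfer.
 */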
228 STATIC ssize_t
229 xfs_file_dio_read(
230 	struct kiocb		*iocb,
231 	struct iov_iter		*to)
232 {
233 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
234 	ssize_t			ret;
235 
236 	trace_xfs_file_direct_read(iocb, to);
237 
238 	if (!iov_iter_count(to))
239 		return 0; /* skip atime */
240 
241 	file_accessed(iocb->ki_filp);
242 
243 	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
244 	if (ret)
245 		return ret;
246 	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
247 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
248 
249 	return ret;
250 }
251 
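/*
 * DAX reads: copy the data directly from the backing device with the iolock
 * held shared.
 */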
252 static noinline ssize_t
253 xfs_file_dax_read(
254 	struct kiocb		*iocb,
255 	struct iov_iter		*to)
256 {
257 	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
258 	ssize_t			ret = 0;
259 
260 	trace_xfs_file_dax_read(iocb, to);
261 
262 	if (!iov_iter_count(to))
263 		return 0; /* skip atime */
264 
265 	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
266 	if (ret)
267 		return ret;
268 	ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
269 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
270 
271 	file_accessed(iocb->ki_filp);
272 	return ret;
273 }
274 
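/*
 * Buffered reads go through the generic page cache read path with the iolock
 * held shared.
 */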
275 STATIC ssize_t
276 xfs_file_buffered_read(
277 	struct kiocb		*iocb,
278 	struct iov_iter		*to)
279 {
280 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
281 	ssize_t			ret;
282 
283 	trace_xfs_file_buffered_read(iocb, to);
284 
285 	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
286 	if (ret)
287 		return ret;
288 	ret = generic_file_read_iter(iocb, to);
289 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
290 
291 	return ret;
292 }
293 
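/*
 * Read from a file: dispatch to the DAX, direct or buffered read path based
 * on the inode flags and the iocb.
 */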
294 STATIC ssize_t
295 xfs_file_read_iter(
296 	struct kiocb		*iocb,
297 	struct iov_iter		*to)
298 {
299 	struct inode		*inode = file_inode(iocb->ki_filp);
300 	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
301 	ssize_t			ret = 0;
302 
303 	XFS_STATS_INC(mp, xs_read_calls);
304 
305 	if (xfs_is_shutdown(mp))
306 		return -EIO;
307 
308 	if (IS_DAX(inode))
309 		ret = xfs_file_dax_read(iocb, to);
310 	else if (iocb->ki_flags & IOCB_DIRECT)
311 		ret = xfs_file_dio_read(iocb, to);
312 	else
313 		ret = xfs_file_buffered_read(iocb, to);
314 
315 	if (ret > 0)
316 		XFS_STATS_ADD(mp, xs_read_bytes, ret);
317 	return ret;
318 }
319 
320 STATIC ssize_t
321 xfs_file_splice_read(
322 	struct file		*in,
323 	loff_t			*ppos,
324 	struct pipe_inode_info	*pipe,
325 	size_t			len,
326 	unsigned int		flags)
327 {
328 	struct inode		*inode = file_inode(in);
329 	struct xfs_inode	*ip = XFS_I(inode);
330 	struct xfs_mount	*mp = ip->i_mount;
331 	ssize_t			ret = 0;
332 
333 	XFS_STATS_INC(mp, xs_read_calls);
334 
335 	if (xfs_is_shutdown(mp))
336 		return -EIO;
337 
338 	trace_xfs_file_splice_read(ip, *ppos, len);
339 
340 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
341 	ret = filemap_splice_read(in, ppos, pipe, len, flags);
342 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
343 	if (ret > 0)
344 		XFS_STATS_ADD(mp, xs_read_bytes, ret);
345 	return ret;
346 }
347 
348 /*
349  * Take care of zeroing post-EOF blocks when they might exist.
350  *
351  * Returns 0 on success, a negative error on failure, or 1 if this
352  * function dropped the iolock and reacquired it exclusively and the caller
353  * needs to restart the write sanity checks.
354  */
355 static ssize_t
356 xfs_file_write_zero_eof(
357 	struct kiocb		*iocb,
358 	struct iov_iter		*from,
359 	unsigned int		*iolock,
360 	size_t			count,
361 	bool			*drained_dio,
362 	struct xfs_zone_alloc_ctx *ac)
363 {
364 	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
365 	loff_t			isize;
366 	int			error;
367 
368 	/*
369 	 * We need to serialise against EOF updates that occur in IO completions
370 	 * here. We want to make sure that nobody is changing the size while
371 	 * we do this check until we have placed an IO barrier (i.e. hold
372 	 * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.  The
373 	 * spinlock effectively forms a memory barrier once we have
374 	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
375 	 * hence be able to correctly determine if we need to run zeroing.
376 	 */
377 	spin_lock(&ip->i_flags_lock);
378 	isize = i_size_read(VFS_I(ip));
379 	if (iocb->ki_pos <= isize) {
380 		spin_unlock(&ip->i_flags_lock);
381 		return 0;
382 	}
383 	spin_unlock(&ip->i_flags_lock);
384 
385 	if (iocb->ki_flags & IOCB_NOWAIT)
386 		return -EAGAIN;
387 
388 	if (!*drained_dio) {
389 		/*
390 		 * If zeroing is needed and we are currently holding the iolock
391 		 * shared, we need to upgrade it to exclusive, which implies
392 		 * redoing all of the checks done so far.
393 		 */
394 		if (*iolock == XFS_IOLOCK_SHARED) {
395 			xfs_iunlock(ip, *iolock);
396 			*iolock = XFS_IOLOCK_EXCL;
397 			xfs_ilock(ip, *iolock);
398 			iov_iter_reexpand(from, count);
399 		}
400 
401 		/*
402 		 * We now have an IO submission barrier in place, but AIO can do
403 		 * EOF updates during IO completion and hence we now need to
404 		 * wait for all of them to drain.  Non-AIO DIO will have drained
405 		 * before we are given the XFS_IOLOCK_EXCL, and so for most
406 		 * cases this wait is a no-op.
407 		 */
408 		inode_dio_wait(VFS_I(ip));
409 		*drained_dio = true;
410 		return 1;
411 	}
412 
413 	trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
414 
415 	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
416 	error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL);
417 	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
418 
419 	return error;
420 }
421 
422 /*
423  * Common pre-write limit and setup checks.
424  *
425  * Called with the iolock held either shared or exclusive according to
426  * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
427  * if called for a direct write beyond i_size.
428  */
429 STATIC ssize_t
430 xfs_file_write_checks(
431 	struct kiocb		*iocb,
432 	struct iov_iter		*from,
433 	unsigned int		*iolock,
434 	struct xfs_zone_alloc_ctx *ac)
435 {
436 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
437 	size_t			count = iov_iter_count(from);
438 	bool			drained_dio = false;
439 	ssize_t			error;
440 
441 restart:
442 	error = generic_write_checks(iocb, from);
443 	if (error <= 0)
444 		return error;
445 
446 	if (iocb->ki_flags & IOCB_NOWAIT) {
447 		error = break_layout(inode, false);
448 		if (error == -EWOULDBLOCK)
449 			error = -EAGAIN;
450 	} else {
451 		error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
452 	}
453 
454 	if (error)
455 		return error;
456 
457 	/*
458 	 * For changing security info in file_remove_privs() we need i_rwsem
459 	 * exclusively.
460 	 */
461 	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
462 		xfs_iunlock(XFS_I(inode), *iolock);
463 		*iolock = XFS_IOLOCK_EXCL;
464 		error = xfs_ilock_iocb(iocb, *iolock);
465 		if (error) {
466 			*iolock = 0;
467 			return error;
468 		}
469 		goto restart;
470 	}
471 
472 	/*
473 	 * If the offset is beyond the size of the file, we need to zero all
474 	 * blocks that fall between the existing EOF and the start of this
475 	 * write.
476 	 *
477 	 * We can do an unlocked check for i_size here safely as I/O completion
478 	 * can only extend EOF.  Truncate is locked out at this point, so the
479 	 * EOF can not move backwards, only forwards. Hence we only need to take
480 	 * the slow path when we are at or beyond the current EOF.
481 	 */
482 	if (iocb->ki_pos > i_size_read(inode)) {
483 		error = xfs_file_write_zero_eof(iocb, from, iolock, count,
484 				&drained_dio, ac);
485 		if (error == 1)
486 			goto restart;
487 		if (error)
488 			return error;
489 	}
490 
491 	return kiocb_modified(iocb);
492 }
493 
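/*
 * Reserve blocks for a write to a zoned file.  This is done before taking the
 * iolock, so the reservation is intentionally generous.
 */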
494 static ssize_t
495 xfs_zoned_write_space_reserve(
496 	struct xfs_mount		*mp,
497 	struct kiocb			*iocb,
498 	struct iov_iter			*from,
499 	unsigned int			flags,
500 	struct xfs_zone_alloc_ctx	*ac)
501 {
502 	loff_t				count = iov_iter_count(from);
503 	int				error;
504 
505 	if (iocb->ki_flags & IOCB_NOWAIT)
506 		flags |= XFS_ZR_NOWAIT;
507 
508 	/*
509 	 * Check the rlimit and LFS boundary first so that we don't over-reserve
510 	 * by possibly a lot.
511 	 *
512 	 * The generic write path will redo this check later, and it might have
513 	 * changed by then.  If it got expanded we'll stick to our earlier
514 	 * smaller limit, and if it is decreased the new smaller limit will be
515 	 * used and our extra space reservation will be returned after finishing
516 	 * the write.
517 	 */
518 	error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count);
519 	if (error)
520 		return error;
521 
522 	/*
523 	 * Sloppily round up count to file system blocks.
524 	 *
525 	 * This will often reserve an extra block, but that avoids having to look
526 	 * at the start offset, which isn't stable for O_APPEND until taking the
527 	 * iolock.  Also we need to reserve a block each for zeroing the old
528 	 * EOF block and the new start block if they are unaligned.
529 	 *
530 	 * Any remaining block will be returned after the write.
531 	 */
532 	return xfs_zoned_space_reserve(mp, XFS_B_TO_FSB(mp, count) + 1 + 2,
533 			flags, ac);
534 }
535 
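/*
 * Direct I/O write completion: finish COW remapping or unwritten extent
 * conversion and update the in-core and on-disk inode size for extending
 * writes.
 */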
536 static int
537 xfs_dio_write_end_io(
538 	struct kiocb		*iocb,
539 	ssize_t			size,
540 	int			error,
541 	unsigned		flags)
542 {
543 	struct inode		*inode = file_inode(iocb->ki_filp);
544 	struct xfs_inode	*ip = XFS_I(inode);
545 	loff_t			offset = iocb->ki_pos;
546 	unsigned int		nofs_flag;
547 
548 	ASSERT(!xfs_is_zoned_inode(ip) ||
549 	       !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW)));
550 
551 	trace_xfs_end_io_direct_write(ip, offset, size);
552 
553 	if (xfs_is_shutdown(ip->i_mount))
554 		return -EIO;
555 
556 	if (error)
557 		return error;
558 	if (!size)
559 		return 0;
560 
561 	/*
562 	 * Capture amount written on completion as we can't reliably account
563 	 * for it on submission.
564 	 */
565 	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
566 
567 	/*
568 	 * We can allocate memory here while doing writeback on behalf of
569 	 * memory reclaim.  To avoid memory allocation deadlocks set the
570 	 * task-wide nofs context for the following operations.
571 	 */
572 	nofs_flag = memalloc_nofs_save();
573 
574 	if (flags & IOMAP_DIO_COW) {
575 		if (iocb->ki_flags & IOCB_ATOMIC)
576 			error = xfs_reflink_end_atomic_cow(ip, offset, size);
577 		else
578 			error = xfs_reflink_end_cow(ip, offset, size);
579 		if (error)
580 			goto out;
581 	}
582 
583 	/*
584 	 * Unwritten conversion updates the in-core isize after extent
585 	 * conversion but before updating the on-disk size. Updating isize any
586 	 * earlier allows a racing dio read to find unwritten extents before
587 	 * they are converted.
588 	 */
589 	if (flags & IOMAP_DIO_UNWRITTEN) {
590 		error = xfs_iomap_write_unwritten(ip, offset, size, true);
591 		goto out;
592 	}
593 
594 	/*
595 	 * We need to update the in-core inode size here so that we don't end up
596 	 * with the on-disk inode size being outside the in-core inode size. We
597 	 * have no other method of updating EOF for AIO, so always do it here
598 	 * if necessary.
599 	 *
600 	 * We need to lock the test/set EOF update as we can be racing with
601 	 * other IO completions here to update the EOF. Failing to serialise
602 	 * here can result in EOF moving backwards and Bad Things Happen when
603 	 * that occurs.
604 	 *
605 	 * As IO completion only ever extends EOF, we can do an unlocked check
606 	 * here to avoid taking the spinlock. If we land within the current EOF,
607 	 * then we do not need to do an extending update at all, and we don't
608 	 * need to take the lock to check this. If we race with an update moving
609 	 * EOF, then we'll either still be beyond EOF and need to take the lock,
610 	 * or we'll be within EOF and we don't need to take it at all.
611 	 */
612 	if (offset + size <= i_size_read(inode))
613 		goto out;
614 
615 	spin_lock(&ip->i_flags_lock);
616 	if (offset + size > i_size_read(inode)) {
617 		i_size_write(inode, offset + size);
618 		spin_unlock(&ip->i_flags_lock);
619 		error = xfs_setfilesize(ip, offset, size);
620 	} else {
621 		spin_unlock(&ip->i_flags_lock);
622 	}
623 
624 out:
625 	memalloc_nofs_restore(nofs_flag);
626 	return error;
627 }
628 
629 static const struct iomap_dio_ops xfs_dio_write_ops = {
630 	.end_io		= xfs_dio_write_end_io,
631 };
632 
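/*
 * Submit a direct write bio to a zoned device: charge the blocks against the
 * caller's space reservation and hand the bio to the zone allocator for
 * allocation and submission.
 */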
633 static void
634 xfs_dio_zoned_submit_io(
635 	const struct iomap_iter	*iter,
636 	struct bio		*bio,
637 	loff_t			file_offset)
638 {
639 	struct xfs_mount	*mp = XFS_I(iter->inode)->i_mount;
640 	struct xfs_zone_alloc_ctx *ac = iter->private;
641 	xfs_filblks_t		count_fsb;
642 	struct iomap_ioend	*ioend;
643 
644 	count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size);
645 	if (count_fsb > ac->reserved_blocks) {
646 		xfs_err(mp,
647 "allocation (%lld) larger than reservation (%lld).",
648 			count_fsb, ac->reserved_blocks);
649 		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
650 		bio_io_error(bio);
651 		return;
652 	}
653 	ac->reserved_blocks -= count_fsb;
654 
655 	bio->bi_end_io = xfs_end_bio;
656 	ioend = iomap_init_ioend(iter->inode, bio, file_offset,
657 			IOMAP_IOEND_DIRECT);
658 	xfs_zone_alloc_and_submit(ioend, &ac->open_zone);
659 }
660 
661 static const struct iomap_dio_ops xfs_dio_zoned_write_ops = {
662 	.bio_set	= &iomap_ioend_bioset,
663 	.submit_io	= xfs_dio_zoned_submit_io,
664 	.end_io		= xfs_dio_write_end_io,
665 };
666 
667 /*
668  * Handle block aligned direct I/O writes.
669  */
670 static noinline ssize_t
671 xfs_file_dio_write_aligned(
672 	struct xfs_inode	*ip,
673 	struct kiocb		*iocb,
674 	struct iov_iter		*from,
675 	const struct iomap_ops	*ops,
676 	const struct iomap_dio_ops *dops,
677 	struct xfs_zone_alloc_ctx *ac)
678 {
679 	unsigned int		iolock = XFS_IOLOCK_SHARED;
680 	unsigned int		dio_flags = 0;
681 	ssize_t			ret;
682 
683 	/*
684 	 * For always COW inodes, each bio must be aligned to the file system
685 	 * block size and not just the device sector size because we need to
686 	 * allocate a block-aligned amount of space for each write.
687 	 */
688 	if (xfs_is_always_cow_inode(ip))
689 		dio_flags |= IOMAP_DIO_FSBLOCK_ALIGNED;
690 
691 	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
692 	if (ret)
693 		return ret;
694 	ret = xfs_file_write_checks(iocb, from, &iolock, ac);
695 	if (ret)
696 		goto out_unlock;
697 
698 	/*
699 	 * We don't need to hold the IOLOCK exclusively across the IO, so demote
700 	 * the iolock back to shared if we had to take the exclusive lock in
701 	 * xfs_file_write_checks() for other reasons.
702 	 */
703 	if (iolock == XFS_IOLOCK_EXCL) {
704 		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
705 		iolock = XFS_IOLOCK_SHARED;
706 	}
707 	trace_xfs_file_direct_write(iocb, from);
708 	ret = iomap_dio_rw(iocb, from, ops, dops, dio_flags, ac, 0);
709 out_unlock:
710 	xfs_iunlock(ip, iolock);
711 	return ret;
712 }
713 
714 /*
715  * Handle block aligned direct I/O writes to zoned devices.
716  */
717 static noinline ssize_t
718 xfs_file_dio_write_zoned(
719 	struct xfs_inode	*ip,
720 	struct kiocb		*iocb,
721 	struct iov_iter		*from)
722 {
723 	struct xfs_zone_alloc_ctx ac = { };
724 	ssize_t			ret;
725 
726 	ret = xfs_zoned_write_space_reserve(ip->i_mount, iocb, from, 0, &ac);
727 	if (ret < 0)
728 		return ret;
729 	ret = xfs_file_dio_write_aligned(ip, iocb, from,
730 			&xfs_zoned_direct_write_iomap_ops,
731 			&xfs_dio_zoned_write_ops, &ac);
732 	xfs_zoned_space_unreserve(ip->i_mount, &ac);
733 	return ret;
734 }
735 
736 /*
737  * Handle block atomic writes
738  *
739  * Two methods of atomic writes are supported:
740  * - REQ_ATOMIC-based, which would typically use some form of HW offload in the
741  *   disk
742  * - COW-based, which uses a COW fork as a staging extent for data updates
743  *   before atomically updating extent mappings for the range being written
744  *
745  */
746 static noinline ssize_t
747 xfs_file_dio_write_atomic(
748 	struct xfs_inode	*ip,
749 	struct kiocb		*iocb,
750 	struct iov_iter		*from)
751 {
752 	unsigned int		iolock = XFS_IOLOCK_SHARED;
753 	ssize_t			ret, ocount = iov_iter_count(from);
754 	const struct iomap_ops	*dops;
755 
756 	/*
757 	 * HW offload should be faster, so try that first if it is already
758 	 * known that the write length is not too large.
759 	 */
760 	if (ocount > xfs_inode_buftarg(ip)->bt_awu_max)
761 		dops = &xfs_atomic_write_cow_iomap_ops;
762 	else
763 		dops = &xfs_direct_write_iomap_ops;
764 
765 retry:
766 	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
767 	if (ret)
768 		return ret;
769 
770 	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
771 	if (ret)
772 		goto out_unlock;
773 
774 	/* Demote similar to xfs_file_dio_write_aligned() */
775 	if (iolock == XFS_IOLOCK_EXCL) {
776 		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
777 		iolock = XFS_IOLOCK_SHARED;
778 	}
779 
780 	trace_xfs_file_direct_write(iocb, from);
781 	ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops,
782 			0, NULL, 0);
783 
784 	/*
785 	 * The retry mechanism is based on the ->iomap_begin method returning
786 	 * -ENOPROTOOPT, which would be when the REQ_ATOMIC-based write is not
787 	 * possible. The REQ_ATOMIC-based method will typically not be possible if
788 	 * the write spans multiple extents or the disk blocks are misaligned.
789 	 */
790 	if (ret == -ENOPROTOOPT && dops == &xfs_direct_write_iomap_ops) {
791 		xfs_iunlock(ip, iolock);
792 		dops = &xfs_atomic_write_cow_iomap_ops;
793 		goto retry;
794 	}
795 
796 out_unlock:
797 	if (iolock)
798 		xfs_iunlock(ip, iolock);
799 	return ret;
800 }
801 
802 /*
803  * Handle block unaligned direct I/O writes
804  *
805  * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
806  * them to be done in parallel with reads and other direct I/O writes.  However,
807  * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
808  * to do sub-block zeroing and that requires serialisation against other direct
809  * I/O to the same block.  In this case we need to serialise the submission of
810  * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
811  * In the case where sub-block zeroing is not required, we can do concurrent
812  * sub-block dios to the same block successfully.
813  *
814  * Optimistically submit the I/O using the shared lock first, but use the
815  * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
816  * if block allocation or partial block zeroing would be required.  In that case
817  * we try again with the exclusive lock.
818  */
819 static noinline ssize_t
820 xfs_file_dio_write_unaligned(
821 	struct xfs_inode	*ip,
822 	struct kiocb		*iocb,
823 	struct iov_iter		*from)
824 {
825 	size_t			isize = i_size_read(VFS_I(ip));
826 	size_t			count = iov_iter_count(from);
827 	unsigned int		iolock = XFS_IOLOCK_SHARED;
828 	unsigned int		flags = IOMAP_DIO_OVERWRITE_ONLY;
829 	ssize_t			ret;
830 
831 	/*
832 	 * Extending writes need exclusivity because of the sub-block zeroing
833 	 * that the DIO code always does for partial tail blocks beyond EOF, so
834 	 * don't even bother trying the fast path in this case.
835 	 */
836 	if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
837 		if (iocb->ki_flags & IOCB_NOWAIT)
838 			return -EAGAIN;
839 retry_exclusive:
840 		iolock = XFS_IOLOCK_EXCL;
841 		flags = IOMAP_DIO_FORCE_WAIT;
842 	}
843 
844 	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
845 	if (ret)
846 		return ret;
847 
848 	/*
849 	 * We can't properly handle unaligned direct I/O to reflink files yet,
850 	 * as we can't unshare a partial block.
851 	 */
852 	if (xfs_is_cow_inode(ip)) {
853 		trace_xfs_reflink_bounce_dio_write(iocb, from);
854 		ret = -ENOTBLK;
855 		goto out_unlock;
856 	}
857 
858 	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
859 	if (ret)
860 		goto out_unlock;
861 
862 	/*
863 	 * If we are doing exclusive unaligned I/O, this must be the only I/O
864 	 * in-flight.  Otherwise we risk data corruption due to unwritten extent
865 	 * conversions from the AIO end_io handler.  Wait for all other I/O to
866 	 * drain first.
867 	 */
868 	if (flags & IOMAP_DIO_FORCE_WAIT)
869 		inode_dio_wait(VFS_I(ip));
870 
871 	trace_xfs_file_direct_write(iocb, from);
872 	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
873 			   &xfs_dio_write_ops, flags, NULL, 0);
874 
875 	/*
876 	 * Retry unaligned I/O with exclusive blocking semantics if the DIO
877 	 * layer rejected it for mapping or locking reasons. If we are doing
878 	 * nonblocking user I/O, propagate the error.
879 	 */
880 	if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
881 		ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
882 		xfs_iunlock(ip, iolock);
883 		goto retry_exclusive;
884 	}
885 
886 out_unlock:
887 	if (iolock)
888 		xfs_iunlock(ip, iolock);
889 	return ret;
890 }
891 
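/*
 * Direct I/O writes: check the device sector alignment and then dispatch to
 * the unaligned, zoned, atomic or plain block-aligned write path.
 */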
892 static ssize_t
893 xfs_file_dio_write(
894 	struct kiocb		*iocb,
895 	struct iov_iter		*from)
896 {
897 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
898 	struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
899 	size_t			count = iov_iter_count(from);
900 
901 	/* direct I/O must be aligned to device logical sector size */
902 	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
903 		return -EINVAL;
904 
905 	if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
906 		return xfs_file_dio_write_unaligned(ip, iocb, from);
907 	if (xfs_is_zoned_inode(ip))
908 		return xfs_file_dio_write_zoned(ip, iocb, from);
909 	if (iocb->ki_flags & IOCB_ATOMIC)
910 		return xfs_file_dio_write_atomic(ip, iocb, from);
911 	return xfs_file_dio_write_aligned(ip, iocb, from,
912 			&xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL);
913 }
914 
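/*
 * DAX writes: copy the data directly to the backing device with the iolock
 * held exclusively and update the file size for extending writes.
 */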
915 static noinline ssize_t
916 xfs_file_dax_write(
917 	struct kiocb		*iocb,
918 	struct iov_iter		*from)
919 {
920 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
921 	struct xfs_inode	*ip = XFS_I(inode);
922 	unsigned int		iolock = XFS_IOLOCK_EXCL;
923 	ssize_t			ret, error = 0;
924 	loff_t			pos;
925 
926 	ret = xfs_ilock_iocb(iocb, iolock);
927 	if (ret)
928 		return ret;
929 	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
930 	if (ret)
931 		goto out;
932 
933 	pos = iocb->ki_pos;
934 
935 	trace_xfs_file_dax_write(iocb, from);
936 	ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
937 	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
938 		i_size_write(inode, iocb->ki_pos);
939 		error = xfs_setfilesize(ip, pos, ret);
940 	}
941 out:
942 	if (iolock)
943 		xfs_iunlock(ip, iolock);
944 	if (error)
945 		return error;
946 
947 	if (ret > 0) {
948 		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
949 
950 		/* Handle various SYNC-type writes */
951 		ret = generic_write_sync(iocb, ret);
952 	}
953 	return ret;
954 }
955 
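/*
 * Buffered writes go through the page cache with the iolock held exclusively.
 * On EDQUOT or ENOSPC we try once to free up speculative preallocations and
 * other reclaimable space before giving up.
 */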
956 STATIC ssize_t
957 xfs_file_buffered_write(
958 	struct kiocb		*iocb,
959 	struct iov_iter		*from)
960 {
961 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
962 	struct xfs_inode	*ip = XFS_I(inode);
963 	ssize_t			ret;
964 	bool			cleared_space = false;
965 	unsigned int		iolock;
966 
967 write_retry:
968 	iolock = XFS_IOLOCK_EXCL;
969 	ret = xfs_ilock_iocb(iocb, iolock);
970 	if (ret)
971 		return ret;
972 
973 	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
974 	if (ret)
975 		goto out;
976 
977 	trace_xfs_file_buffered_write(iocb, from);
978 	ret = iomap_file_buffered_write(iocb, from,
979 			&xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
980 			NULL);
981 
982 	/*
983 	 * If we hit a space limit, try to free up some lingering preallocated
984 	 * space before returning an error. In the case of ENOSPC, first try to
985 	 * write back all dirty inodes to free up some of the excess reserved
986 	 * metadata space. This reduces the chances that the eofblocks scan
987 	 * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
988 	 * also behaves as a filter to prevent too many eofblocks scans from
989 	 * running at the same time.  Use a synchronous scan to increase the
990 	 * effectiveness of the scan.
991 	 */
992 	if (ret == -EDQUOT && !cleared_space) {
993 		xfs_iunlock(ip, iolock);
994 		xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
995 		cleared_space = true;
996 		goto write_retry;
997 	} else if (ret == -ENOSPC && !cleared_space) {
998 		struct xfs_icwalk	icw = {0};
999 
1000 		cleared_space = true;
1001 		xfs_flush_inodes(ip->i_mount);
1002 
1003 		xfs_iunlock(ip, iolock);
1004 		icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
1005 		xfs_blockgc_free_space(ip->i_mount, &icw);
1006 		goto write_retry;
1007 	}
1008 
1009 out:
1010 	if (iolock)
1011 		xfs_iunlock(ip, iolock);
1012 
1013 	if (ret > 0) {
1014 		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
1015 		/* Handle various SYNC-type writes */
1016 		ret = generic_write_sync(iocb, ret);
1017 	}
1018 	return ret;
1019 }
1020 
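/*
 * Buffered writes to zoned files: reserve space up front and truncate the
 * iter to the length we could actually reserve blocks for before writing
 * through the page cache.
 */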
1021 STATIC ssize_t
1022 xfs_file_buffered_write_zoned(
1023 	struct kiocb		*iocb,
1024 	struct iov_iter		*from)
1025 {
1026 	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
1027 	struct xfs_mount	*mp = ip->i_mount;
1028 	unsigned int		iolock = XFS_IOLOCK_EXCL;
1029 	bool			cleared_space = false;
1030 	struct xfs_zone_alloc_ctx ac = { };
1031 	ssize_t			ret;
1032 
1033 	ret = xfs_zoned_write_space_reserve(mp, iocb, from, XFS_ZR_GREEDY, &ac);
1034 	if (ret < 0)
1035 		return ret;
1036 
1037 	ret = xfs_ilock_iocb(iocb, iolock);
1038 	if (ret)
1039 		goto out_unreserve;
1040 
1041 	ret = xfs_file_write_checks(iocb, from, &iolock, &ac);
1042 	if (ret)
1043 		goto out_unlock;
1044 
1045 	/*
1046 	 * Truncate the iter to the length that we were actually able to
1047 	 * allocate blocks for.  This needs to happen after
1048 	 * xfs_file_write_checks, because that assigns ki_pos for O_APPEND
1049 	 * writes.
1050 	 */
1051 	iov_iter_truncate(from,
1052 			XFS_FSB_TO_B(mp, ac.reserved_blocks) -
1053 			(iocb->ki_pos & mp->m_blockmask));
1054 	if (!iov_iter_count(from))
1055 		goto out_unlock;
1056 
1057 retry:
1058 	trace_xfs_file_buffered_write(iocb, from);
1059 	ret = iomap_file_buffered_write(iocb, from,
1060 			&xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
1061 			&ac);
1062 	if (ret == -ENOSPC && !cleared_space) {
1063 		/*
1064 		 * Kick off writeback to convert delalloc space and release the
1065 		 * usually too pessimistic indirect block reservations.
1066 		 */
1067 		xfs_flush_inodes(mp);
1068 		cleared_space = true;
1069 		goto retry;
1070 	}
1071 
1072 out_unlock:
1073 	xfs_iunlock(ip, iolock);
1074 out_unreserve:
1075 	xfs_zoned_space_unreserve(ip->i_mount, &ac);
1076 	if (ret > 0) {
1077 		XFS_STATS_ADD(mp, xs_write_bytes, ret);
1078 		ret = generic_write_sync(iocb, ret);
1079 	}
1080 	return ret;
1081 }
1082 
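/*
 * Write to a file: validate atomic write constraints and dispatch to the DAX,
 * direct or buffered write path.
 */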
1083 STATIC ssize_t
1084 xfs_file_write_iter(
1085 	struct kiocb		*iocb,
1086 	struct iov_iter		*from)
1087 {
1088 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
1089 	struct xfs_inode	*ip = XFS_I(inode);
1090 	ssize_t			ret;
1091 	size_t			ocount = iov_iter_count(from);
1092 
1093 	XFS_STATS_INC(ip->i_mount, xs_write_calls);
1094 
1095 	if (ocount == 0)
1096 		return 0;
1097 
1098 	if (xfs_is_shutdown(ip->i_mount))
1099 		return -EIO;
1100 
1101 	if (iocb->ki_flags & IOCB_ATOMIC) {
1102 		if (ocount < xfs_get_atomic_write_min(ip))
1103 			return -EINVAL;
1104 
1105 		if (ocount > xfs_get_atomic_write_max(ip))
1106 			return -EINVAL;
1107 
1108 		ret = generic_atomic_write_valid(iocb, from);
1109 		if (ret)
1110 			return ret;
1111 	}
1112 
1113 	if (IS_DAX(inode))
1114 		return xfs_file_dax_write(iocb, from);
1115 
1116 	if (iocb->ki_flags & IOCB_DIRECT) {
1117 		/*
1118 		 * Allow a directio write to fall back to a buffered
1119 		 * write *only* in the case that we're doing a reflink
1120 		 * CoW.  In all other directio scenarios we do not
1121 		 * allow an operation to fall back to buffered mode.
1122 		 */
1123 		ret = xfs_file_dio_write(iocb, from);
1124 		if (ret != -ENOTBLK)
1125 			return ret;
1126 	}
1127 
1128 	if (xfs_is_zoned_inode(ip))
1129 		return xfs_file_buffered_write_zoned(iocb, from);
1130 	return xfs_file_buffered_write(iocb, from);
1131 }
1132 
1133 /* Does this file, inode, or mount want synchronous writes? */
1134 static inline bool xfs_file_sync_writes(struct file *filp)
1135 {
1136 	struct xfs_inode	*ip = XFS_I(file_inode(filp));
1137 
1138 	if (xfs_has_wsync(ip->i_mount))
1139 		return true;
1140 	if (filp->f_flags & (__O_SYNC | O_DSYNC))
1141 		return true;
1142 	if (IS_SYNC(file_inode(filp)))
1143 		return true;
1144 
1145 	return false;
1146 }
1147 
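/*
 * If the fallocate request extends the file and FALLOC_FL_KEEP_SIZE is not
 * set, validate and return the new file size via @new_size.
 */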
1148 static int
1149 xfs_falloc_newsize(
1150 	struct file		*file,
1151 	int			mode,
1152 	loff_t			offset,
1153 	loff_t			len,
1154 	loff_t			*new_size)
1155 {
1156 	struct inode		*inode = file_inode(file);
1157 
1158 	if ((mode & FALLOC_FL_KEEP_SIZE) || offset + len <= i_size_read(inode))
1159 		return 0;
1160 	*new_size = offset + len;
1161 	return inode_newsize_ok(inode, *new_size);
1162 }
1163 
1164 static int
1165 xfs_falloc_setsize(
1166 	struct file		*file,
1167 	loff_t			new_size)
1168 {
1169 	struct iattr iattr = {
1170 		.ia_valid	= ATTR_SIZE,
1171 		.ia_size	= new_size,
1172 	};
1173 
1174 	if (!new_size)
1175 		return 0;
1176 	return xfs_vn_setattr_size(file_mnt_idmap(file), file_dentry(file),
1177 			&iattr);
1178 }
1179 
1180 static int
1181 xfs_falloc_collapse_range(
1182 	struct file		*file,
1183 	loff_t			offset,
1184 	loff_t			len,
1185 	struct xfs_zone_alloc_ctx *ac)
1186 {
1187 	struct inode		*inode = file_inode(file);
1188 	loff_t			new_size = i_size_read(inode) - len;
1189 	int			error;
1190 
1191 	if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
1192 		return -EINVAL;
1193 
1194 	/*
1195 	 * There is no need for a collapse range to overlap EOF; in that case it
1196 	 * would effectively be a truncate operation.
1197 	 */
1198 	if (offset + len >= i_size_read(inode))
1199 		return -EINVAL;
1200 
1201 	error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac);
1202 	if (error)
1203 		return error;
1204 	return xfs_falloc_setsize(file, new_size);
1205 }
1206 
1207 static int
1208 xfs_falloc_insert_range(
1209 	struct file		*file,
1210 	loff_t			offset,
1211 	loff_t			len)
1212 {
1213 	struct inode		*inode = file_inode(file);
1214 	loff_t			isize = i_size_read(inode);
1215 	int			error;
1216 
1217 	if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
1218 		return -EINVAL;
1219 
1220 	/*
1221 	 * New inode size must not exceed ->s_maxbytes, accounting for
1222 	 * possible signed overflow.
1223 	 */
1224 	if (inode->i_sb->s_maxbytes - isize < len)
1225 		return -EFBIG;
1226 
1227 	/* Offset should be less than i_size */
1228 	if (offset >= isize)
1229 		return -EINVAL;
1230 
1231 	error = xfs_falloc_setsize(file, isize + len);
1232 	if (error)
1233 		return error;
1234 
1235 	/*
1236 	 * Perform hole insertion now that the file size has been updated so
1237 	 * that if we crash during the operation we don't leave shifted extents
1238 	 * past EOF and hence lose access to the data that is contained within
1239 	 * them.
1240 	 */
1241 	return xfs_insert_file_space(XFS_I(inode), offset, len);
1242 }
1243 
1244 /*
1245  * For various operations we need to zero up to one block at each end of
1246  * the affected range.  For zoned file systems this will require a space
1247  * allocation, for which we need a reservation ahead of time.
1248  */
1249 #define XFS_ZONED_ZERO_EDGE_SPACE_RES		2
1250 
1251 /*
1252  * Zero range implements a full zeroing mechanism but is only used in limited
1253  * situations. It is more efficient to allocate unwritten extents than to
1254  * perform zeroing here, so use an errortag to randomly force zeroing on DEBUG
1255  * kernels for added test coverage.
1256  *
1257  * On zoned file systems, the error is already injected by
1258  * xfs_file_zoned_fallocate, which then reserves the additional space needed.
1259  * We only check for this extra space reservation here.
1260  */
1261 static inline bool
1262 xfs_falloc_force_zero(
1263 	struct xfs_inode		*ip,
1264 	struct xfs_zone_alloc_ctx	*ac)
1265 {
1266 	if (xfs_is_zoned_inode(ip)) {
1267 		if (ac->reserved_blocks > XFS_ZONED_ZERO_EDGE_SPACE_RES) {
1268 			ASSERT(IS_ENABLED(CONFIG_XFS_DEBUG));
1269 			return true;
1270 		}
1271 		return false;
1272 	}
1273 	return XFS_TEST_ERROR(ip->i_mount, XFS_ERRTAG_FORCE_ZERO_RANGE);
1274 }
1275 
1276 /*
1277  * Punch a hole and prealloc the range.  We use a hole punch rather than
1278  * unwritten extent conversion for two reasons:
1279  *
1280  *   1.) Hole punch handles partial block zeroing for us.
1281  *   2.) If prealloc returns ENOSPC, the file range is still zero-valued by
1282  *	 virtue of the hole punch.
1283  */
1284 static int
1285 xfs_falloc_zero_range(
1286 	struct file		*file,
1287 	int			mode,
1288 	loff_t			offset,
1289 	loff_t			len,
1290 	struct xfs_zone_alloc_ctx *ac)
1291 {
1292 	struct inode		*inode = file_inode(file);
1293 	struct xfs_inode	*ip = XFS_I(inode);
1294 	unsigned int		blksize = i_blocksize(inode);
1295 	loff_t			new_size = 0;
1296 	int			error;
1297 
1298 	trace_xfs_zero_file_space(ip);
1299 
1300 	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
1301 	if (error)
1302 		return error;
1303 
1304 	if (xfs_falloc_force_zero(ip, ac)) {
1305 		error = xfs_zero_range(ip, offset, len, ac, NULL);
1306 	} else {
1307 		error = xfs_free_file_space(ip, offset, len, ac);
1308 		if (error)
1309 			return error;
1310 
1311 		len = round_up(offset + len, blksize) -
1312 			round_down(offset, blksize);
1313 		offset = round_down(offset, blksize);
1314 		error = xfs_alloc_file_space(ip, offset, len);
1315 	}
1316 	if (error)
1317 		return error;
1318 	return xfs_falloc_setsize(file, new_size);
1319 }
1320 
1321 static int
1322 xfs_falloc_unshare_range(
1323 	struct file		*file,
1324 	int			mode,
1325 	loff_t			offset,
1326 	loff_t			len)
1327 {
1328 	struct inode		*inode = file_inode(file);
1329 	loff_t			new_size = 0;
1330 	int			error;
1331 
1332 	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
1333 	if (error)
1334 		return error;
1335 
1336 	error = xfs_reflink_unshare(XFS_I(inode), offset, len);
1337 	if (error)
1338 		return error;
1339 
1340 	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
1341 	if (error)
1342 		return error;
1343 	return xfs_falloc_setsize(file, new_size);
1344 }
1345 
1346 static int
1347 xfs_falloc_allocate_range(
1348 	struct file		*file,
1349 	int			mode,
1350 	loff_t			offset,
1351 	loff_t			len)
1352 {
1353 	struct inode		*inode = file_inode(file);
1354 	loff_t			new_size = 0;
1355 	int			error;
1356 
1357 	/*
1358 	 * In always_cow mode we can't use preallocations and thus should not
1359 	 * create them.
1360 	 */
1361 	if (xfs_is_always_cow_inode(XFS_I(inode)))
1362 		return -EOPNOTSUPP;
1363 
1364 	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
1365 	if (error)
1366 		return error;
1367 
1368 	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
1369 	if (error)
1370 		return error;
1371 	return xfs_falloc_setsize(file, new_size);
1372 }
1373 
1374 #define	XFS_FALLOC_FL_SUPPORTED						\
1375 		(FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE |	\
1376 		 FALLOC_FL_PUNCH_HOLE |	FALLOC_FL_COLLAPSE_RANGE |	\
1377 		 FALLOC_FL_ZERO_RANGE |	FALLOC_FL_INSERT_RANGE |	\
1378 		 FALLOC_FL_UNSHARE_RANGE)
1379 
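/*
 * Common fallocate implementation: take the iolock and mmaplock, break
 * layouts, wait for AIO to drain and then dispatch on the fallocate mode.
 */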
1380 STATIC long
1381 __xfs_file_fallocate(
1382 	struct file		*file,
1383 	int			mode,
1384 	loff_t			offset,
1385 	loff_t			len,
1386 	struct xfs_zone_alloc_ctx *ac)
1387 {
1388 	struct inode		*inode = file_inode(file);
1389 	struct xfs_inode	*ip = XFS_I(inode);
1390 	long			error;
1391 	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
1392 
1393 	xfs_ilock(ip, iolock);
1394 	error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
1395 	if (error)
1396 		goto out_unlock;
1397 
1398 	/*
1399 	 * Must wait for all AIO to complete before we continue as AIO can
1400 	 * change the file size on completion without holding any locks we
1401 	 * currently hold. We must do this first because AIO can update both
1402 	 * the on disk and in memory inode sizes, and the operations that follow
1403 	 * require the in-memory size to be fully up-to-date.
1404 	 */
1405 	inode_dio_wait(inode);
1406 
1407 	error = file_modified(file);
1408 	if (error)
1409 		goto out_unlock;
1410 
1411 	switch (mode & FALLOC_FL_MODE_MASK) {
1412 	case FALLOC_FL_PUNCH_HOLE:
1413 		error = xfs_free_file_space(ip, offset, len, ac);
1414 		break;
1415 	case FALLOC_FL_COLLAPSE_RANGE:
1416 		error = xfs_falloc_collapse_range(file, offset, len, ac);
1417 		break;
1418 	case FALLOC_FL_INSERT_RANGE:
1419 		error = xfs_falloc_insert_range(file, offset, len);
1420 		break;
1421 	case FALLOC_FL_ZERO_RANGE:
1422 		error = xfs_falloc_zero_range(file, mode, offset, len, ac);
1423 		break;
1424 	case FALLOC_FL_UNSHARE_RANGE:
1425 		error = xfs_falloc_unshare_range(file, mode, offset, len);
1426 		break;
1427 	case FALLOC_FL_ALLOCATE_RANGE:
1428 		error = xfs_falloc_allocate_range(file, mode, offset, len);
1429 		break;
1430 	default:
1431 		error = -EOPNOTSUPP;
1432 		break;
1433 	}
1434 
1435 	if (!error && xfs_file_sync_writes(file))
1436 		error = xfs_log_force_inode(ip);
1437 
1438 out_unlock:
1439 	xfs_iunlock(ip, iolock);
1440 	return error;
1441 }
1442 
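/*
 * Fallocate on zoned file systems: reserve space for zeroing the edges of the
 * range out of place before calling into the common fallocate code.
 */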
1443 static long
1444 xfs_file_zoned_fallocate(
1445 	struct file		*file,
1446 	int			mode,
1447 	loff_t			offset,
1448 	loff_t			len)
1449 {
1450 	struct xfs_zone_alloc_ctx ac = { };
1451 	struct xfs_inode	*ip = XFS_I(file_inode(file));
1452 	struct xfs_mount	*mp = ip->i_mount;
1453 	xfs_filblks_t		count_fsb;
1454 	int			error;
1455 
1456 	/*
1457 	 * If full zeroing is forced by the error injection knob, we need a
1458 	 * space reservation that covers the entire range.  See the comment in
1459 	 * xfs_zoned_write_space_reserve for the rationale for the calculation.
1460 	 * Otherwise just reserve space for the two boundary blocks.
1461 	 */
1462 	count_fsb = XFS_ZONED_ZERO_EDGE_SPACE_RES;
1463 	if ((mode & FALLOC_FL_MODE_MASK) == FALLOC_FL_ZERO_RANGE &&
1464 	    XFS_TEST_ERROR(mp, XFS_ERRTAG_FORCE_ZERO_RANGE))
1465 		count_fsb += XFS_B_TO_FSB(mp, len) + 1;
1466 
1467 	error = xfs_zoned_space_reserve(mp, count_fsb, XFS_ZR_RESERVED, &ac);
1468 	if (error)
1469 		return error;
1470 	error = __xfs_file_fallocate(file, mode, offset, len, &ac);
1471 	xfs_zoned_space_unreserve(mp, &ac);
1472 	return error;
1473 }
1474 
1475 static long
1476 xfs_file_fallocate(
1477 	struct file		*file,
1478 	int			mode,
1479 	loff_t			offset,
1480 	loff_t			len)
1481 {
1482 	struct inode		*inode = file_inode(file);
1483 
1484 	if (!S_ISREG(inode->i_mode))
1485 		return -EINVAL;
1486 	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
1487 		return -EOPNOTSUPP;
1488 
1489 	/*
1490 	 * For zoned file systems, zeroing the first and last block of a hole
1491 	 * punch requires allocating a new block to rewrite the remaining data
1492 	 * and new zeroes out of place.  Get a reservation for those before
1493 	 * taking the iolock.  Dip into the reserved pool because we are
1494 	 * expected to be able to punch a hole even on a completely full
1495 	 * file system.
1496 	 */
1497 	if (xfs_is_zoned_inode(XFS_I(inode)) &&
1498 	    (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
1499 		     FALLOC_FL_COLLAPSE_RANGE)))
1500 		return xfs_file_zoned_fallocate(file, mode, offset, len);
1501 	return __xfs_file_fallocate(file, mode, offset, len, NULL);
1502 }
1503 
1504 STATIC int
1505 xfs_file_fadvise(
1506 	struct file	*file,
1507 	loff_t		start,
1508 	loff_t		end,
1509 	int		advice)
1510 {
1511 	struct xfs_inode *ip = XFS_I(file_inode(file));
1512 	int ret;
1513 	int lockflags = 0;
1514 
1515 	/*
1516 	 * Operations creating pages in page cache need protection from hole
1517 	 * punching and similar ops
1518 	 */
1519 	if (advice == POSIX_FADV_WILLNEED) {
1520 		lockflags = XFS_IOLOCK_SHARED;
1521 		xfs_ilock(ip, lockflags);
1522 	}
1523 	ret = generic_fadvise(file, start, end, advice);
1524 	if (lockflags)
1525 		xfs_iunlock(ip, lockflags);
1526 	return ret;
1527 }
1528 
1529 STATIC loff_t
1530 xfs_file_remap_range(
1531 	struct file		*file_in,
1532 	loff_t			pos_in,
1533 	struct file		*file_out,
1534 	loff_t			pos_out,
1535 	loff_t			len,
1536 	unsigned int		remap_flags)
1537 {
1538 	struct inode		*inode_in = file_inode(file_in);
1539 	struct xfs_inode	*src = XFS_I(inode_in);
1540 	struct inode		*inode_out = file_inode(file_out);
1541 	struct xfs_inode	*dest = XFS_I(inode_out);
1542 	struct xfs_mount	*mp = src->i_mount;
1543 	loff_t			remapped = 0;
1544 	xfs_extlen_t		cowextsize;
1545 	int			ret;
1546 
1547 	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
1548 		return -EINVAL;
1549 
1550 	if (!xfs_has_reflink(mp))
1551 		return -EOPNOTSUPP;
1552 
1553 	if (xfs_is_shutdown(mp))
1554 		return -EIO;
1555 
1556 	/* Prepare and then clone file data. */
1557 	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
1558 			&len, remap_flags);
1559 	if (ret || len == 0)
1560 		return ret;
1561 
1562 	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
1563 
1564 	ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
1565 			&remapped);
1566 	if (ret)
1567 		goto out_unlock;
1568 
1569 	/*
1570 	 * Carry the cowextsize hint from src to dest if we're sharing the
1571 	 * entire source file to the entire destination file, the source file
1572 	 * has a cowextsize hint, and the destination file does not.
1573 	 */
1574 	cowextsize = 0;
1575 	if (pos_in == 0 && len == i_size_read(inode_in) &&
1576 	    (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
1577 	    pos_out == 0 && len >= i_size_read(inode_out) &&
1578 	    !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
1579 		cowextsize = src->i_cowextsize;
1580 
1581 	ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
1582 			remap_flags);
1583 	if (ret)
1584 		goto out_unlock;
1585 
1586 	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
1587 		xfs_log_force_inode(dest);
1588 out_unlock:
1589 	xfs_iunlock2_remapping(src, dest);
1590 	if (ret)
1591 		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
1592 	/*
1593 	 * If the caller did not set CAN_SHORTEN, then it is not prepared to
1594 	 * handle partial results -- either the whole remap succeeds, or we
1595 	 * must say why it did not.  In this case, any error should be returned
1596 	 * to the caller.
1597 	 */
1598 	if (ret && remapped < len && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
1599 		return ret;
1600 	return remapped > 0 ? remapped : ret;
1601 }
1602 
1603 STATIC int
1604 xfs_file_open(
1605 	struct inode	*inode,
1606 	struct file	*file)
1607 {
1608 	if (xfs_is_shutdown(XFS_M(inode->i_sb)))
1609 		return -EIO;
1610 	file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
1611 	if (xfs_get_atomic_write_min(XFS_I(inode)) > 0)
1612 		file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
1613 	return generic_file_open(inode, file);
1614 }
1615 
1616 STATIC int
1617 xfs_dir_open(
1618 	struct inode	*inode,
1619 	struct file	*file)
1620 {
1621 	struct xfs_inode *ip = XFS_I(inode);
1622 	unsigned int	mode;
1623 	int		error;
1624 
1625 	if (xfs_is_shutdown(ip->i_mount))
1626 		return -EIO;
1627 	error = generic_file_open(inode, file);
1628 	if (error)
1629 		return error;
1630 
1631 	/*
1632 	 * If there are any blocks, read-ahead block 0 as we're almost
1633 	 * certain to have the next operation be a read there.
1634 	 */
1635 	mode = xfs_ilock_data_map_shared(ip);
1636 	if (ip->i_df.if_nextents > 0)
1637 		error = xfs_dir3_data_readahead(ip, 0, 0);
1638 	xfs_iunlock(ip, mode);
1639 	return error;
1640 }
1641 
1642 /*
1643  * Don't bother propagating errors.  We're just doing cleanup, and the caller
1644  * ignores the return value anyway.
1645  */
1646 STATIC int
1647 xfs_file_release(
1648 	struct inode		*inode,
1649 	struct file		*file)
1650 {
1651 	struct xfs_inode	*ip = XFS_I(inode);
1652 	struct xfs_mount	*mp = ip->i_mount;
1653 
1654 	/*
1655 	 * If this is a read-only mount or the file system has been shut down,
1656 	 * don't generate I/O.
1657 	 */
1658 	if (xfs_is_readonly(mp) || xfs_is_shutdown(mp))
1659 		return 0;
1660 
1661 	/*
1662 	 * If we previously truncated this file and removed old data in the
1663 	 * process, we want to initiate "early" writeout on the last close.
1664 	 * This is an attempt to combat the notorious NULL files problem which
1665 	 * is particularly noticeable from a truncate down, buffered (re-)write
1666 	 * (delalloc), followed by a crash.  What we are effectively doing here
1667 	 * is significantly reducing the time window where we'd otherwise be
1668 	 * exposed to that problem.
1669 	 */
1670 	if (xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED)) {
1671 		xfs_iflags_clear(ip, XFS_EOFBLOCKS_RELEASED);
1672 		if (ip->i_delayed_blks > 0)
1673 			filemap_flush(inode->i_mapping);
1674 	}
1675 
1676 	/*
1677 	 * XFS aggressively preallocates post-EOF space to generate contiguous
1678 	 * allocations for writers that append to the end of the file.
1679 	 *
1680 	 * To support workloads that close and reopen the file frequently, these
1681 	 * preallocations usually persist after a close unless it is the first
1682 	 * close for the inode.  This is a tradeoff to generate tightly packed
1683 	 * data layouts for unpacking tarballs or similar archives that write
1684 	 * one file after another without going back to it while keeping the
1685 	 * preallocation for files that have recurring open/write/close cycles.
1686 	 *
1687 	 * This heuristic is skipped for inodes with the append-only flag as
1688 	 * that flag is rather pointless for inodes written only once.
1689 	 *
1690 	 * There is no point in freeing blocks here for open but unlinked files
1691 	 * as they will be taken care of by the inactivation path soon.
1692 	 *
1693 	 * When releasing a read-only context, don't flush data or trim post-EOF
1694 	 * blocks.  This avoids open/read/close workloads from removing EOF
1695 	 * blocks that other writers depend upon to reduce fragmentation.
1696 	 *
1697 	 * Inodes on the zoned RT device never have preallocations, so skip
1698 	 * taking the locks below.
1699 	 */
1700 	if (!inode->i_nlink ||
1701 	    !(file->f_mode & FMODE_WRITE) ||
1702 	    (ip->i_diflags & XFS_DIFLAG_APPEND) ||
1703 	    xfs_is_zoned_inode(ip))
1704 		return 0;
1705 
1706 	/*
1707 	 * If we can't get the iolock just skip truncating the blocks past EOF
1708 	 * because we could deadlock with the mmap_lock otherwise. We'll get
1709 	 * another chance to drop them once the last reference to the inode is
1710 	 * dropped, so we'll never leak blocks permanently.
1711 	 */
1712 	if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
1713 	    xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
1714 		if (xfs_can_free_eofblocks(ip) &&
1715 		    !xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED))
1716 			xfs_free_eofblocks(ip);
1717 		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1718 	}
1719 
1720 	return 0;
1721 }
1722 
1723 STATIC int
1724 xfs_file_readdir(
1725 	struct file	*file,
1726 	struct dir_context *ctx)
1727 {
1728 	struct inode	*inode = file_inode(file);
1729 	xfs_inode_t	*ip = XFS_I(inode);
1730 	size_t		bufsize;
1731 
1732 	/*
1733 	 * The Linux API doesn't pass the total size of the buffer
1734 	 * we read into down to the filesystem.  With the filldir concept
1735 	 * it's not needed for correct information, but the XFS dir2 leaf
1736 	 * code wants an estimate of the buffer size to calculate its
1737 	 * readahead window and size the buffers used for mapping to
1738 	 * physical blocks.
1739 	 *
1740 	 * Try to give it an estimate that's good enough, maybe at some
1741 	 * point we can change the ->readdir prototype to include the
1742 	 * buffer size.  For now we use the current glibc buffer size.
1743 	 */
1744 	bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);
1745 
1746 	return xfs_readdir(NULL, ip, ctx, bufsize);
1747 }
1748 
1749 STATIC loff_t
1750 xfs_file_llseek(
1751 	struct file	*file,
1752 	loff_t		offset,
1753 	int		whence)
1754 {
1755 	struct inode		*inode = file->f_mapping->host;
1756 
1757 	if (xfs_is_shutdown(XFS_I(inode)->i_mount))
1758 		return -EIO;
1759 
1760 	switch (whence) {
1761 	default:
1762 		return generic_file_llseek(file, offset, whence);
1763 	case SEEK_HOLE:
1764 		offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
1765 		break;
1766 	case SEEK_DATA:
1767 		offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
1768 		break;
1769 	}
1770 
1771 	if (offset < 0)
1772 		return offset;
1773 	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1774 }
1775 
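/*
 * Handle a DAX fault with the mmaplock already held by the caller, selecting
 * the write or read iomap ops based on the type of fault.
 */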
1776 static inline vm_fault_t
1777 xfs_dax_fault_locked(
1778 	struct vm_fault		*vmf,
1779 	unsigned int		order,
1780 	bool			write_fault)
1781 {
1782 	vm_fault_t		ret;
1783 	unsigned long		pfn;
1784 
1785 	if (!IS_ENABLED(CONFIG_FS_DAX)) {
1786 		ASSERT(0);
1787 		return VM_FAULT_SIGBUS;
1788 	}
1789 	ret = dax_iomap_fault(vmf, order, &pfn, NULL,
1790 			(write_fault && !vmf->cow_page) ?
1791 				&xfs_dax_write_iomap_ops :
1792 				&xfs_read_iomap_ops);
1793 	if (ret & VM_FAULT_NEEDDSYNC)
1794 		ret = dax_finish_sync_fault(vmf, order, pfn);
1795 	return ret;
1796 }
1797 
1798 static vm_fault_t
1799 xfs_dax_read_fault(
1800 	struct vm_fault		*vmf,
1801 	unsigned int		order)
1802 {
1803 	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
1804 	vm_fault_t		ret;
1805 
1806 	trace_xfs_read_fault(ip, order);
1807 
1808 	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
1809 	ret = xfs_dax_fault_locked(vmf, order, false);
1810 	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
1811 
1812 	return ret;
1813 }
1814 
1815 /*
1816  * Locking for serialisation of IO during page faults. This results in a lock
1817  * ordering of:
1818  *
1819  * mmap_lock (MM)
1820  *   sb_start_pagefault(vfs, freeze)
1821  *     invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
1822  *       page_lock (MM)
1823  *         i_lock (XFS - extent map serialisation)
1824  */
1825 static vm_fault_t
1826 __xfs_write_fault(
1827 	struct vm_fault		*vmf,
1828 	unsigned int		order,
1829 	struct xfs_zone_alloc_ctx *ac)
1830 {
1831 	struct inode		*inode = file_inode(vmf->vma->vm_file);
1832 	struct xfs_inode	*ip = XFS_I(inode);
1833 	unsigned int		lock_mode = XFS_MMAPLOCK_SHARED;
1834 	vm_fault_t		ret;
1835 
1836 	trace_xfs_write_fault(ip, order);
1837 
1838 	sb_start_pagefault(inode->i_sb);
1839 	file_update_time(vmf->vma->vm_file);
1840 
1841 	/*
1842 	 * Normally we only need the shared mmaplock, but if a reflink remap is
1843 	 * in progress we take the exclusive lock to wait for the remap to
1844 	 * finish before taking a write fault.
1845 	 */
1846 	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
1847 	if (xfs_iflags_test(ip, XFS_IREMAPPING)) {
1848 		xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
1849 		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
1850 		lock_mode = XFS_MMAPLOCK_EXCL;
1851 	}
1852 
1853 	if (IS_DAX(inode))
1854 		ret = xfs_dax_fault_locked(vmf, order, true);
1855 	else
1856 		ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
1857 				ac);
1858 	xfs_iunlock(ip, lock_mode);
1859 
1860 	sb_end_pagefault(inode->i_sb);
1861 	return ret;
1862 }
1863 
1864 static vm_fault_t
1865 xfs_write_fault_zoned(
1866 	struct vm_fault		*vmf,
1867 	unsigned int		order)
1868 {
1869 	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
1870 	unsigned int		len = folio_size(page_folio(vmf->page));
1871 	struct xfs_zone_alloc_ctx ac = { };
1872 	int			error;
1873 	vm_fault_t		ret;
1874 
1875 	/*
1876 	 * This could over-allocate as it doesn't check for truncation.
1877 	 *
1878 	 * But as the overallocation is limited to less than a folio and will be
1879 	 * released instantly, that's just fine.
1880 	 */
1881 	error = xfs_zoned_space_reserve(ip->i_mount,
1882 			XFS_B_TO_FSB(ip->i_mount, len), 0, &ac);
1883 	if (error < 0)
1884 		return vmf_fs_error(error);
1885 	ret = __xfs_write_fault(vmf, order, &ac);
1886 	xfs_zoned_space_unreserve(ip->i_mount, &ac);
1887 	return ret;
1888 }
1889 
1890 static vm_fault_t
1891 xfs_write_fault(
1892 	struct vm_fault		*vmf,
1893 	unsigned int		order)
1894 {
1895 	if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file))))
1896 		return xfs_write_fault_zoned(vmf, order);
1897 	return __xfs_write_fault(vmf, order, NULL);
1898 }
1899 
1900 static inline bool
1901 xfs_is_write_fault(
1902 	struct vm_fault		*vmf)
1903 {
1904 	return (vmf->flags & FAULT_FLAG_WRITE) &&
1905 	       (vmf->vma->vm_flags & VM_SHARED);
1906 }
1907 
1908 static vm_fault_t
1909 xfs_filemap_fault(
1910 	struct vm_fault		*vmf)
1911 {
1912 	struct inode		*inode = file_inode(vmf->vma->vm_file);
1913 
1914 	/* DAX can shortcut the normal fault path on write faults! */
1915 	if (IS_DAX(inode)) {
1916 		if (xfs_is_write_fault(vmf))
1917 			return xfs_write_fault(vmf, 0);
1918 		return xfs_dax_read_fault(vmf, 0);
1919 	}
1920 
1921 	trace_xfs_read_fault(XFS_I(inode), 0);
1922 	return filemap_fault(vmf);
1923 }
1924 
1925 static vm_fault_t
1926 xfs_filemap_huge_fault(
1927 	struct vm_fault		*vmf,
1928 	unsigned int		order)
1929 {
1930 	if (!IS_DAX(file_inode(vmf->vma->vm_file)))
1931 		return VM_FAULT_FALLBACK;
1932 
1933 	/* DAX can shortcut the normal fault path on write faults! */
1934 	if (xfs_is_write_fault(vmf))
1935 		return xfs_write_fault(vmf, order);
1936 	return xfs_dax_read_fault(vmf, order);
1937 }
1938 
1939 static vm_fault_t
1940 xfs_filemap_page_mkwrite(
1941 	struct vm_fault		*vmf)
1942 {
1943 	return xfs_write_fault(vmf, 0);
1944 }
1945 
1946 /*
1947  * pfn_mkwrite was originally intended to ensure we capture time stamp updates
1948  * on write faults. In reality, it needs to serialise against truncate and
1949 	 * prepare memory for writing so handle it as a standard write fault.
1950  */
1951 static vm_fault_t
1952 xfs_filemap_pfn_mkwrite(
1953 	struct vm_fault		*vmf)
1954 {
1955 	return xfs_write_fault(vmf, 0);
1956 }
1957 
1958 static const struct vm_operations_struct xfs_file_vm_ops = {
1959 	.fault		= xfs_filemap_fault,
1960 	.huge_fault	= xfs_filemap_huge_fault,
1961 	.map_pages	= filemap_map_pages,
1962 	.page_mkwrite	= xfs_filemap_page_mkwrite,
1963 	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
1964 };
1965 
1966 STATIC int
1967 xfs_file_mmap_prepare(
1968 	struct vm_area_desc	*desc)
1969 {
1970 	struct file		*file = desc->file;
1971 	struct inode		*inode = file_inode(file);
1972 	struct xfs_buftarg	*target = xfs_inode_buftarg(XFS_I(inode));
1973 
1974 	/*
1975 	 * We don't support synchronous mappings for non-DAX files and
1976 	 * for DAX files if the underlying dax_device is not synchronous.
1977 	 */
1978 	if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file),
1979 				      target->bt_daxdev))
1980 		return -EOPNOTSUPP;
1981 
1982 	file_accessed(file);
1983 	desc->vm_ops = &xfs_file_vm_ops;
1984 	if (IS_DAX(inode))
1985 		desc->vm_flags |= VM_HUGEPAGE;
1986 	return 0;
1987 }
1988 
1989 const struct file_operations xfs_file_operations = {
1990 	.llseek		= xfs_file_llseek,
1991 	.read_iter	= xfs_file_read_iter,
1992 	.write_iter	= xfs_file_write_iter,
1993 	.splice_read	= xfs_file_splice_read,
1994 	.splice_write	= iter_file_splice_write,
1995 	.iopoll		= iocb_bio_iopoll,
1996 	.unlocked_ioctl	= xfs_file_ioctl,
1997 #ifdef CONFIG_COMPAT
1998 	.compat_ioctl	= xfs_file_compat_ioctl,
1999 #endif
2000 	.mmap_prepare	= xfs_file_mmap_prepare,
2001 	.open		= xfs_file_open,
2002 	.release	= xfs_file_release,
2003 	.fsync		= xfs_file_fsync,
2004 	.get_unmapped_area = thp_get_unmapped_area,
2005 	.fallocate	= xfs_file_fallocate,
2006 	.fadvise	= xfs_file_fadvise,
2007 	.remap_file_range = xfs_file_remap_range,
2008 	.fop_flags	= FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
2009 			  FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE |
2010 			  FOP_DONTCACHE,
2011 	.setlease	= generic_setlease,
2012 };
2013 
2014 const struct file_operations xfs_dir_file_operations = {
2015 	.open		= xfs_dir_open,
2016 	.read		= generic_read_dir,
2017 	.iterate_shared	= xfs_file_readdir,
2018 	.llseek		= generic_file_llseek,
2019 	.unlocked_ioctl	= xfs_file_ioctl,
2020 #ifdef CONFIG_COMPAT
2021 	.compat_ioctl	= xfs_file_compat_ioctl,
2022 #endif
2023 	.fsync		= xfs_dir_fsync,
2024 	.setlease	= generic_setlease,
2025 };
2026